/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 *
 * The contents of this file are subject to the Netscape Public License
 * Version 1.0 (the "NPL"); you may not use this file except in
 * compliance with the NPL. You may obtain a copy of the NPL at
 * http://www.mozilla.org/NPL/
 *
 * Software distributed under the NPL is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
 * for the specific language governing rights and limitations under the
 * NPL.
 *
 * The Initial Developer of this code under the NPL is Netscape
 * Communications Corporation. Portions created by Netscape are
 * Copyright (C) 1998 Netscape Communications Corporation. All Rights
 * Reserved.
 */
/*** robotxt.h ****************************************************/
/* description: parses the robots.txt file */
/* - does not depend on the crawler */


/********************************************************************
 See the robots.txt specification at:

 http://info.webcrawler.com/mak/projects/robots/norobots.html (original spec)
 http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html

 Note: the original spec says that at least one Disallow field must be present
 in each record; this implementation follows that requirement.

 $Revision: 3.1 $
 $Date: 1998/03/28 03:31:59 $

 *********************************************************************/

#ifndef robotctl_h___
#define robotctl_h___

#include "prtypes.h"
#include "ntypes.h"
#include "net.h"

typedef uint8 CRAWL_RobotControlStatus;
#define CRAWL_ROBOT_DISALLOWED       ((CRAWL_RobotControlStatus)0x00)
#define CRAWL_ROBOT_ALLOWED          ((CRAWL_RobotControlStatus)0x01)
#define CRAWL_ROBOTS_TXT_NOT_QUERIED ((CRAWL_RobotControlStatus)0x02)

typedef struct _CRAWL_RobotControlStruct *CRAWL_RobotControl;

/*
 * Callback invoked after the robots.txt file has been read and parsed.
 */
typedef void
(PR_CALLBACK *CRAWL_RobotControlStatusFunc)(void *data);
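
/* Example: a minimal completion callback matching CRAWL_RobotControlStatusFunc.
   This is an illustrative sketch only; the name robots_txt_done and the use of
   a global flag are assumptions for the example, not part of this API.

     static PRBool robots_txt_read = PR_FALSE;

     static void PR_CALLBACK
     robots_txt_done(void *data)
     {
         robots_txt_read = PR_TRUE;
     }
*/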

/* Netlib stream converter that receives and parses the robots.txt data stream */
PUBLIC NET_StreamClass*
CRAWL_RobotsTxtConverter(int format_out,
                         void *data_object,
                         URL_Struct *URL_s,
                         MWContext *window_id);

/****************************************************************************************/
/* public API */
/****************************************************************************************/

NSPR_BEGIN_EXTERN_C

/* Creates a robot control for the site.
   Parameters:
   context - context for libnet
   site - protocol and host portion of the url. "/robots.txt" is appended to this to
          form the location of the robots.txt file.
*/
PR_EXTERN(CRAWL_RobotControl)
CRAWL_MakeRobotControl(MWContext *context, char *site);
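
/* Example (sketch): creating a control for a site. The context is assumed to
   have been obtained from the caller's libnet setup; the site string is
   illustrative.

     CRAWL_RobotControl control =
         CRAWL_MakeRobotControl(context, "http://www.mozilla.org");
*/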

/* Destroys a robot control and all memory associated with it (except the context and
   the opaque data supplied to CRAWL_ReadRobotControlFile).
*/
PR_EXTERN(void)
CRAWL_DestroyRobotControl(CRAWL_RobotControl control);

/* Parses the robots.txt file at the site specified in the control, and invokes the
   callback when parsing is done. The call is asynchronous: it returns as soon as the
   request has been issued to netlib.
   Parameters:
   control - the robot control for the site
   func - completion callback
   data - data passed to the callback; opaque to the robots.txt parser
   freeData - if true, the data parameter is freed on completion
*/
PR_EXTERN(PRBool)
CRAWL_ReadRobotControlFile(CRAWL_RobotControl control, CRAWL_RobotControlStatusFunc func, void *data, PRBool freeData);
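
/* Example (sketch): issuing the asynchronous read, with robots_txt_done from
   the sketch above and no opaque data. It is assumed here that the PRBool
   return indicates whether the request was successfully issued to netlib.

     PRBool issued = CRAWL_ReadRobotControlFile(control, robots_txt_done,
                                                NULL, PR_FALSE);
*/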

/* Returns a status code (CRAWL_ROBOT_ALLOWED, CRAWL_ROBOT_DISALLOWED, or
   CRAWL_ROBOTS_TXT_NOT_QUERIED) indicating the robot directive for the supplied url */
PR_EXTERN(CRAWL_RobotControlStatus)
CRAWL_GetRobotControl(CRAWL_RobotControl control, char *url);
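
/* Example (sketch): once the completion callback has fired, individual urls
   can be checked against the parsed robots.txt, and the control destroyed when
   it is no longer needed. A CRAWL_ROBOTS_TXT_NOT_QUERIED result presumably
   means the file has not (yet) been read; that interpretation is an assumption.

     CRAWL_RobotControlStatus status =
         CRAWL_GetRobotControl(control, "http://www.mozilla.org/private/index.html");
     PRBool may_fetch = (status == CRAWL_ROBOT_ALLOWED);
     CRAWL_DestroyRobotControl(control);
*/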

NSPR_END_EXTERN_C

#endif /* robotctl_h___ */