home *** CD-ROM | disk | FTP | other *** search
- /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
- *
- * The contents of this file are subject to the Netscape Public License
- * Version 1.0 (the "NPL"); you may not use this file except in
- * compliance with the NPL. You may obtain a copy of the NPL at
- * http://www.mozilla.org/NPL/
- *
- * Software distributed under the NPL is distributed on an "AS IS" basis,
- * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
- * for the specific language governing rights and limitations under the
- * NPL.
- *
- * The Initial Developer of this code under the NPL is Netscape
- * Communications Corporation. Portions created by Netscape are
- * Copyright (C) 1998 Netscape Communications Corporation. All Rights
- * Reserved.
- */
- /*** pagescan.h ***************************************************/
- /* description: page scanning */
- /* - not dependent on the crawler */
-
-
- /********************************************************************
-
- $Revision: 3.1 $
- $Date: 1998/03/28 03:31:58 $
-
- *********************************************************************/
-
- #ifndef pagescan_h___
- #define pagescan_h___
- #include "prtypes.h"
- #include "ntypes.h" /* for MWContext */
- #include "net.h" /* for URLStruct */
-
- /* Threshold for detecting a bogus content length field. Any content length over this number is
- assumed to be bogus (a bogus length is sometimes used to keep a connection open). If we encounter
- a content length this size or greater we won't automatically reject it because it's larger than
- the remaining cache size.
- NOTE(review): this comment says both "over this number" and "this size or greater"; confirm
- against the implementation whether the comparison is > or >=.
- */
- #define BOGUS_CONTENT_LENGTH ((uint32)100000000)
-
- typedef char **CRAWL_ItemList; /* Pointer to char*; the return type of the crawl_getPage*()
-    list accessors declared in this header. NOTE(review): presumably an array of CRAWL_Item
-    whose length comes from the matching crawl_getPage*Count() accessor -- confirm. */
-
- typedef char *CRAWL_Item; /* A single item, represented as a char*. NOTE(review): presumably
-    a NUL-terminated URL string -- confirm. */
-
- /* Handle to a page object. Most APIs require a CRAWL_PageInfo reference, which is created in
-    crawl_makePage. struct _CRAWL_PageInfoStruct is only named here, not defined in this header. */
- typedef struct _CRAWL_PageInfoStruct *CRAWL_PageInfo;
-
- /* Prototype describing callback function for when a page is done. A function of this type is
-    handed to crawl_scanPage.
-    data     - untyped pointer. NOTE(review): presumably the same `data` value that was passed to
-               crawl_scanPage -- confirm.
-    pageInfo - NOTE(review): presumably the page whose scan completed or aborted -- confirm.
- */
- typedef void
- (*CRAWL_ScanPageStatusFunc)(void *data, CRAWL_PageInfo pageInfo);
-
- /* Creates a page object which will be freed by the completion or abort stream routines (the
- URL_Struct is freed by the exit routine). Returns NULL if there is not enough memory.
- Parameters:
-
- siteURL - URL of the site (used to make an absolute URL). Note this may be different from the
- site that the page is on (in case we are crawling), in which case pageURL should be absolute.
- pageURL - URL of the page.
- db - external cache (this comment formerly called the parameter "cache"). If non-null, the URL
- will be cached here.
- */
- CRAWL_PageInfo crawl_makePage(char *siteURL, char *pageURL, ExtCacheDBInfo *db);
-
- /* Deallocates the page's list structures but not the list content (freeing the content is the
- consumer's responsibility).
- This is normally called internally by the completion and abort stream routines.
- NOTE(review): since the stream routines normally call this, confirm whether external callers are
- ever expected to call it themselves.
- */
- void crawl_destroyPage(CRAWL_PageInfo page);
-
-
- /* Requests the URL from netlib and begins scanning. This function returns after the request
- to netlib is issued. The callback function is called when page scanning is complete or aborts.
- It is not guaranteed to be called in an out of memory situation.
- Parameters:
-    page    - page object (created in crawl_makePage).
-    context - NOTE(review): presumably the MWContext under which the netlib request is issued --
-              confirm.
-    func    - callback called when page scanning is complete or aborts.
-    data    - NOTE(review): presumably handed back unchanged as the callback's first argument --
-              confirm.
- */
- void crawl_scanPage(CRAWL_PageInfo page, MWContext *context, CRAWL_ScanPageStatusFunc func, void *data);
-
- /****************************************************************************************/
- /* accessors */
- /****************************************************************************************/
-
- /* Returns true if the page does not have a META tag that specifies that the page should
- not be indexed.
- NOTE(review): presumably only meaningful once the page has been scanned -- confirm.
- */
- PRBool crawl_pageCanBeIndexed(CRAWL_PageInfo pageInfo);
-
- char* crawl_getPageURL(CRAWL_PageInfo pageInfo); /* NOTE(review): presumably the page's URL
-    string; this header does not say who owns the returned pointer -- confirm. */
-
- time_t crawl_getPageLastModified(CRAWL_PageInfo pageInfo); /* NOTE(review): presumably the
-    page's last-modified time. time_t is used here although <time.h> is not included directly
-    by this header; presumably it arrives via one of the project includes -- confirm. */
-
- URL_Struct* crawl_getPageURL_Struct(CRAWL_PageInfo pageInfo); /* NOTE(review): presumably the
-    URL_Struct associated with the page; the crawl_makePage comment says the URL_Struct is
-    freed by the exit routine, so confirm how long the returned pointer stays valid. */
-
- CRAWL_ItemList crawl_getPageLinks(CRAWL_PageInfo pageInfo); /* Each list accessor in this
-    group is declared next to a uint16 count accessor. NOTE(review): presumably the count is
-    the number of entries in the corresponding list -- confirm. The crawl_destroyPage comment
-    says list content is not freed with the page and is left to the consumer. */
- uint16 crawl_getPageLinkCount(CRAWL_PageInfo pageInfo);
-
- CRAWL_ItemList crawl_getPageImages(CRAWL_PageInfo pageInfo);
- uint16 crawl_getPageImageCount(CRAWL_PageInfo pageInfo);
-
- CRAWL_ItemList crawl_getPageResources(CRAWL_PageInfo pageInfo);
- uint16 crawl_getPageResourceCount(CRAWL_PageInfo pageInfo);
-
- CRAWL_ItemList crawl_getPageRequiredResources(CRAWL_PageInfo pageInfo); /* NOTE(review): how
-    "required" resources differ from plain resources is not described in this header --
-    confirm against the implementation. */
- uint16 crawl_getPageRequiredResourceCount(CRAWL_PageInfo pageInfo);
-
- CRAWL_ItemList crawl_getPageFrames(CRAWL_PageInfo pageInfo);
- uint16 crawl_getPageFrameCount(CRAWL_PageInfo pageInfo);
-
- CRAWL_ItemList crawl_getPageLayers(CRAWL_PageInfo pageInfo);
- uint16 crawl_getPageLayerCount(CRAWL_PageInfo pageInfo);
-
- /****************************************************************************************/
- /* stream function */
- /****************************************************************************************/
-
- PUBLIC NET_StreamClass*
- CRAWL_CrawlerConverter(int format_out,
- void *data_object,
- URL_Struct *URL_s,
- MWContext *window_id); /* NOTE(review): presumably the netlib stream converter through which
-    page data reaches the scanner; the parameters are not described in this header -- confirm
-    against the implementation. */
- #endif
-