home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
OS/2 Shareware BBS: 35 Internet
/
35-Internet.zip
/
grabsite.zip
/
grabsite.cmd
next >
Wrap
OS/2 REXX Batch file
|
1999-08-18
|
39KB
|
1,365 lines
/* This will take an HTML document and find all <a and <img links.
It will not find FORM-based links.
*/
/********** BEGIN USER changeable parameters ***********/
/* default "base url" -- only used if a file:// url is entered.
For example, to start at d:\www\guide.htm, you'd enter
file://d:\www\guide.htm
You'd then need to enter the "default" address (for use with
relative URLs contained in this file)
*/
defbaseurl='http://www/'
/* default root directory -- only used if a file:// url is entered
*/
defrootdir='/'
/* default name (for urls that end with / ).
For example,
given a link of "<a href="/sports/scoreboard/">
and def_tofile='index.htm'
then the contents of this url would be written to:
destination_dir\sports\scoreboard\index.htm */
def_tofile='INDEX.HTM'
/* If "HTML documents only" mode is selected, then
only links ending with these extensions are downloaded, examined, and written.
Notes:
* In all cases, if the content-type header is NOT text/html,
the contents will NOT be parsed for further links.
* If HTMLEXTS='', then this test is not performed
(NOT_HTMLEXTS, below, is used instead) */
htmlexts='SHTML SHT HTM HTML HTML-SSI HTM-SSI'
/* log file. If none desired, set=0. Otherwise, enter a
filename. Note that old log files will be deleted/overwritten */
logfile='GRABSITE.LOG'
/* nocgi=1 to skip CGI urls (that have a /CGI in their path) */
nocgi=1
/* nosearch=1 to skip urls that end with ?xxx
(where xxx is a string of any length) */
nosearch=1
/* If "HTML documents only" mode is selected, then
links ending with these extensions are NOT downloaded.
Notes:
* If "retrieve all links" mode is specified, then both
htmlexts and not_htmlexts are ignored
* If HTMLEXTS<>'', then this test is not performed
(HTMLEXTS takes precedence over NOT_HTMLEXTS) */
not_htmlexts='JPG GIF BMP ZIP GZ TIF TIFF MOV AU EXE COM WAV XBM PDF PS EPS '
/* overwrite=1 means "overwrite preexisting files";
overwrite=2 means "keep and reuse a preexisting copy";
any other value means "skip files that already exist"
*/
overwrite=1
/* optional request header(s) to send to servers
Note: use '0d0a'x to separate multiple request headers */
reqheaders='User-agent: GrabSite'
/* if URL's path starts with remove_prefix, then trim
the beginning of the path (remove everything up to the first /)
For example,
if remove_prefix='!RANGE'
and a link is /!RANGE:bytes=100-200/surplus/prices.lst
then /surplus/prices.lst is used */
remove_prefix='!RANGE'
/* If robot_check=1, then check for a /ROBOTS.TXT file. This contains
instructions on what paths should not be visited by "web robots".
*/
robot_check=1
/* if URL's path starts with skip_prefix, then skip it
This is only needed when the "retrieve" test is /. */
skip_prefix='!'
/* Status reports:
-2 for NO status output, -1 for minimal, 0 for average
1 for some, 2 for too much */
verbose=1
/********** END USER changeable parameters ***********/
/* ---- startup: command-line parsing, help screen, global initialization ---- */
parse arg afile destdir includer includer2 write_all
cmdline=0
if afile<>'' then do
afile=translate(afile,'/','\') /* \ accepted on the command line to avoid shell trouble */
cmdline=1
end /* do */
write_all0=write_all /* remember if Get_all came from the command line */
if afile='?' then do
say "GrabSite -- GET a linked set of pages from the WWW"
say
say "Calling syntax: GrabSite URL DestDir Test1 Test2 Get_all "
say " where:"
say " URL = a fully qualified URL (the home page to start at)"
say " DestDir = destination directory (on local disk) to write results to"
say " Test1 = only parse documents in/under this prefix "
say " Test2 = only retrieve documents in/under this prefix "
say " Get_all = if 0, then do NOT get non-html documents "
say " "
say " Note: to avoid command line problems: use \ instead of /"
say " "
say "Example: "
say " D:>grabsite http:\\fu.br.net\circ\index.htm d:\foob \circ\ \ 1 "
say
say "Or .. enter without arguments for user prompts"
exit
end /* do */
/* initialize some stuff */
baseurl=''
rootdir=''
includer=translate(translate(includer,'/','\')) /* normalize slashes, then uppercase */
includer2=translate(translate(includer2,'/','\'))
remove_prefix=translate(remove_prefix)
skip_prefix=translate(skip_prefix)
htmlexts=translate(htmlexts)
not_htmlexts=translate(not_htmlexts)
/* run counters reported in the final status summary */
ndeleted=0
nwritten=0 ; noconnects=0
ngets=0; n400s=0
nparsed=0
crlf='0d0a'x
fileurls.0=0 /* fileurls. = ordered work list of URLs to visit */
flist.0=0 /* flist.<UPPERCASED-URL> = times each URL was seen (dedupe) */
call loaddll /* load some dlls, set some parameters */
say
say " "cy_ye"GrabSite -- GET a set of linked documents from a WWW site"normal
say
if logfile=0 | logfile=' ' then do
logfile=0
end /* do */
else do
aa=stream(logfile,'c','query exists')
if aa<>'' then do
foo=sysfiledelete(logfile) /* old log is always discarded */
if verbose>0 then say "Old logfile deleted: "logfile
call lineout logfile,'GrabSite log file. Created '||time('n')||' '||date('n')
end /* do */
end
/***** determine file/url to read, and other info.
Sets: afile, afile_isurl, baseurl (http://site), rootdir (/dir/.../) */
say
jump1: nop
if afile='' then do
afile=getstring("Home page to grab, or enter ? for a brief description.",'?',reverse' 1)'normal)
if afile='?' then do
call helpme1
afile=''
signal jump1
end /* do */
end
afile=strip(afile)
afileu=translate(strip(afile))
if abbrev(afileu,'FILE://') then do /* jump-start from a local file */
parse var afile . '://' afile
afileu=translate(afile)
afile_isurl=0
if stream(afile,'c','query exists')="" then do
call printsay "No such file: "afile
exit
end /* do */
call printsay " ... reading "||cutstrg(afile,50) "...."
stuff=charin(afile,1,chars(afile)) /* slurp entire local file into STUFF */
afile_isurl=0
if baseurl='' then baseurl=getstring("Default site (the dotted ip address)",defbaseurl,reverse' 1a)'normal)
if rootdir='' then rootdir=getstring("Default 'root' directory ",defrootdir,reverse' 1b)'normal)
end /* local file as base */
else do /* it's a url */
if abbrev(afileu,'HTTP://')<>1 then do
afile='http://'afile /* assume http:// if no scheme given */
afileu=translate(afile)
end /* do */
afile_isurl=1
parse var afile . '://' bb1 '/' bb2
baseurl=bb1
ii=lastpos('/',bb2)
if ii=0 then
rootdir='/'
else
rootdir=left(bb2,ii) /* directory part of the start page */
end /* url entry */
if pos('://',baseurl)=0 then baseurl='http://'||baseurl
rootdir=strip(rootdir)
if rootdir<>'/' then rootdir='/'||strip(rootdir,,'/')||'/' /* ensure /dir/ form */
baseurl=strip(strip(baseurl,'t','/')) /* no trailing slash on the site part */
/* destination directory: prompt if not given, create it if needed */
atdestdir: nop
if destdir='' then do
destdir=getstring("Enter a destination directory ",directory(),reverse' 2)'normal)
if destdir="?" then do
call helpme1
destdir=''
signal atdestdir
end /* do */
didit=sysmkdir2(destdir,1)
if didit<>0 then do
say "Could not access, or create, "destdir
exit
end /* do */
end
else do
destdir=strip(strip(destdir),'t','\')'\'
didit=sysmkdir2(destdir,1)
if didit<>0 then do
say "Could not access, or create, "destdir
exit
end /* do */
end
destdir=strip(strip(destdir),'t','\')'\' /* normalized: always ends with a single \ */
/* get and set includers variables */
call get_includers
/* Quick/skeleton mode */
getquick:nop
if cmdline<>1 then do
do until write_all<>''
al=getstring(' HTML documents only (Yes, No, or ? for help)','N',reverse' 4)'normal)
al=strip(translate(al))
if al='?' then do
al=''
call help_writeall
iterate
end
if abbrev(al,'N')=1 then /* "No" to html-only means retrieve ALL links */
write_all=1
else
write_all=0
end
if write_all=0 then
call printsay "Ignoring non-html documents"
else
call printsay "Retrieving all links "
say
/* modify other parameters */
if write_all0='' then do
if yesno(" Would you like to modify configuration parameters?")=1 then do
call modify_config
end /* do */
end
end
/************** Done with user input **********/
/******* copy file/url to destdir */
/* if local file, copy directly to destdir;
if url, then maybe copy relative to destdir
*/
if afile_isurl=0 then do /* local file -- jump start*/
ff=translate(afile,' ','\/')
ff2=word(ff,words(ff)) /* last path component = bare filename */
ff2=destdir||ff2
say bold"Saving to "normal|| ff2
foo=translate(stream(ff2,'c','open write'))
if foo<>'READY:' then do
say "Could not open file for writing. Error was: " foo
exit
end /* do */
foo=charout(ff2,stuff,1)
if foo<>0 then do
say "Error. Problem writing file "
exit
end /* do */
foo=stream(ff2,'c','close')
goo=time('e') /* get stuff from file */
foo=urls_in(stuff,baseurl,rootdir,afile) /* seed work list from the local file's links */
goo2=time('e')
if verbose>0 then do
if goo2-goo>5 then call printsay " ... done parsing "||cutstrg(afile,50)
call printsay ' '
call printsay " " cy_ye " # links in "normal||bold||afile"="normal||" "||fileurls.0
end
nparsed=1
end
else do /* a url -- the work list starts with just the home page */
iurls=1
uaref=translate(strip(afile))
flist.uaref=1
flist.0=1
fileurls.iurls=afile
fileurls.iurls.!ref='user' /* .!ref records which page referenced this URL */
fileurls.0=iurls
end /* do */
/**** get a robot.txt file first? */
if robot_check=1 then do
aurl=baseurl'/robots.txt'
rlist=get_url(aurl)
exclist=add_robot(rlist) /* space-delimited list of disallowed path prefixes */
if verbose>0 then do
call printsay "Excluding: "exclist ; call printsay ' '
end
exclist.0=0
if exclist<>'' then do
do ii=1 to words(exclist)
exclist.ii=translate(strip(word(exclist,ii)))
end /* do */
exclist.0=words(exclist)
end /* do */
end /* build exclist. */
/************ Get urls in first file/url */
call printsay ' '
if write_all=0 then
call printsay ' Examining html links starting from:'||bold||afile||normal
else
call printsay ' Examining links starting from:'||bold||afile||normal
call printsay ' '
/********** now get the urls, parse, add to list....
Main crawl loop: fileurls.0 GROWS while this loop runs (urls_in appends
newly-discovered links), so a manual index mm is used instead of a DO mm=1 TO. */
mm=0
do forever
mm=mm+1
if mm>fileurls.0 then leave /* work list exhausted */
goob=fileurls.mm
goob2=translate(goob)
parse var goob . '://' bb1 '/' asel
baseurl=bb1
ii=lastpos('/',asel)
if ii=0 then
rootdir='/'
else
rootdir=left(asel,ii)
if pos('://',baseurl)=0 then baseurl='http://'||baseurl
rootdir=strip(rootdir)
if rootdir<>'/' then rootdir='/'||strip(rootdir,,'/')||'/'
baseurl=strip(strip(baseurl,'t','/'))
if robot_no(asel)=1 then iterate /* robot excluded */
if includer2<>"" then do /* only GET if in/under this directory */
if abbrev(goob2,includer2)=0 then iterate
end
if nocgi=1 then do /* cgi? then skip */
if pos('/CGI',asel)=1 then iterate
end
if nosearch=1 then do /* skip "search string" calls (usually to scripts*/
if pos('?',asel)>0 then iterate
end /* do */
if skip_prefix<>'' then do /* ignore if starts with this? */
if abbrev(asel,skip_prefix)=1 then iterate
end /* do */
ara=lastpos('.',asel);anext=''
if ara>0 then do /* check for html type of extentsion*/
anext=translate(strip(substr(asel,ara+1)))
end
if htmlexts<>'' & write_all<>1 then do /* only get possible htmls */
if pos(anext,htmlexts)=0 then iterate
end
if not_htmlexts<>"" & write_all<>1 then do /* don't get almost certainly NOT htmls */
if pos(anext,not_htmlexts)>0 then iterate
end /* do */
f1f=goob
if length(f1f)>40 then f1f='...'right(goob,36) /* shorten for display */
oof=''
if verbose>0 then oof=']--'||filespec('n',fileurls.mm.!ref)
if verbose>-1 then call printsay "Checking "bold||mm||normal||" of "fileurls.0")"||f1f||oof
/* get the url */
goo=time('e')
stuff=get_url(goob,,verbose,reqheaders)
goo2=time('e')
if goo2-goo>5 & verbose>0 then call printsay " .... done GETting "||cutstrg(goob,50)
if stuff="" then do
noconnects=noconnects+1
iterate
end /* do */
ngets=ngets+1
call extracts /* extract body and head */
/* look for return code -- response_code is set by extracts (icode here is unused) */
parse var response_line . icode .
r1=left(response_code,1)
if r1=4 | r1=5 | r1=1 then do /* error response */
n400s=n400s+1
iterate
end /* do */
/* get the content-type */
ss='!CONTENT-TYPE'
if translate(headers.ss)<>'TEXT/HTML' then do /* not html -- don't parse */
if write_all=1 then call url_to_file goob /* but possibly save to disk */
iterate /* don't bother parsing this */
end
/* does it satisfy the INCLUDER test? */
if includer<>"" then do
if abbrev(goob2,includer)=0 then do
call url_to_file goob
iterate /* don't bother parsing this */
end /* do */
end
/* extract links, but first write it to disk */
call url_to_file goob
if result=0 then iterate /* write failed/skipped -- do not parse */
/* if here, extract urls and add to list */
eek=fileurls.0
goo=time('e')
if verbose>0 then call printsay " .... parsing "||cutstrg(goob,50)
if r1=3 then do /* redirect -- extract location header */
ss='!LOCATION'
asd=strip(headers.ss)
if asd<>'' then do
stuff=stuff||'<a href="'asd'"> ' /* convert location header to link (a small hack */
end /* do */
end /* do */
foo=urls_in(stuff,baseurl,rootdir,goob)
goo2=time('e')
if goo2-goo>5 & verbose>0 then call printsay " ... done parsing "||cutstrg(goob,50)
nparsed=nparsed+1
if verbose>1 then do
if eek<fileurls.0 & verbose>0 then call printsay " new links to check: "bold||(fileurls.0-eek)||normal
end
end /* ******* Read a url */
/**** Status info -- final run summary, then exit */
call printsay ' '
call printsay ' ------- Status: '
call printsay "Total number of unique URLs: "fileurls.0
call printsay "Total number retrieval attempts: " ngets '(400s='n400s'. No Connect='noconnects')'
call printsay "Total number of parsed pages: "nparsed
call printsay "Total number of files written: " nwritten '(files deleted='ndeleted')'
call printsay " "
call printsay "Reminder: files are written to "bold||destdir||normal
if logfile<>0 then do
say ' ** The log file is: ' logfile
call lineout logfile /* lineout with no line closes the log file */
end
exit
/********/
/* modify configuration parameters */
modify_config:
/* interactive editor for the configuration globals listed in PARAMS.
Uses value() to read/assign them by name. Returns 0.
(fix: "reqheaders" was listed twice in params; the "?" help now also
describes REQHEADERS and ROBOT_CHECK, which were settable but undocumented) */
params="def_tofile htmlexts logfile not_Htmlexts overwrite robot_check reqheaders "
params=params||"verbose nocgi nosearch remove_prefix skip_prefix"
params=translate(params)
say
do forever
aa=getstring("Select a parameter to modify (?=list,??=current values, X=done)","?",reverse" -->"normal)
if aa="?" then do
say
say " "reverse"Configuration Parameters: "normal
say bold" DEF_TOFILE"normal"= default filename, used when a URL does not contain a filename"
say bold" HTMLEXTS"normal"= HTML extensions (if quick mode selected, only files with these "
say " extensions are retrieved)"
say bold" LOGFILE"normal"= Name of logfile (results are recorded here)"
say bold" NOCGI"normal"= If 1, do NOT retrieve URLs containing /CGI (cgi-bin scripts)"
say bold" NOSEARCH"normal"= If 1, do NOT retrieve URLs that end with a ?xxxx "
say bold"NOT_HTMLEXTS"normal"= non-HTML extensions (if quick mode selected, files with these "
say " extensions are ignored)"
say bold" OVERWRITE"normal"= If 1, then overwrite preexisting files "
say bold"REMOVE_PREFIX"normal"= If the URL's path starts with this, then trim the "
say " beginning of the path (remove everything up to the first /) "
say bold" REQHEADERS"normal"= Extra request header(s) sent with each GET "
say bold" ROBOT_CHECK"normal"= If 1, honor the site's /ROBOTS.TXT exclusions "
say bold" SKIP_PREFIX"normal"= If the URL's path starts with this, then skip it "
say bold" VERBOSE"normal"= If 1, verbose mode "
say
iterate
end /* do */
if aa="??" then do
say
say " "reverse"Current values of configuration Parameters: "normal
say bold" DEF_TOFILE"normal"= "def_tofile
say bold" HTMLEXTS"normal"= "htmlexts
say bold" LOGFILE"normal"= "logfile
say bold" NOCGI"normal"= "nocgi
say bold" NOSEARCH"normal"= "nosearch
say bold"NOT_HTMLEXTS"normal"= "not_htmlexts
say bold" OVERWRITE"normal"= "overwrite
say bold"REMOVE_PREFIX"normal"= "remove_Prefix
say bold" REQHEADERS"normal"= "reqheaders
say bold" ROBOT_CHECK"normal"= "robot_check
say bold" SKIP_PREFIX"normal"= "skip_prefix
say bold" VERBOSE"normal"= "verbose
say
say "Note: you can permanently change these values by editing GRABSITE.CMD"
say
iterate
end /* do */
aa=translate(strip(aa))
if aa='X' then leave
if wordpos(aa,params)=0 then do
say "No such parameter: " aa
end /* do */
else do
aaold=value(aa)
bb=getstring("Enter new value for "aa,aaold,bold" --->"normal)
foo=value(aa,bb) /* assign the global by name */
end
end
return 0
/********/
/* get and set includer and includers2 */
get_includers:
/* prompts for, normalizes, and uppercases the two scope tests:
includer = only GET & parse & save URLs under this prefix
includer2 = only GET & save URLs under this prefix
Also sets global len_includer2 (used by url_to_file). Returns 0.
(fix: prompt typo "that being with" -> "that begin with") */
include1: nop
if includer='' then do
includer=getstring(" Only GET & examine & save urls in or under (? for help) ",rootdir,reverse' 3)'normal)
end
if includer="?" then do
call help_includer
includer=''
signal include1
end /* do */
includer=translate(includer)
include2: nop
if includer2='' then do
includer2=getstring(" Only GET & save urls that begin with ",includer,reverse' 3b)'normal)
end
if includer2="?" then do
call help_includer
includer2=''
signal include2
end /* do */
includer2=translate(includer2)
if includer='' then
includer=baseurl||rootdir
else
includer=baseurl||'/'strip(includer,'l','/')
say
call printsay "Only examining URLs in/under: "includer
if includer2='' then
includer2=baseurl||'/'
else
includer2=baseurl||'/'strip(includer2,'l','/')
call printsay "Only retrieving URLs in/under: "includer2
includer=translate(includer)
includer2=translate(includer2)
len_includer2=length(includer2)
say
return 0
/**************************************************/
/* copy a url to a file */
url_to_file:
/* write the GETten contents (global STUFF) to a local file derived from AFIL.
Returns 1 if the file was written (or a pre-existing copy was kept),
0 if it was skipped or could not be written.
Reads globals: stuff includer2 len_includer2 destdir def_tofile overwrite verbose
Updates globals: ndeleted nwritten */
parse arg afil
goob2=translate(afil)
if includer2<>"" then do /*relative to includer2 directory */
/* fix: +1 so we start just PAST includer2. The old substr(goob2,len_includer2)
started ON includer2's last character, which was only correct when
includer2 happened to end with '/'; any leading separator is stripped below */
tofile=substr(goob2,len_includer2+1)
end
else do
parse var afil . '://' . '/' tofile
end /* do */
if tofile='' | right(tofile,1)='/' then tofile=tofile||def_tofile /* directory URL -> default name */
/* save to destdir */
tofile=translate(tofile,'\','/')
tofile=strip(strip(tofile),'l','\')
tofile2=destdir||tofile
todir=filespec('d',tofile2)||filespec('p',tofile2)
mkit=sysmkdir2(todir) /* make sure the target directory tree exists */
yow=stream(tofile2,'c','query exists')
if yow<>'' then do
if overwrite=2 then do /* keep & reuse the existing copy */
if verbose>-1 then call printsay " "||cy_ye||tofile2||normal " old version used."
return 1 /* use old copy */
end
if overwrite=1 then do
if verbose>0 then call printsay " .... deleting "tofile2
foo=sysfiledelete(tofile2)
ndeleted=ndeleted+1
end /* do */
else do
call printsay " > "tofile2 " exists; "bold"skipping "normal
return 0
end /* do */
end /* do */
foo=stream(tofile2,'c','open write')
wow=charout(tofile2,stuff,1)
if wow<>0 then do
call printsay "ERROR: could not write "tofile2
return 0
end /* do */
foo=stream(tofile2,'c','close')
if foo="READY:" then do
if verbose>-2 then call printsay " "||cy_ye||tofile2||normal " written."
end /* do */
nwritten=nwritten+1
return 1 /* sets globals */
/********************/
/* search a file, find IMG SRC= and A HREF= urls. Add BASEURL if
no / or http://.../ at beginning of URL */
/* fix 1: VERBOSE added to the expose list -- it was used at two "if verbose=1"
tests below but never exposed, so those status messages could never appear.
fix 2: iurls is initialized, so the return value is defined even when the
document contains no links (it used to return the uninitialized 'IURLS'). */
urls_in:procedure expose fileurls. flist. remove_prefix bold normal logfile reverse cy_ye verbose
parse arg stuff, baseurl,rootdir,stuffname
iurls=fileurls.0 /* current list size; also the value returned if nothing is found */
/* remove comments */
body=""
do forever /*no comments within comments are allowed */
if stuff="" then leave
parse var stuff t1 '<!-- ' t2 '-->' stuff
body=body||t1
end /* do */
stuff=body
body=''
if verbose=1 then call printsay "Parsing "||length(stuff)||' characters'
/* find all IMG SRC= and A HREF=, FRAME= throw away internal links */
do until stuff=""
parse var stuff . '<' anarg '>' stuff
aref=afindsrc(anarg) /* returns the tag's URL, or '' */
if aref='' then iterate
uaref=translate(aref)
if abbrev(uaref,'MAILTO:')=1 then iterate /* only keep https */
if abbrev(uaref,'FTP:')=1 then iterate
if abbrev(uaref,'GOPHER:')=1 then iterate
/* fix up name to be fully qualified url */
select
when abbrev(translate(aref),'HTTP://')=1 then nop
when abbrev(aref,'/')=1 then aref=baseurl||aref
otherwise aref=baseurl||rootdir||aref /* relative link */
end
/* check for remove_prefix entries */
if remove_prefix<>'' then do
parse var aref a1 '://' a2 '/' aaurl
if abbrev(translate(aaurl),translate(remove_prefix))=1 then do
parse var aaurl . '/' aaurl
aref=a1'://'a2'/'aaurl
if verbose=1 then call printsay " > " remove_prefix "removal yields: "aref
end /* do */
end /* do */
/* record this entry only if not yet recorded -- else, just increment counter */
uaref=translate(aref)
if datatype(flist.uaref)<>'NUM' then flist.uaref=0
flist.uaref=1+flist.uaref
flist.0=flist.0+1
if flist.uaref=1 then do
iurls=fileurls.0+1
fileurls.iurls=aref
fileurls.iurls.!ref=stuffname
fileurls.0=iurls
end
end /* do */
return iurls
/*****************/
/* get a string from user */
/* prompt the user (with highlighted prompt0 prefix and a default value);
returns the typed answer, or DEF if the user just hits ENTER.
(fix: three branches used the raw NORMAL/BOLD globals instead of the
guarded ANORMAL/ABOLD copies, so when ANSI setup had not run the literal
strings "NORMAL"/"BOLD" would be printed -- all branches now use the guards) */
getstring:procedure expose normal bold reverse logfile cy_ye
parse arg prompt,def,prompt0
abold=bold
if bold="BOLD" then abold='' /* guard: variable never initialized */
anormal=normal
if normal='NORMAL' then anormal=''
l1=length(prompt)
l2=length(def)
if l1+l2>38 then do /* too long for one line: split prompt and default */
say prompt0' 'abold||prompt||anormal
if l2>22 then do
say ' (ENTER='abold||def||anormal')'
call charout, abold" ? "anormal
parse pull ans
end /* do */
else do
call charout,' (ENTER='abold||def||anormal')? '
parse pull ans
end
end
else do
call charout,prompt0' 'abold||prompt||anormal' (ENTER='abold||def||anormal')? '
parse pull ans
end
if ans='' then ans=def
return ans
/* ---------------------------------------------*/
/* get a url from some site, return first
maxchar characters (if maxchar missing, get 10million (the whole thing?)
call as: stuff=get_url(aurl,maxchar,verbose,headers)
where:
aurl: the url to GET (required)
the other 3 are optional:
maxchar: max chars to get (default=10,000,000)
verbose: verbose mode (default=OFF)
headers: list of extra request headers, CRLF delimited
*/
/* ---------------------------------------------*/
get_url:procedure expose logfile bold normal reverse cy_ye
/* GET aurl (http:// only) and return the raw response (headers + body),
or '' on any failure. MAXCHAR limits the approximate size read.
HEADERS are optional extra request headers, CRLF-delimited.
(fix: the resolve/connect failure paths returned 0, but every caller
tests for the empty string -- a '0' would have been treated as content;
also the socket is now closed when the connect fails) */
parse arg aurl,maxchar,verbose,headers
if maxchar="" then maxchar=10000000
got=""
if abbrev(translate(aurl),'HTTP://')=0 then do
if verbose>0 then call printsay "Error: URL not properly specified (it must begin with HTTP://)"
return ''
end
parse var aurl . '://' server '/' request
if VERBOSE>1 then call printsay " GETting http url : " server ", " request
/* now get the url. This requires the RxSock.DLL be in your LIBPATH. */
/* Load RxSock */
if \RxFuncQuery("SockLoadFuncs") then nop /* already registered */
else do
call RxFuncAdd "SockLoadFuncs","rxSock","SockLoadFuncs"
call SockLoadFuncs
end
crlf ='0d0a'x /* constants */
family ='AF_INET'
httpport=80
rc=sockgethostbyname(server, "serv.0") /* get dotaddress of server */
if rc=0 then do
call printsay ' Unable to resolve "'server'"'
return '' /* fix: was 0 */
end
dotserver=serv.0addr /* .. */
gosaddr.0family=family /* set up address */
gosaddr.0port =httpport
gosaddr.0addr =dotserver
gosock = SockSocket(family, "SOCK_STREAM", "IPPROTO_TCP")
/* Set up request */
message="GET /"request' HTTP/1.0 'crlf||'Host: 'server||crlf
if length(headers)>2 then do
if right(headers,2)=crlf then headers=left(headers,length(headers)-2)
end
if headers<>'' then message=message||headers||crlf
message=message||crlf /* blank line ends the request */
got=''
rc = SockConnect(gosock,"gosaddr.0")
if rc<0 then do
call printsay ' Unable to connect to "'server'"'
foo = SockClose(gosock) /* fix: do not leak the socket */
return '' /* fix: was 0 */
end
rc = SockSend(gosock, message)
/* Now wait for the response */
do r=1 by 1
rc = SockRecv(gosock, "response", 1000)
got=got||response
if rc<=0 then leave /* connection closed or error */
tmplen=length(got)
if tmplen> maxchar then leave
end r
rc = SockClose(gosock)
return got
/* --- Load the function library, if necessary --- */
loaddll:
/* register the RxSock and RexxUtil function libraries if needed, then
probe for ANSI support and set the global color-code strings.
Note: RxFuncQuery returns 1 when the function is NOT yet registered. */
if RxFuncQuery("SockLoadFuncs")=1 then do /* not yet registered -- register now */
call RxFuncAdd "SockLoadFuncs","rxSock","SockLoadFuncs"
call SockLoadFuncs
end
foo=rxfuncquery('sysloadfuncs')
if foo=1 then do
call RxFuncAdd 'SysLoadFuncs', 'RexxUtil', 'SysLoadFuncs'
call SysLoadFuncs
end
/****
foo=rxfuncquery('rexxlibregister')
if foo=1 then do
call rxfuncadd 'rexxlibregister','rexxlib', 'rexxlibregister'
call rexxlibregister
end
foo=rxfuncquery('rexxlibregister')
if foo=1 then do
say " Could not find REXXLIB "
exit
end
***/
ansion=checkansi()
if ansion=1 then do
/* ANSI escape sequences used for screen highlighting */
aesc='1B'x
cy_ye=aesc||'[37;46;m'
normal=aesc||'[0;m'
bold=aesc||'[1;m'
re_wh=aesc||'[31;47;m'
reverse=aesc||'[7;m'
end
else do
say " Warning: Could not detect ANSI.... output will look ugly ! "
cy_ye="" ; normal="" ; bold="" ;re_wh="" ;
reverse=""
end /* Do */
return 1
/* -------------------- */
/* get a yes or no , return 1 if yes */
yesno:procedure expose normal reverse bold logfile cy_ye
/* ask FOOA as a yes/no question; returns 1=yes, 0=no, 2=All (if allopt=1).
ALTANS may supply two alternative answer words (e.g. 'Stop Go').
Note: PULL uppercases the reply, so answers match the uppercase letters. */
parse arg fooa , allopt,altans
if altans<>" " & words(altans)>1 then do
w1=strip(word(altans,1))
w2=strip(word(altans,2))
a1=left(w1,1) ; a2=left(w2,1) /* first letter = hot key */
a1a=substr(w1,2) ; a2a=substr(w2,2)
end
else do
a1='Y' ; a1a='es'
a2='N' ; a2a='o'
end /* Do */
ayn=' '||bold||a1||normal||a1a||'\'||bold||a2||normal||a2a
if allopt=1 then ayn=ayn||'\'||bold||'A'||normal||'ll'
do forever /* re-ask until a recognized answer is given */
foo1=normal||reverse||fooa||normal||ayn
call charout, foo1 normal ':'
pull anans
if abbrev(anans,a1)=1 then return 1
if abbrev(anans,a2)=1 then return 0
if allopt=1 & abbrev(anans,'A')=1 then return 2
end
nocon:
/* NOTE(review): looks like an error-trap entry point for a "no connection"
condition (rc=-7) -- no SIGNAL to it is visible in this file; confirm */
if rc=-7 then return 0
exit 0
/* ------------------------------------------------------------------ */
/* function: Check if ANSI is activated */
CheckAnsi: PROCEDURE
/* returns 1 if ANSI support is reported active, 0 if not,
-1 if the check itself failed (error handler fired).
Runs the OS/2 ANSI command and scans its localized output via rxqueue. */
thisRC = -1
trace off
/* install a local error handler */
SIGNAL ON ERROR Name InitAnsiEnd
"@ANSI 2>NUL | rxqueue 2>NUL"
thisRC = 0
do while queued() <> 0
queueLine = lineIN( "QUEUE:" )
if pos( " on.", queueLine ) <> 0 | , /* USA */
pos( " (ON).", queueLine ) <> 0 then /* GER */
thisRC = 1
end /* do while queued() <> 0 */
InitAnsiEnd:
signal off error
RETURN thisRC
/*************************/
/* return 1 if adir is an existing (possibly empty) directory , 0 if not */
dosisdir2:procedure
/* return 1 if ADIR is an existing (possibly empty) directory, 0 if not.
Relative paths are qualified against the current drive/directory first. */
parse arg adir
adir=strip(adir)
adir=strip(adir,'t','\')
nowdir=directory()
nowdrive=filespec('d',nowdir'\')
nowpath=filespec('p',nowdir'\')
adr=filespec('d',adir)
if adr='' then do /* no drive letter given: qualify the path */
if abbrev(adir,'\')=0 then
adir=nowdrive||nowpath||adir
else
adir=nowdrive||adir
end /* do */
foo=sysfiletree(adir,goo,'D') /* NOTE(review): goo is uninitialized, so the stem name used is 'GOO' */
if goo.0>0 then return 1
return 0
/*************************************/
/* parse GETten stuff into globals:
response_line = the response line
response_code = the 200, 401, etc. code
headers. = list of response headers
stuff = the contents (the file)
*/
extracts:
/* split global STUFF (a raw HTTP response) into: response_line,
response_code (2nd word of the response line), headers.<!NAME>, and
STUFF itself (reduced to the body that follows the blank line).
Note: headers.0 accumulates the '!NAME' list -- it is NOT a count. */
cr='0a'x
parse var stuff response_line (cr) stuff
parse var response_line . response_code .
response_line=strip(response_line,,'0d'x)
headers.0=''
do forever
parse var stuff ahead (cr) stuff
ahead=strip(ahead,,'0d'x)
if ahead='' then leave /* blank line = end of headers */
parse var ahead name ':' aval
nn=translate('!'||name) /* '!' prefix avoids stem-name collisions */
headers.0=headers.0' 'nn
headers.nn=aval
end /* do */
/* remove html comments */
return 1
/* ------------- */
/* create a directory, arbitrarily deep.
Returns 0 if succes, otherwise returns an error code
adir: directory to create -- must be fully qualified.
verbose: if 1, will write some status stuff to screen
*/
sysmkdir2:procedure
/* create ADIR (arbitrarily deep); returns 0 on success, else an error code.
ADIR must be fully qualified. verbose=1 writes status to the screen. */
parse arg adir,verbose
adir=strip(adir,'t','\')
if dosisdir2(adir)=1 then do /* already exists */
if verbose=1 then say " Using pre-existing directory: "adir
return 0
end /* do */
ff=sysmkdir(adir) /* try the simple one-level create first */
if ff=0 then return ff
/* make the tree */
f2=adir'\'
dd=filespec('d',f2)
pp=filespec('p',f2)
if pp='\' | pp='' then return -1
pp2=strip(translate(pp,' ','\')) /* path components as words */
do mm=1 to words(pp2)
a1=subword(pp2,1,mm) /* build one level deeper each pass */
a1=translate(a1,'\',' ')
dd2=dd'\'a1
hoo=sysmkdir(dd2)
if hoo=0 & verbose=1 then call printsay ' ... creating: 'dd2
end /* do */
return hoo /* result of the deepest create */
/****************/
/* URL and DESTDIR help info */
helpme1:
/* show the URL / destination-directory help screen; waits for a key.
(fix: the example directory read "games10" in step b but "game10"
where it was defined -- made consistent) */
say
say bold"GrabSite"normal" is designed to copy a WWW site to your local hard disk. "
say
say "It's easy to use: just specify a URL, and then specify a directory"
say "on your hard drive to copy the web pages (and other files) retrieved"
say "from this WWW site."
say
say "For example: suppose the 'home page' is"
say " http://www.coolstuff.org/games/expert.htm"
say "and the 'destination directory' is:"
say " d:\localweb\game10 "
say "Then..."
say " a) GrabSite will GET (using socket calls) the /games/expert.htm HTML "
say " document at www.coolstuff.org."
say " b) A copy of /games/expert.htm will be written to d:\localweb\game10 "
say " c) /games/expert.htm will be scanned for links "
say " d) For each link found, repeat step a (changing names appropriately)"
say
say "Note: For hints on running from command line, run GrabSite with a ? argument."
say" Example: D:>GrabSite ? "
say
call charout,reverse"Hit any key to continue "normal
foo=sysgetkey('noecho')
say
return 1
/****************/
/* INCLUDER help info */
help_Includer:
/* show the scope-limiting (includer/includer2) help screen; waits for a key.
(fix: typo "thar" -> "that") */
say
say "You can, and should, limit the scope of "bold"GrabSite"normal"'s WWW downloads"
say "(If you don't, you could end up downloading a significant chunk of the WWW!)"
say
say "There are two tests used to limit scope: "
say
say " a) Limiting what URLS are "bold"downloaded"normal" and "bold"examined"normal"."
say " URLS that pass this test are retrieved (and saved to disk). "
say " If they are text/html documents they will also be 'parsed' --"
say " the links found in these text/html documents may also be retrieved."
say
say " b) Limiting what URLS are "bold"downloaded"normal", but "bold"not"normal" examined."
say " URLS that pass this less stringent test are downloaded (and saved to disk)."
say " They are "bold"not"normal" parsed -- links they may contain are ignored."
say
say " By using two tests, one can:"
say " i) 'Recursively GET' URLS that are in (or under) the directory "
say " of the 'home page' you selected. "
say " ii) Download & save (but not examine) files pointed to by these pages. "
say " For example, .GIF files stored on a different part of the site."
say
call charout,reverse"Hit any key to continue "normal
foo=sysgetkey('noecho')
say
return 1
/****************/
/* writeall help info */
help_writeall:
/* show the "HTML documents only" vs "retrieve all" help screen; waits for a key.
Reads globals htmlexts / not_htmlexts to show the rule actually in force. */
say
say "You can either: "
say " a) Download all documents, images, etc. from the site (more precisely,"
say " documents, etc. that satisfy the 'scope tests')"
say " b) Only download HTML documents "
say
say "The latter option is useful if you want a quick snapshot of the navigable"
say "portion of the site -- if you do not care about images, text files, and "
say "other such 'non-html' contents."
say
say "If you select this latter option, the following rule is used: "
if htmlexts<>'' then
say "Only retrieve links ending with: "htmlexts
else
say "Retrieve links that do NOT end with: "not_htmlexts
say
say cy_ye" Note: Configuration hint:"normal
say " You can modify this rule by changing the HTMLEXTS and NOT_HTMLEXTS parameters"
say
call charout,reverse"Hit any key to continue "normal
foo=sysgetkey('noecho')
say
return 1
/***************/
/* cut length of string to nn characters, if necessary */
cutstrg:procedure
/* shorten ASTR to at most ILEN characters, keeping the head and tail
with '...' in between; returns ASTR unchanged if it already fits.
(fixes: strings of exactly ILEN chars were needlessly mangled ('<' -> '<=');
the tail width was hard-coded for ILEN=50 -- now derived from ILEN,
with identical output for the existing ILEN=50 callers) */
parse arg astr,ilen
if ilen='' then return astr
if length(astr)<=ilen then return astr
if ilen<=17 then return left(astr,ilen) /* too short for head+'...'+tail */
aa=left(astr,14)'...'||right(astr,ilen-17)
return aa
/***************/
/* say, and possible lineout, output */
printsay:procedure expose logfile bold normal reverse cy_ye
/* display AVAL on screen; if logging is enabled, also append it to the
log file with all ANSI color-code strings stripped out. Returns 0. */
parse arg aval
say aval
plain=removestrg(removestrg(removestrg(removestrg(aval,bold),normal),reverse),cy_ye)
if logfile<>0 then call lineout logfile,plain
return 0
/***********************************/
/* examine the contents of ONE html tag (ANARG = text between < and >)
and return the single URL it references, or '' if none.
Handles BODY background=, IMG/FRAME/EMBED src=, A/AREA/LINK href=,
and APPLET code=/codebase=. Internal #jumps and javascript: are dropped. */
afindsrc:procedure
parse arg anarg
parse var anarg htype stuff
htype=translate(strip(htype)) /* element name, uppercased */
/* find all FRAME SRC=, IMG SRC= and A HREF=, throw away internal links */
chklist='BODY IMG A FRAME AREA EMBED LINK APPLET '
anctype=wordpos(htype,chklist)
if anctype=0 then return '' /* not a url containing element */
/* depending on anctype, look for the appropriate attribute */
select
when anctype=1 then do /* body background */
do forever
if anarg='' then return '' /* nothing found */
parse var anarg a1 anarg ; a1=strip(a1)
if abbrev(translate(a1),'BACKGROUND=')=0 then iterate
parse var a1 . '=' gotimg . ; gotimg=strip(strip(gotimg),,'"')
return gotimg
end /* do */
end /* i3>0 */
when anctype=2 then do /* img */
do forever
if anarg='' then return ''
parse var anarg a1 anarg ; a1=strip(a1)
if abbrev(translate(a1),'SRC=')=0 then iterate
parse var a1 . '=' gotimg . ; gotimg=strip(strip(gotimg),,'"')
return gotimg
end /* do */
end
when anctype=3 | anctype=5 | anctype=7 then do /* A AREA LINK */
do forever
if anarg='' then leave
parse var anarg a1 anarg ; a1=strip(a1)
if abbrev(translate(a1),'HREF=')=0 then iterate
parse var a1 . '=' gothref . ; gothref=strip(strip(gothref),,'"')
parse var gothref gothref '#' . /* toss out internal jumps */
if gothref="" then return ""
if abbrev(translate(gothref),'JAVASCRIPT:') then return "" /* don't do "javascript:" entries */
return gothref
end /* do */
end
when anctype=4 | anctype=6 then do /* FRAME EMBED */
do forever
if anarg='' then leave
parse var anarg a1 anarg ; a1=strip(a1)
if abbrev(translate(a1),'SRC=')=0 then iterate
parse var a1 . '=' gothref . ; gothref=strip(strip(gothref),,'"')
parse var gothref gothref '#' . /* toss out internal jumps */
if gothref="" then return ""
return gothref
end /* do */
end
when anctype=8 then do /* APPLET: combine CODEBASE= and CODE= */
abase=''; aref=''
do forever
if anarg='' then leave
parse var anarg a1 anarg ; a1=strip(a1)
if abbrev(translate(a1),'CODE=') + ,
abbrev(translate(a1),'CODEBASE=')=0 then iterate /* neither attribute: skip */
if abbrev(translate(a1),'CODEBASE=')=1 then do
parse var a1 '"' abase '"' .
end /* do */
else do /* CODE */
parse var a1 '"' aref '"'
end /* do */
if aref<>'' & abase<>'' then leave
end
if aref='' then return '' /* no CODE= found */
if abase<>'' then aref=strip(abase,'t','/')||'/'||strip(aref,'l','/')
return aref
end
otherwise return ''
end /* select */
return ''
/***********/
/* remove substring */
removestrg:procedure
/* return AVAL with every occurrence of the substring ASTR deleted.
If ASTR does not occur (or is empty), AVAL is returned unchanged. */
parse arg aval,astr
if pos(astr,aval)=0 then return aval
kept=''
do while aval<>''
ix=pos(astr,aval)
if ix=0 then leave
kept=kept||left(aval,ix-1) /* text before this occurrence */
aval=substr(aval,ix+length(astr)) /* skip past the occurrence */
end
return kept||aval
/**************/
/******************************/
/* parse a robots.txt file,
The algorithm:
1 ignore # lines (comments)
2a look for user-agent: grabsite lines
2b if none, look for user-agent:* lines
3 if neither 2a nor 2b matches, then no robot disallows exist
4 otherwise, look for disallow lines, starting from
the user-agent line, until the first empty line (use 0a as line delimiter,
and throw away the 0d)
5 add the asel from each disallow: line to the exclusion list
---------------
# samples robots.txt -- will add cgi-* to exclusion_list
user-agent: mozilla
Disallow: /samples
Disallow: /stuff/
#user-agent: checklink
user-agent:gizmo
disallow:fes/
user-agent:*
disallow:cgi-
---------------
*/
add_robot:procedure expose verbose
/* parse a robots.txt HTTP response (ABODY) and return a space-delimited
list of the Disallow: path prefixes that apply to this robot (uppercased,
leading / stripped). Returns '' for a non-200 response / empty body /
no entries, and ' ' when entries exist but none match this user-agent.
(fix: the user-agent match was 'CHECKLINK' -- a leftover from another
tool; this robot sends "User-agent: GrabSite", so it never matched
its own record and always fell through to the '*' record) */
parse arg abody
parse var abody . icode .
if left(strip(icode),1)<>2 then return '' /* not 200 code, so no disallows */
cr='0a'x
do forever /* get rid of response header */
if abody='' then return '' /* nothing in body */
parse var abody al1 (cr) abody
al1=strip(al1,,'0d'x)
if al1='' then leave /* found empty line*/
end
nn=0
do forever /* collect body lines, stripping 0d and trailing # comments */
if abody='' then leave
parse var abody al1 (cr) abody
al1=strip(al1,,'0d'x)
if al1='#' then iterate
parse var al1 al1a '#' .
nn=nn+1
lins.nn=al1a
end
if nn=0 then return '' /* no entries, return */
lins.0=nn
/* look for GRABSITE, or *, user-agent */
iat=0
do mm=1 to lins.0
al=strip(lins.mm)
if abbrev(translate(al),'USER-AGENT')=0 then iterate
parse var al . ':' dagent ; dagent=translate(strip(dagent))
if abbrev(dagent,'GRABSITE')=1 then do /* our own record wins outright */
iat=mm
leave
end
if dagent='*' then do /* remember the catch-all record, keep scanning */
iat=mm
end /* do */
end /* do */
exlist2=''
if iat=0 then return ' ' /* no matching user-agent */
do mm=iat+1 to lins.0
al=translate(strip(lins.mm))
if al='' then leave /* blank line signals end of "record" */
if abbrev(al,'DISALLOW')<>1 then iterate
parse var al . ':' dasel ; dasel=strip(dasel)
exlist2=exlist2||' '||strip(dasel,'l','/')
end /* do */
return exlist2
/*******************/
/* compare arg against "robot" exclist. -- return 1 if a match
(fix: this comment's opening delimiter was '/ ' instead of '/*',
leaving a malformed clause with a stray comment terminator) */
robot_No:procedure expose exclist.
/* ASEL is a url path (leading / ignored); returns 1 if it falls under any
robots.txt exclusion recorded in exclist. (entries are uppercased). */
parse upper arg asel
asel=strip(asel,'l','/')
do mm=1 to exclist.0
tt=exclist.mm
if abbrev(asel,tt)=1 then return 1
end /* do */
return 0