home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
OS/2 Shareware BBS: 35 Internet
/
35-Internet.zip
/
grabsite.zip
/
grabsite.cmd
next >
Wrap
OS/2 REXX Batch file
|
1999-08-18
|
39KB
|
1,365 lines
/* This will take an HTML document and find all <a and <img links.
It will not find FORM-based links.
*/
/********** BEGIN USER changeable parameters ***********/
/* default "base url" -- only used if a file:// url is entered.
For example, to start at d:\www\guide.htm, you'd enter
file://d:\www\guide.htm
You'd then need to enter the "default" address (for use with
relative URLs contained in this file)
*/
defbaseurl='http://www/'
/* default root directory -- only used if a file:// url is entered
*/
defrootdir='/'
/* default name (for urls that end with / ).
For example,
given a link of "<a href="/sports/scoreboard/">
and def_tofile='index.htm'
then the contents of this url would be written to:
destination_dir\sports\scoreboard\index.htm */
def_tofile='INDEX.HTM'
/* If "HTML documents only" mode is selected, then
only links ending with these extensions are downloaded, examined, and written.
Notes:
* In all cases, if the content-type header is NOT text/html,
the contents will NOT be parsed for further links.
* If HTMLEXTS='', then this test is not performed
(NOT_HTMLEXTS, below, is used instead) */
htmlexts='SHTML SHT HTM HTML HTML-SSI HTM-SSI'
/* log file. If none desired, set=0. Otherwise, enter a
filename. Note that old log files will be deleted/overwritten */
logfile='GRABSITE.LOG'
/* nocgi=1 to skip CGI urls (that have a /CGI in their path) */
nocgi=1
/* nosearch=1 to skip urls that end with ?xxx
(where xxx is a string of any length) */
nosearch=1
/* If "HTML documents only" mode is selected, then
links ending with these extensions are NOT downloaded.
Notes:
* If "retrieve all links" mode is specified, then both
htmlexts and not_htmlexts are ignored
* If HTMLEXTS<>'', then this test is not performed
(HTMLEXTS takes precedence over NOT_HTMLEXTS) */
not_htmlexts='JPG GIF BMP ZIP GZ TIF TIFF MOV AU EXE COM WAV XBM PDF PS EPS '
/* overwrite=1 means "overwrite preexisting files";
overwrite=2 means "keep and reuse a preexisting copy";
any other value means "skip files that already exist"
*/
overwrite=1
/* optional request header(s) to send to servers
Note: use '0d0a'x to separate multiple request headers */
reqheaders='User-agent: GrabSite'
/* if URL's path starts with remove_prefix, then trim
the beginning of the path (remove everything up to the first /)
For example,
if remove_prefix='!RANGE'
and a link is /!RANGE:bytes=100-200/surplus/prices.lst
then /surplus/prices.lst is used */
remove_prefix='!RANGE'
/* If robot_check=1, then check for a /ROBOTS.TXT file. This contains
instructions on what paths should not be visited by "web robots".
*/
robot_check=1
/* if URL's path starts with skip_prefix, then skip it
This is only needed when the "retrieve" test is /. */
skip_prefix='!'
/* Status reports:
-2 for NO status output, -1 for minimal, 0 for average
1 for some, 2 for too much */
verbose=1
/********** END USER changeable parameters ***********/
/* ---- startup: command-line parsing, help screen, global initialization ---- */
parse arg afile destdir includer includer2 write_all
cmdline=0
if afile<>'' then do
afile=translate(afile,'/','\') /* \ accepted on the command line to avoid shell trouble */
cmdline=1
end /* do */
write_all0=write_all /* remember if Get_all came from the command line */
if afile='?' then do
say "GrabSite -- GET a linked set of pages from the WWW"
say
say "Calling syntax: GrabSite URL DestDir Test1 Test2 Get_all "
say " where:"
say " URL = a fully qualified URL (the home page to start at)"
say " DestDir = destination directory (on local disk) to write results to"
say " Test1 = only parse documents in/under this prefix "
say " Test2 = only retrieve documents in/under this prefix "
say " Get_all = if 0, then do NOT get non-html documents "
say " "
say " Note: to avoid command line problems: use \ instead of /"
say " "
say "Example: "
say " D:>grabsite http:\\fu.br.net\circ\index.htm d:\foob \circ\ \ 1 "
say
say "Or .. enter without arguments for user prompts"
exit
end /* do */
/* initialize some stuff */
baseurl=''
rootdir=''
includer=translate(translate(includer,'/','\')) /* normalize slashes, then uppercase */
includer2=translate(translate(includer2,'/','\'))
remove_prefix=translate(remove_prefix)
skip_prefix=translate(skip_prefix)
htmlexts=translate(htmlexts)
not_htmlexts=translate(not_htmlexts)
/* run counters reported in the final status summary */
ndeleted=0
nwritten=0 ; noconnects=0
ngets=0; n400s=0
nparsed=0
crlf='0d0a'x
fileurls.0=0 /* fileurls. = ordered work list of URLs to visit */
flist.0=0 /* flist.<UPPERCASED-URL> = times each URL was seen (dedupe) */
call loaddll /* load some dlls, set some parameters */
say
say " "cy_ye"GrabSite -- GET a set of linked documents from a WWW site"normal
say
if logfile=0 | logfile=' ' then do
logfile=0
end /* do */
else do
aa=stream(logfile,'c','query exists')
if aa<>'' then do
foo=sysfiledelete(logfile) /* old log is always discarded */
if verbose>0 then say "Old logfile deleted: "logfile
call lineout logfile,'GrabSite log file. Created '||time('n')||' '||date('n')
end /* do */
end
/***** determine file/url to read, and other info.
Sets: afile, afile_isurl, baseurl (http://site), rootdir (/dir/.../) */
say
jump1: nop
if afile='' then do
afile=getstring("Home page to grab, or enter ? for a brief description.",'?',reverse' 1)'normal)
if afile='?' then do
call helpme1
afile=''
signal jump1
end /* do */
end
afile=strip(afile)
afileu=translate(strip(afile))
if abbrev(afileu,'FILE://') then do /* jump-start from a local file */
parse var afile . '://' afile
afileu=translate(afile)
afile_isurl=0
if stream(afile,'c','query exists')="" then do
call printsay "No such file: "afile
exit
end /* do */
call printsay " ... reading "||cutstrg(afile,50) "...."
stuff=charin(afile,1,chars(afile)) /* slurp entire local file into STUFF */
afile_isurl=0
if baseurl='' then baseurl=getstring("Default site (the dotted ip address)",defbaseurl,reverse' 1a)'normal)
if rootdir='' then rootdir=getstring("Default 'root' directory ",defrootdir,reverse' 1b)'normal)
end /* local file as base */
else do /* it's a url */
if abbrev(afileu,'HTTP://')<>1 then do
afile='http://'afile /* assume http:// if no scheme given */
afileu=translate(afile)
end /* do */
afile_isurl=1
parse var afile . '://' bb1 '/' bb2
baseurl=bb1
ii=lastpos('/',bb2)
if ii=0 then
rootdir='/'
else
rootdir=left(bb2,ii) /* directory part of the start page */
end /* url entry */
if pos('://',baseurl)=0 then baseurl='http://'||baseurl
rootdir=strip(rootdir)
if rootdir<>'/' then rootdir='/'||strip(rootdir,,'/')||'/' /* ensure /dir/ form */
baseurl=strip(strip(baseurl,'t','/')) /* no trailing slash on the site part */
/* destination directory: prompt if not given, create it if needed */
atdestdir: nop
if destdir='' then do
destdir=getstring("Enter a destination directory ",directory(),reverse' 2)'normal)
if destdir="?" then do
call helpme1
destdir=''
signal atdestdir
end /* do */
didit=sysmkdir2(destdir,1)
if didit<>0 then do
say "Could not access, or create, "destdir
exit
end /* do */
end
else do
destdir=strip(strip(destdir),'t','\')'\'
didit=sysmkdir2(destdir,1)
if didit<>0 then do
say "Could not access, or create, "destdir
exit
end /* do */
end
destdir=strip(strip(destdir),'t','\')'\' /* normalized: always ends with a single \ */
/* get and set includers variables */
call get_includers
/* Quick/skeleton mode */
getquick:nop
if cmdline<>1 then do
do until write_all<>''
al=getstring(' HTML documents only (Yes, No, or ? for help)','N',reverse' 4)'normal)
al=strip(translate(al))
if al='?' then do
al=''
call help_writeall
iterate
end
if abbrev(al,'N')=1 then /* "No" to html-only means retrieve ALL links */
write_all=1
else
write_all=0
end
if write_all=0 then
call printsay "Ignoring non-html documents"
else
call printsay "Retrieving all links "
say
/* modify other parameters */
if write_all0='' then do
if yesno(" Would you like to modify configuration parameters?")=1 then do
call modify_config
end /* do */
end
end
/************** Done with user input **********/
/******* copy file/url to destdir */
/* if local file, copy directly to destdir;
if url, then maybe copy relative to destdir
*/
if afile_isurl=0 then do /* local file -- jump start*/
ff=translate(afile,' ','\/')
ff2=word(ff,words(ff)) /* last path component = bare filename */
ff2=destdir||ff2
say bold"Saving to "normal|| ff2
foo=translate(stream(ff2,'c','open write'))
if foo<>'READY:' then do
say "Could not open file for writing. Error was: " foo
exit
end /* do */
foo=charout(ff2,stuff,1)
if foo<>0 then do
say "Error. Problem writing file "
exit
end /* do */
foo=stream(ff2,'c','close')
goo=time('e') /* get stuff from file */
foo=urls_in(stuff,baseurl,rootdir,afile) /* seed work list from the local file's links */
goo2=time('e')
if verbose>0 then do
if goo2-goo>5 then call printsay " ... done parsing "||cutstrg(afile,50)
call printsay ' '
call printsay " " cy_ye " # links in "normal||bold||afile"="normal||" "||fileurls.0
end
nparsed=1
end
else do /* a url -- the work list starts with just the home page */
iurls=1
uaref=translate(strip(afile))
flist.uaref=1
flist.0=1
fileurls.iurls=afile
fileurls.iurls.!ref='user' /* .!ref records which page referenced this URL */
fileurls.0=iurls
end /* do */
/**** get a robot.txt file first? */
if robot_check=1 then do
aurl=baseurl'/robots.txt'
rlist=get_url(aurl)
exclist=add_robot(rlist) /* space-delimited list of disallowed path prefixes */
if verbose>0 then do
call printsay "Excluding: "exclist ; call printsay ' '
end
exclist.0=0
if exclist<>'' then do
do ii=1 to words(exclist)
exclist.ii=translate(strip(word(exclist,ii)))
end /* do */
exclist.0=words(exclist)
end /* do */
end /* build exclist. */
/************ Get urls in first file/url */
call printsay ' '
if write_all=0 then
call printsay ' Examining html links starting from:'||bold||afile||normal
else
call printsay ' Examining links starting from:'||bold||afile||normal
call printsay ' '
/********** now get the urls, parse, add to list....
Main crawl loop: fileurls.0 GROWS while this loop runs (urls_in appends
newly-discovered links), so a manual index mm is used instead of a DO mm=1 TO. */
mm=0
do forever
mm=mm+1
if mm>fileurls.0 then leave /* work list exhausted */
goob=fileurls.mm
goob2=translate(goob)
parse var goob . '://' bb1 '/' asel
baseurl=bb1
ii=lastpos('/',asel)
if ii=0 then
rootdir='/'
else
rootdir=left(asel,ii)
if pos('://',baseurl)=0 then baseurl='http://'||baseurl
rootdir=strip(rootdir)
if rootdir<>'/' then rootdir='/'||strip(rootdir,,'/')||'/'
baseurl=strip(strip(baseurl,'t','/'))
if robot_no(asel)=1 then iterate /* robot excluded */
if includer2<>"" then do /* only GET if in/under this directory */
if abbrev(goob2,includer2)=0 then iterate
end
if nocgi=1 then do /* cgi? then skip */
if pos('/CGI',asel)=1 then iterate
end
if nosearch=1 then do /* skip "search string" calls (usually to scripts*/
if pos('?',asel)>0 then iterate
end /* do */
if skip_prefix<>'' then do /* ignore if starts with this? */
if abbrev(asel,skip_prefix)=1 then iterate
end /* do */
ara=lastpos('.',asel);anext=''
if ara>0 then do /* check for html type of extentsion*/
anext=translate(strip(substr(asel,ara+1)))
end
if htmlexts<>'' & write_all<>1 then do /* only get possible htmls */
if pos(anext,htmlexts)=0 then iterate
end
if not_htmlexts<>"" & write_all<>1 then do /* don't get almost certainly NOT htmls */
if pos(anext,not_htmlexts)>0 then iterate
end /* do */
f1f=goob
if length(f1f)>40 then f1f='...'right(goob,36) /* shorten for display */
oof=''
if verbose>0 then oof=']--'||filespec('n',fileurls.mm.!ref)
if verbose>-1 then call printsay "Checking "bold||mm||normal||" of "fileurls.0")"||f1f||oof
/* get the url */
goo=time('e')
stuff=get_url(goob,,verbose,reqheaders)
goo2=time('e')
if goo2-goo>5 & verbose>0 then call printsay " .... done GETting "||cutstrg(goob,50)
if stuff="" then do
noconnects=noconnects+1
iterate
end /* do */
ngets=ngets+1
call extracts /* extract body and head */
/* look for return code -- response_code is set by extracts (icode here is unused) */
parse var response_line . icode .
r1=left(response_code,1)
if r1=4 | r1=5 | r1=1 then do /* error response */
n400s=n400s+1
iterate
end /* do */
/* get the content-type */
ss='!CONTENT-TYPE'
if translate(headers.ss)<>'TEXT/HTML' then do /* not html -- don't parse */
if write_all=1 then call url_to_file goob /* but possibly save to disk */
iterate /* don't bother parsing this */
end
/* does it satisfy the INCLUDER test? */
if includer<>"" then do
if abbrev(goob2,includer)=0 then do
call url_to_file goob
iterate /* don't bother parsing this */
end /* do */
end
/* extract links, but first write it to disk */
call url_to_file goob
if result=0 then iterate /* write failed/skipped -- do not parse */
/* if here, extract urls and add to list */
eek=fileurls.0
goo=time('e')
if verbose>0 then call printsay " .... parsing "||cutstrg(goob,50)
if r1=3 then do /* redirect -- extract location header */
ss='!LOCATION'
asd=strip(headers.ss)
if asd<>'' then do
stuff=stuff||'<a href="'asd'"> ' /* convert location header to link (a small hack */
end /* do */
end /* do */
foo=urls_in(stuff,baseurl,rootdir,goob)
goo2=time('e')
if goo2-goo>5 & verbose>0 then call printsay " ... done parsing "||cutstrg(goob,50)
nparsed=nparsed+1
if verbose>1 then do
if eek<fileurls.0 & verbose>0 then call printsay " new links to check: "bold||(fileurls.0-eek)||normal
end
end /* ******* Read a url */
/**** Status info -- final run summary, then exit */
call printsay ' '
call printsay ' ------- Status: '
call printsay "Total number of unique URLs: "fileurls.0
call printsay "Total number retrieval attempts: " ngets '(400s='n400s'. No Connect='noconnects')'
call printsay "Total number of parsed pages: "nparsed
call printsay "Total number of files written: " nwritten '(files deleted='ndeleted')'
call printsay " "
call printsay "Reminder: files are written to "bold||destdir||normal
if logfile<>0 then do
say ' ** The log file is: ' logfile
call lineout logfile /* lineout with no line closes the log file */
end
exit
/********/
/* modify configuration parameters */
modify_config:
/* interactive editor for the configuration globals listed in PARAMS.
Uses value() to read/assign them by name. Returns 0.
(fix: "reqheaders" was listed twice in params; the "?" help now also
describes REQHEADERS and ROBOT_CHECK, which were settable but undocumented) */
params="def_tofile htmlexts logfile not_Htmlexts overwrite robot_check reqheaders "
params=params||"verbose nocgi nosearch remove_prefix skip_prefix"
params=translate(params)
say
do forever
aa=getstring("Select a parameter to modify (?=list,??=current values, X=done)","?",reverse" -->"normal)
if aa="?" then do
say
say " "reverse"Configuration Parameters: "normal
say bold" DEF_TOFILE"normal"= default filename, used when a URL does not contain a filename"
say bold" HTMLEXTS"normal"= HTML extensions (if quick mode selected, only files with these "
say " extensions are retrieved)"
say bold" LOGFILE"normal"= Name of logfile (results are recorded here)"
say bold" NOCGI"normal"= If 1, do NOT retrieve URLs containing /CGI (cgi-bin scripts)"
say bold" NOSEARCH"normal"= If 1, do NOT retrieve URLs that end with a ?xxxx "
say bold"NOT_HTMLEXTS"normal"= non-HTML extensions (if quick mode selected, files with these "
say " extensions are ignored)"
say bold" OVERWRITE"normal"= If 1, then overwrite preexisting files "
say bold"REMOVE_PREFIX"normal"= If the URL's path starts with this, then trim the "
say " beginning of the path (remove everything up to the first /) "
say bold" REQHEADERS"normal"= Extra request header(s) sent with each GET "
say bold" ROBOT_CHECK"normal"= If 1, honor the site's /ROBOTS.TXT exclusions "
say bold" SKIP_PREFIX"normal"= If the URL's path starts with this, then skip it "
say bold" VERBOSE"normal"= If 1, verbose mode "
say
iterate
end /* do */
if aa="??" then do
say
say " "reverse"Current values of configuration Parameters: "normal
say bold" DEF_TOFILE"normal"= "def_tofile
say bold" HTMLEXTS"normal"= "htmlexts
say bold" LOGFILE"normal"= "logfile
say bold" NOCGI"normal"= "nocgi
say bold" NOSEARCH"normal"= "nosearch
say bold"NOT_HTMLEXTS"normal"= "not_htmlexts
say bold" OVERWRITE"normal"= "overwrite
say bold"REMOVE_PREFIX"normal"= "remove_Prefix
say bold" REQHEADERS"normal"= "reqheaders
say bold" ROBOT_CHECK"normal"= "robot_check
say bold" SKIP_PREFIX"normal"= "skip_prefix
say bold" VERBOSE"normal"= "verbose
say
say "Note: you can permanently change these values by editing GRABSITE.CMD"
say
iterate
end /* do */
aa=translate(strip(aa))
if aa='X' then leave
if wordpos(aa,params)=0 then do
say "No such parameter: " aa
end /* do */
else do
aaold=value(aa)
bb=getstring("Enter new value for "aa,aaold,bold" --->"normal)
foo=value(aa,bb) /* assign the global by name */
end
end
return 0
/********/
/* get and set includer and includers2 */
get_includers:
/* prompts for, normalizes, and uppercases the two scope tests:
includer = only GET & parse & save URLs under this prefix
includer2 = only GET & save URLs under this prefix
Also sets global len_includer2 (used by url_to_file). Returns 0.
(fix: prompt typo "that being with" -> "that begin with") */
include1: nop
if includer='' then do
includer=getstring(" Only GET & examine & save urls in or under (? for help) ",rootdir,reverse' 3)'normal)
end
if includer="?" then do
call help_includer
includer=''
signal include1
end /* do */
includer=translate(includer)
include2: nop
if includer2='' then do
includer2=getstring(" Only GET & save urls that begin with ",includer,reverse' 3b)'normal)
end
if includer2="?" then do
call help_includer
includer2=''
signal include2
end /* do */
includer2=translate(includer2)
if includer='' then
includer=baseurl||rootdir
else
includer=baseurl||'/'strip(includer,'l','/')
say
call printsay "Only examining URLs in/under: "includer
if includer2='' then
includer2=baseurl||'/'
else
includer2=baseurl||'/'strip(includer2,'l','/')
call printsay "Only retrieving URLs in/under: "includer2
includer=translate(includer)
includer2=translate(includer2)
len_includer2=length(includer2)
say
return 0
/**************************************************/
/* copy a url to a file */
url_to_file:
/* write the GETten contents (global STUFF) to a local file derived from AFIL.
Returns 1 if the file was written (or a pre-existing copy was kept),
0 if it was skipped or could not be written.
Reads globals: stuff includer2 len_includer2 destdir def_tofile overwrite verbose
Updates globals: ndeleted nwritten */
parse arg afil
goob2=translate(afil)
if includer2<>"" then do /*relative to includer2 directory */
/* fix: +1 so we start just PAST includer2. The old substr(goob2,len_includer2)
started ON includer2's last character, which was only correct when
includer2 happened to end with '/'; any leading separator is stripped below */
tofile=substr(goob2,len_includer2+1)
end
else do
parse var afil . '://' . '/' tofile
end /* do */
if tofile='' | right(tofile,1)='/' then tofile=tofile||def_tofile /* directory URL -> default name */
/* save to destdir */
tofile=translate(tofile,'\','/')
tofile=strip(strip(tofile),'l','\')
tofile2=destdir||tofile
todir=filespec('d',tofile2)||filespec('p',tofile2)
mkit=sysmkdir2(todir) /* make sure the target directory tree exists */
yow=stream(tofile2,'c','query exists')
if yow<>'' then do
if overwrite=2 then do /* keep & reuse the existing copy */
if verbose>-1 then call printsay " "||cy_ye||tofile2||normal " old version used."
return 1 /* use old copy */
end
if overwrite=1 then do
if verbose>0 then call printsay " .... deleting "tofile2
foo=sysfiledelete(tofile2)
ndeleted=ndeleted+1
end /* do */
else do
call printsay " > "tofile2 " exists; "bold"skipping "normal
return 0
end /* do */
end /* do */
foo=stream(tofile2,'c','open write')
wow=charout(tofile2,stuff,1)
if wow<>0 then do
call printsay "ERROR: could not write "tofile2
return 0
end /* do */
foo=stream(tofile2,'c','close')
if foo="READY:" then do
if verbose>-2 then call printsay " "||cy_ye||tofile2||normal " written."
end /* do */
nwritten=nwritten+1
return 1 /* sets globals */
/********************/
/* search a file, find IMG SRC= and A HREF= urls. Add BASEURL if
no / or http://.../ at beginning of URL */
/* fix 1: VERBOSE added to the expose list -- it was used at two "if verbose=1"
tests below but never exposed, so those status messages could never appear.
fix 2: iurls is initialized, so the return value is defined even when the
document contains no links (it used to return the uninitialized 'IURLS'). */
urls_in:procedure expose fileurls. flist. remove_prefix bold normal logfile reverse cy_ye verbose
parse arg stuff, baseurl,rootdir,stuffname
iurls=fileurls.0 /* current list size; also the value returned if nothing is found */
/* remove comments */
body=""
do forever /*no comments within comments are allowed */
if stuff="" then leave
parse var stuff t1 '<!-- ' t2 '-->' stuff
body=body||t1
end /* do */
stuff=body
body=''
if verbose=1 then call printsay "Parsing "||length(stuff)||' characters'
/* find all IMG SRC= and A HREF=, FRAME= throw away internal links */
do until stuff=""
parse var stuff . '<' anarg '>' stuff
aref=afindsrc(anarg) /* returns the tag's URL, or '' */
if aref='' then iterate
uaref=translate(aref)
if abbrev(uaref,'MAILTO:')=1 then iterate /* only keep https */
if abbrev(uaref,'FTP:')=1 then iterate
if abbrev(uaref,'GOPHER:')=1 then iterate
/* fix up name to be fully qualified url */
select
when abbrev(translate(aref),'HTTP://')=1 then nop
when abbrev(aref,'/')=1 then aref=baseurl||aref
otherwise aref=baseurl||rootdir||aref /* relative link */
end
/* check for remove_prefix entries */
if remove_prefix<>'' then do
parse var aref a1 '://' a2 '/' aaurl
if abbrev(translate(aaurl),translate(remove_prefix))=1 then do
parse var aaurl . '/' aaurl
aref=a1'://'a2'/'aaurl
if verbose=1 then call printsay " > " remove_prefix "removal yields: "aref
end /* do */
end /* do */
/* record this entry only if not yet recorded -- else, just increment counter */
uaref=translate(aref)
if datatype(flist.uaref)<>'NUM' then flist.uaref=0
flist.uaref=1+flist.uaref
flist.0=flist.0+1
if flist.uaref=1 then do
iurls=fileurls.0+1
fileurls.iurls=aref
fileurls.iurls.!ref=stuffname
fileurls.0=iurls
end
end /* do */
return iurls
/*****************/
/* get a string from user */
/* prompt the user (with highlighted prompt0 prefix and a default value);
returns the typed answer, or DEF if the user just hits ENTER.
(fix: three branches used the raw NORMAL/BOLD globals instead of the
guarded ANORMAL/ABOLD copies, so when ANSI setup had not run the literal
strings "NORMAL"/"BOLD" would be printed -- all branches now use the guards) */
getstring:procedure expose normal bold reverse logfile cy_ye
parse arg prompt,def,prompt0
abold=bold
if bold="BOLD" then abold='' /* guard: variable never initialized */
anormal=normal
if normal='NORMAL' then anormal=''
l1=length(prompt)
l2=length(def)
if l1+l2>38 then do /* too long for one line: split prompt and default */
say prompt0' 'abold||prompt||anormal
if l2>22 then do
say ' (ENTER='abold||def||anormal')'
call charout, abold" ? "anormal
parse pull ans
end /* do */
else do
call charout,' (ENTER='abold||def||anormal')? '
parse pull ans
end
end
else do
call charout,prompt0' 'abold||prompt||anormal' (ENTER='abold||def||anormal')? '
parse pull ans
end
if ans='' then ans=def
return ans
/* ---------------------------------------------*/
/* get a url from some site, return first
maxchar characters (if maxchar missing, get 10million (the whole thing?)
call as: stuff=get_url(aurl,maxchar,verbose,headers)
where:
aurl: the url to GET (required)
the other 3 are optional:
maxchar: max chars to get (default=10,000,000)
verbose: verbose mode (default=OFF)
headers: list of extra request headers, CRLF delimited
*/
/* ---------------------------------------------*/
get_url:procedure expose logfile bold normal reverse cy_ye
/* GET aurl (http:// only) and return the raw response (headers + body),
or '' on any failure. MAXCHAR limits the approximate size read.
HEADERS are optional extra request headers, CRLF-delimited.
(fix: the resolve/connect failure paths returned 0, but every caller
tests for the empty string -- a '0' would have been treated as content;
also the socket is now closed when the connect fails) */
parse arg aurl,maxchar,verbose,headers
if maxchar="" then maxchar=10000000
got=""
if abbrev(translate(aurl),'HTTP://')=0 then do
if verbose>0 then call printsay "Error: URL not properly specified (it must begin with HTTP://)"
return ''
end
parse var aurl . '://' server '/' request
if VERBOSE>1 then call printsay " GETting http url : " server ", " request
/* now get the url. This requires the RxSock.DLL be in your LIBPATH. */
/* Load RxSock */
if \RxFuncQuery("SockLoadFuncs") then nop /* already registered */
else do
call RxFuncAdd "SockLoadFuncs","rxSock","SockLoadFuncs"
call SockLoadFuncs
end
crlf ='0d0a'x /* constants */
family ='AF_INET'
httpport=80
rc=sockgethostbyname(server, "serv.0") /* get dotaddress of server */
if rc=0 then do
call printsay ' Unable to resolve "'server'"'
return '' /* fix: was 0 */
end
dotserver=serv.0addr /* .. */
gosaddr.0family=family /* set up address */
gosaddr.0port =httpport
gosaddr.0addr =dotserver
gosock = SockSocket(family, "SOCK_STREAM", "IPPROTO_TCP")
/* Set up request */
message="GET /"request' HTTP/1.0 'crlf||'Host: 'server||crlf
if length(headers)>2 then do
if right(headers,2)=crlf then headers=left(headers,length(headers)-2)
end
if headers<>'' then message=message||headers||crlf
message=message||crlf /* blank line ends the request */
got=''
rc = SockConnect(gosock,"gosaddr.0")
if rc<0 then do
call printsay ' Unable to connect to "'server'"'
foo = SockClose(gosock) /* fix: do not leak the socket */
return '' /* fix: was 0 */
end
rc = SockSend(gosock, message)
/* Now wait for the response */
do r=1 by 1
rc = SockRecv(gosock, "response", 1000)
got=got||response
if rc<=0 then leave /* connection closed or error */
tmplen=length(got)
if tmplen> maxchar then leave
end r
rc = SockClose(gosock)
return got
/* --- Load the function library, if necessary --- */
loaddll:
/* register the RxSock and RexxUtil function libraries if needed, then
probe for ANSI support and set the global color-code strings.
Note: RxFuncQuery returns 1 when the function is NOT yet registered. */
if RxFuncQuery("SockLoadFuncs")=1 then do /* not yet registered -- register now */
call RxFuncAdd "SockLoadFuncs","rxSock","SockLoadFuncs"
call SockLoadFuncs
end
foo=rxfuncquery('sysloadfuncs')
if foo=1 then do
call RxFuncAdd 'SysLoadFuncs', 'RexxUtil', 'SysLoadFuncs'
call SysLoadFuncs
end
/****
foo=rxfuncquery('rexxlibregister')
if foo=1 then do
call rxfuncadd 'rexxlibregister','rexxlib', 'rexxlibregister'
call rexxlibregister
end
foo=rxfuncquery('rexxlibregister')
if foo=1 then do
say " Could not find REXXLIB "
exit
end
***/
ansion=checkansi()
if ansion=1 then do
/* ANSI escape sequences used for screen highlighting */
aesc='1B'x
cy_ye=aesc||'[37;46;m'
normal=aesc||'[0;m'
bold=aesc||'[1;m'
re_wh=aesc||'[31;47;m'
reverse=aesc||'[7;m'
end
else do
say " Warning: Could not detect ANSI.... output will look ugly ! "
cy_ye="" ; normal="" ; bold="" ;re_wh="" ;
reverse=""
end /* Do */
return 1
/* -------------------- */
/* get a yes or no , return 1 if yes */
yesno:procedure expose normal reverse bold logfile cy_ye
/* ask FOOA as a yes/no question; returns 1=yes, 0=no, 2=All (if allopt=1).
ALTANS may supply two alternative answer words (e.g. 'Stop Go').
Note: PULL uppercases the reply, so answers match the uppercase letters. */
parse arg fooa , allopt,altans
if altans<>" " & words(altans)>1 then do
w1=strip(word(altans,1))
w2=strip(word(altans,2))
a1=left(w1,1) ; a2=left(w2,1) /* first letter = hot key */
a1a=substr(w1,2) ; a2a=substr(w2,2)
end
else do
a1='Y' ; a1a='es'
a2='N' ; a2a='o'
end /* Do */
ayn=' '||bold||a1||normal||a1a||'\'||bold||a2||normal||a2a
if allopt=1 then ayn=ayn||'\'||bold||'A'||normal||'ll'
do forever /* re-ask until a recognized answer is given */
foo1=normal||reverse||fooa||normal||ayn
call charout, foo1 normal ':'
pull anans
if abbrev(anans,a1)=1 then return 1
if abbrev(anans,a2)=1 then return 0
if allopt=1 & abbrev(anans,'A')=1 then return 2
end
nocon:
/* NOTE(review): looks like an error-trap entry point for a "no connection"
condition (rc=-7) -- no SIGNAL to it is visible in this file; confirm */
if rc=-7 then return 0
exit 0
/* ------------------------------------------------------------------ */
/* function: Check if ANSI is activated */
CheckAnsi: PROCEDURE
/* returns 1 if ANSI support is reported active, 0 if not,
-1 if the check itself failed (error handler fired).
Runs the OS/2 ANSI command and scans its localized output via rxqueue. */
thisRC = -1
trace off
/* install a local error handler */
SIGNAL ON ERROR Name InitAnsiEnd
"@ANSI 2>NUL | rxqueue 2>NUL"
thisRC = 0
do while queued() <> 0
queueLine = lineIN( "QUEUE:" )
if pos( " on.", queueLine ) <> 0 | , /* USA */
pos( " (ON).", queueLine ) <> 0 then /* GER */
thisRC = 1
end /* do while queued() <> 0 */
InitAnsiEnd:
signal off error
RETURN thisRC
/*************************/
/* return 1 if adir is an existing (possibly empty) directory , 0 if not */
dosisdir2:procedure
/* return 1 if ADIR is an existing (possibly empty) directory, 0 if not.
Relative paths are qualified against the current drive/directory first. */
parse arg adir
adir=strip(adir)
adir=strip(adir,'t','\')
nowdir=directory()
nowdrive=filespec('d',nowdir'\')
nowpath=filespec('p',nowdir'\')
adr=filespec('d',adir)
if adr='' then do /* no drive letter given: qualify the path */
if abbrev(adir,'\')=0 then
adir=nowdrive||nowpath||adir
else
adir=nowdrive||adir
end /* do */
foo=sysfiletree(adir,goo,'D') /* NOTE(review): goo is uninitialized, so the stem name used is 'GOO' */
if goo.0>0 then return 1
return 0
/*************************************/
/* parse GETten stuff into globals:
response_line = the response line
response_code = the 200, 401, etc. code
headers. = list of response headers
stuff = the contents (the file)
*/
extracts:
/* split global STUFF (a raw HTTP response) into: response_line,
response_code (2nd word of the response line), headers.<!NAME>, and
STUFF itself (reduced to the body that follows the blank line).
Note: headers.0 accumulates the '!NAME' list -- it is NOT a count. */
cr='0a'x
parse var stuff response_line (cr) stuff
parse var response_line . response_code .
response_line=strip(response_line,,'0d'x)
headers.0=''
do forever
parse var stuff ahead (cr) stuff
ahead=strip(ahead,,'0d'x)
if ahead='' then leave /* blank line = end of headers */
parse var ahead name ':' aval
nn=translate('!'||name) /* '!' prefix avoids stem-name collisions */
headers.0=headers.0' 'nn
headers.nn=aval
end /* do */
/* remove html comments */
return 1
/* ------------- */
/* create a directory, arbitrarily deep.
Returns 0 if succes, otherwise returns an error code
adir: directory to create -- must be fully qualified.
verbose: if 1, will write some status stuff to screen
*/
sysmkdir2:procedure
/* create ADIR (arbitrarily deep); returns 0 on success, else an error code.
ADIR must be fully qualified. verbose=1 writes status to the screen. */
parse arg adir,verbose
adir=strip(adir,'t','\')
if dosisdir2(adir)=1 then do /* already exists */
if verbose=1 then say " Using pre-existing directory: "adir
return 0
end /* do */
ff=sysmkdir(adir) /* try the simple one-level create first */
if ff=0 then return ff
/* make the tree */
f2=adir'\'
dd=filespec('d',f2)
pp=filespec('p',f2)
if pp='\' | pp='' then return -1
pp2=strip(translate(pp,' ','\')) /* path components as words */
do mm=1 to words(pp2)
a1=subword(pp2,1,mm) /* build one level deeper each pass */
a1=translate(a1,'\',' ')
dd2=dd'\'a1
hoo=sysmkdir(dd2)
if hoo=0 & verbose=1 then call printsay ' ... creating: 'dd2
end /* do */
return hoo /* result of the deepest create */
/****************/
/* URL and DESTDIR help info */
helpme1:
/* show the URL / destination-directory help screen; waits for a key.
(fix: the example directory read "games10" in step b but "game10"
where it was defined -- made consistent) */
say
say bold"GrabSite"normal" is designed to copy a WWW site to your local hard disk. "
say
say "It's easy to use: just specify a URL, and then specify a directory"
say "on your hard drive to copy the web pages (and other files) retrieved"
say "from this WWW site."
say
say "For example: suppose the 'home page' is"
say " http://www.coolstuff.org/games/expert.htm"
say "and the 'destination directory' is:"
say " d:\localweb\game10 "
say "Then..."
say " a) GrabSite will GET (using socket calls) the /games/expert.htm HTML "
say " document at www.coolstuff.org."
say " b) A copy of /games/expert.htm will be written to d:\localweb\game10 "
say " c) /games/expert.htm will be scanned for links "
say " d) For each link found, repeat step a (changing names appropriately)"
say
say "Note: For hints on running from command line, run GrabSite with a ? argument."
say" Example: D:>GrabSite ? "
say
call charout,reverse"Hit any key to continue "normal
foo=sysgetkey('noecho')
say
return 1
/****************/
/* INCLUDER help info */
help_Includer:
/* show the scope-limiting (includer/includer2) help screen; waits for a key.
(fix: typo "thar" -> "that") */
say
say "You can, and should, limit the scope of "bold"GrabSite"normal"'s WWW downloads"
say "(If you don't, you could end up downloading a significant chunk of the WWW!)"
say
say "There are two tests used to limit scope: "
say
say " a) Limiting what URLS are "bold"downloaded"normal" and "bold"examined"normal"."
say " URLS that pass this test are retrieved (and saved to disk). "
say " If they are text/html documents they will also be 'parsed' --"
say " the links found in these text/html documents may also be retrieved."
say
say " b) Limiting what URLS are "bold"downloaded"normal", but "bold"not"normal" examined."
say " URLS that pass this less stringent test are downloaded (and saved to disk)."
say " They are "bold"not"normal" parsed -- links they may contain are ignored."
say
say " By using two tests, one can:"
say " i) 'Recursively GET' URLS that are in (or under) the directory "
say " of the 'home page' you selected. "
say " ii) Download & save (but not examine) files pointed to by these pages. "
say " For example, .GIF files stored on a different part of the site."
say
call charout,reverse"Hit any key to continue "normal
foo=sysgetkey('noecho')
say
return 1
/****************/
/* writeall help info */
help_writeall:
/* show the "HTML documents only" vs "retrieve all" help screen; waits for a key.
Reads globals htmlexts / not_htmlexts to show the rule actually in force. */
say
say "You can either: "
say " a) Download all documents, images, etc. from the site (more precisely,"
say " documents, etc. that satisfy the 'scope tests')"
say " b) Only download HTML documents "
say
say "The latter option is useful if you want a quick snapshot of the navigable"
say "portion of the site -- if you do not care about images, text files, and "
say "other such 'non-html' contents."
say
say "If you select this latter option, the following rule is used: "
if htmlexts<>'' then
say "Only retrieve links ending with: "htmlexts
else
say "Retrieve links that do NOT end with: "not_htmlexts
say
say cy_ye" Note: Configuration hint:"normal
say " You can modify this rule by changing the HTMLEXTS and NOT_HTMLEXTS parameters"
say
call charout,reverse"Hit any key to continue "normal
foo=sysgetkey('noecho')
say
return 1
/***************/
/* cut length of string to nn characters, if necessary */
cutstrg:procedure
/* shorten ASTR to at most ILEN characters, keeping the head and tail
with '...' in between; returns ASTR unchanged if it already fits.
(fixes: strings of exactly ILEN chars were needlessly mangled ('<' -> '<=');
the tail width was hard-coded for ILEN=50 -- now derived from ILEN,
with identical output for the existing ILEN=50 callers) */
parse arg astr,ilen
if ilen='' then return astr
if length(astr)<=ilen then return astr
if ilen<=17 then return left(astr,ilen) /* too short for head+'...'+tail */
aa=left(astr,14)'...'||right(astr,ilen-17)
return aa
/***************/
/* say, and possible lineout, output */
printsay:procedure expose logfile bold normal reverse cy_ye
/* display AVAL on screen; if logging is enabled, also append it to the
log file with all ANSI color-code strings stripped out. Returns 0. */
parse arg aval
say aval
plain=removestrg(removestrg(removestrg(removestrg(aval,bold),normal),reverse),cy_ye)
if logfile<>0 then call lineout logfile,plain
return 0
/***********************************/
/* examine the contents of ONE html tag (ANARG = text between < and >)
and return the single URL it references, or '' if none.
Handles BODY background=, IMG/FRAME/EMBED src=, A/AREA/LINK href=,
and APPLET code=/codebase=. Internal #jumps and javascript: are dropped. */
afindsrc:procedure
parse arg anarg
parse var anarg htype stuff
htype=translate(strip(htype)) /* element name, uppercased */
/* find all FRAME SRC=, IMG SRC= and A HREF=, throw away internal links */
chklist='BODY IMG A FRAME AREA EMBED LINK APPLET '
anctype=wordpos(htype,chklist)
if anctype=0 then return '' /* not a url containing element */
/* depending on anctype, look for the appropriate attribute */
select
when anctype=1 then do /* body background */
do forever
if anarg='' then return '' /* nothing found */
parse var anarg a1 anarg ; a1=strip(a1)
if abbrev(translate(a1),'BACKGROUND=')=0 then iterate
parse var a1 . '=' gotimg . ; gotimg=strip(strip(gotimg),,'"')
return gotimg
end /* do */
end /* i3>0 */
when anctype=2 then do /* img */
do forever
if anarg='' then return ''
parse var anarg a1 anarg ; a1=strip(a1)
if abbrev(translate(a1),'SRC=')=0 then iterate
parse var a1 . '=' gotimg . ; gotimg=strip(strip(gotimg),,'"')
return gotimg
end /* do */
end
when anctype=3 | anctype=5 | anctype=7 then do /* A AREA LINK */
do forever
if anarg='' then leave
parse var anarg a1 anarg ; a1=strip(a1)
if abbrev(translate(a1),'HREF=')=0 then iterate
parse var a1 . '=' gothref . ; gothref=strip(strip(gothref),,'"')
parse var gothref gothref '#' . /* toss out internal jumps */
if gothref="" then return ""
if abbrev(translate(gothref),'JAVASCRIPT:') then return "" /* don't do "javascript:" entries */
return gothref
end /* do */
end
when anctype=4 | anctype=6 then do /* FRAME EMBED */
do forever
if anarg='' then leave
parse var anarg a1 anarg ; a1=strip(a1)
if abbrev(translate(a1),'SRC=')=0 then iterate
parse var a1 . '=' gothref . ; gothref=strip(strip(gothref),,'"')
parse var gothref gothref '#' . /* toss out internal jumps */
if gothref="" then return ""
return gothref
end /* do */
end
when anctype=8 then do /* APPLET: combine CODEBASE= and CODE= */
abase=''; aref=''
do forever
if anarg='' then leave
parse var anarg a1 anarg ; a1=strip(a1)
if abbrev(translate(a1),'CODE=') + ,
abbrev(translate(a1),'CODEBASE=')=0 then iterate /* neither attribute: skip */
if abbrev(translate(a1),'CODEBASE=')=1 then do
parse var a1 '"' abase '"' .
end /* do */
else do /* CODE */
parse var a1 '"' aref '"'
end /* do */
if aref<>'' & abase<>'' then leave
end
if aref='' then return '' /* no CODE= found */
if abase<>'' then aref=strip(abase,'t','/')||'/'||strip(aref,'l','/')
return aref
end
otherwise return ''
end /* select */
return ''
/***********/
/* remove substring */
removestrg:procedure
/* return AVAL with every occurrence of the substring ASTR deleted.
If ASTR does not occur (or is empty), AVAL is returned unchanged. */
parse arg aval,astr
if pos(astr,aval)=0 then return aval
kept=''
do while aval<>''
ix=pos(astr,aval)
if ix=0 then leave
kept=kept||left(aval,ix-1) /* text before this occurrence */
aval=substr(aval,ix+length(astr)) /* skip past the occurrence */
end
return kept||aval
/**************/
/******************************/
/* parse a robots.txt file,
The algorithm:
1 ignore # lines (comments)
2a look for user-agent: grabsite lines
2b if none, look for user-agent:* lines
3 if neither 2a nor 2b matches, then no robot disallows exist
4 otherwise, look for disallow lines, starting from
the user-agent line, until the first empty line (use 0a as line delimiter,
and throw away the 0d)
5 add the asel from each disallow: line to the exclusion list
---------------
# samples robots.txt -- will add cgi-* to exclusion_list
user-agent: mozilla
Disallow: /samples
Disallow: /stuff/
#user-agent: checklink
user-agent:gizmo
disallow:fes/
user-agent:*
disallow:cgi-
---------------
*/
add_robot:procedure expose verbose
/* parse a robots.txt HTTP response (ABODY) and return a space-delimited
list of the Disallow: path prefixes that apply to this robot (uppercased,
leading / stripped). Returns '' for a non-200 response / empty body /
no entries, and ' ' when entries exist but none match this user-agent.
(fix: the user-agent match was 'CHECKLINK' -- a leftover from another
tool; this robot sends "User-agent: GrabSite", so it never matched
its own record and always fell through to the '*' record) */
parse arg abody
parse var abody . icode .
if left(strip(icode),1)<>2 then return '' /* not 200 code, so no disallows */
cr='0a'x
do forever /* get rid of response header */
if abody='' then return '' /* nothing in body */
parse var abody al1 (cr) abody
al1=strip(al1,,'0d'x)
if al1='' then leave /* found empty line*/
end
nn=0
do forever /* collect body lines, stripping 0d and trailing # comments */
if abody='' then leave
parse var abody al1 (cr) abody
al1=strip(al1,,'0d'x)
if al1='#' then iterate
parse var al1 al1a '#' .
nn=nn+1
lins.nn=al1a
end
if nn=0 then return '' /* no entries, return */
lins.0=nn
/* look for GRABSITE, or *, user-agent */
iat=0
do mm=1 to lins.0
al=strip(lins.mm)
if abbrev(translate(al),'USER-AGENT')=0 then iterate
parse var al . ':' dagent ; dagent=translate(strip(dagent))
if abbrev(dagent,'GRABSITE')=1 then do /* our own record wins outright */
iat=mm
leave
end
if dagent='*' then do /* remember the catch-all record, keep scanning */
iat=mm
end /* do */
end /* do */
exlist2=''
if iat=0 then return ' ' /* no matching user-agent */
do mm=iat+1 to lins.0
al=translate(strip(lins.mm))
if al='' then leave /* blank line signals end of "record" */
if abbrev(al,'DISALLOW')<>1 then iterate
parse var al . ':' dasel ; dasel=strip(dasel)
exlist2=exlist2||' '||strip(dasel,'l','/')
end /* do */
return exlist2
/*******************/
/* compare arg against "robot" exclist. -- return 1 if a match
(fix: this comment's opening delimiter was '/ ' instead of '/*',
leaving a malformed clause with a stray comment terminator) */
robot_No:procedure expose exclist.
/* ASEL is a url path (leading / ignored); returns 1 if it falls under any
robots.txt exclusion recorded in exclist. (entries are uppercased). */
parse upper arg asel
asel=strip(asel,'l','/')
do mm=1 to exclist.0
tt=exclist.mm
if abbrev(asel,tt)=1 then return 1
end /* do */
return 0