home *** CD-ROM | disk | FTP | other *** search
/ AmigActive 6 / AACD06.ISO / AACD / Online / GetAllHTML / GetAllHTML.rexx < prev   
OS/2 REXX Batch file  |  2000-01-30  |  23KB  |  605 lines

  1. /* GetAllHTML "URL"/A,"DestDir"/A,NOASK/S,ARC/S,PIC/S,RESUME/S,PAUSE/S,TERSE/S,DEPTH=/N/K,NOBASEINDEX/S,PORT=/K,BASEURL=/K,BROKENLINKS/S
  2.   v0.66ß (30-01-00)  Copyright 1998-2000 Chris S Handley
  3.   (email: Chris.S.Handley@BTInternet.com)
  4.  
  5.    This is going to be converted to a super-fast AmigaE version, so I am not
  6.   working on this (much) anymore.  If you alter & distribute this, please
  7.   mention me as the original author!  However, I would prefer you send me any
  8.   suggestions to be used in the E version...
  9.  
  10.   Do not hold your breath for the E version as I have so little spare time.
  11.  
  12.    See GetAllHTML.doc for more details
  13. */
  14.  
  OPTIONS RESULTS
  Call Addlib('rexxsupport.library',0,-30,0)

  Say 'GetAllHTML v0.66ß  Copyright 1998-2000 Chris Handley (read program file for details)'

  /* ---- set-up ---- */
  HTTPResume='Programs:Utils/Comms/HTTPResume'    /* downloader program that is launched when no PORT is given */
  ExtDir='T:'                                     /* exposed to DownloadList, which saves external links here */

  /* Pick a name for the file HTTPResume is told (RXPORTFILE=) to write.
     The generator is seeded once only: re-seeding with the same second on
     every pass would keep returning the same number until the clock ticks. */
  TempFile='T:GetAllHTML'
  TempFileAdd=Random(1,999,Time('S'))
  DO WHILE Exists(TempFile||TempFileAdd)
      TempFileAdd=Random(1,999)
  END
  TempFile=TempFile||TempFileAdd

  /* ---- deal with args ---- */
  /* URL and DestDir are the two "quoted" strings; whatever follows the second one is the switch list */
  Parse VALUE Arg(1) WITH '"' MainURL '"' . '"' DestDir '"' RawSwitches
  IF (MainURL='')|(DestDir='') THEN DO
      Say 'ERROR:  Empty argument(s)!'
      Say 'Usage:  GetAllHTML "URL"/A,"DestDir"/A,NOASK/S,ARC/S,PIC/S,RESUME/S,PAUSE/S,TERSE/S,DEPTH=/N/K,NOBASEINDEX/S,PORT=/K,BASEURL=/K,BROKENLINKS/S'
      Say 'Note - both URL & DestDir *must* be enclosed in "double quotes".'
      Say '     - after DEPTH should be a "=" followed by a number with NO spaces between them.'
      Say '     - after PORT should be a "=" followed by a string with NO spaces between them.'
      Say '     - after BASEURL should be a "=" followed by a string with NO spaces between them.'
      Exit 20
  END
  IF (Right(DestDir,1)~='/')&(Right(DestDir,1)~=':') THEN DestDir=DestDir||'/'
  CALL MakeDir(DestDir)
  IF Left(MainURL,7)~='http://' THEN MainURL='http://'||MainURL

  /* defaults (SwDepth=30 is a limit on the LENGTH of the list name, see DEPTH below) */
  SwNoAsk=0; SwArc=0; SwPic=0; SwResume=0; SwDepth=30; SwNoPause=1; SwPort=0; SwTerse=0; BaseURLDir=''; SwBroken=0; SwNoBaseIndex=0
  Port=''

  /* Walk the switch list word by word, so any number of switches in any
     order is accepted (a fixed set of ten variables loses the eleventh).
     Unrecognised words are ignored. */
  UpSwitches=Upper(RawSwitches)
  DO SwNum=1 TO Words(UpSwitches)
      Sw=Word(UpSwitches,SwNum)
      SELECT
          WHEN Sw='NOASK'       THEN SwNoAsk=1
          WHEN Sw='ARC'         THEN SwArc=1
          WHEN Sw='PIC'         THEN SwPic=1
          WHEN Sw='RESUME'      THEN SwResume=1
          WHEN Sw='PAUSE'       THEN SwNoPause=0
          WHEN Sw='TERSE'       THEN SwTerse=1
          WHEN Sw='BROKENLINKS' THEN SwBroken=1
          WHEN Sw='NOBASEINDEX' THEN SwNoBaseIndex=1

          WHEN Left(Sw,5)='DEPTH' THEN DO
              Parse VAR Sw 'DEPTH=' Depth
              IF ~Datatype(Depth,'W') THEN DO
                  Say 'No DEPTH number found (must use "DEPTH=x" where x is your number).'
                  Say 'Search pages up to a depth of: '
                  Pull Depth
              END
              /* still not a number: stop here instead of failing in the arithmetic below */
              IF ~Datatype(Depth,'W') THEN DO
                  Say 'ERROR:  DEPTH must be a whole number (eg."DEPTH=3")!'
                  Exit 20
              END

              IF Depth>42 THEN Depth=42    /* sanity protect against ARexx limitation */
              IF Depth<10 THEN
                  SwDepth=Depth*2    /* since each grows by 2 each depth (e.g.".2.3.4.5") */
              ELSE
                  SwDepth=((Depth-9)*3)+(9*2) /* as above but above 9 grows by 3 (e.g.".12.13.14.15") */
              SwDepth=SwDepth+5-2    /* 5 = length of "Root." */
          END

          WHEN Left(Sw,4)='PORT' THEN DO
              SwPort=1
              Parse VAR Sw 'PORT=' Port    /* taken from the upper-cased word */
              IF Port='' THEN DO
                  /* no name given: only acceptable when we were started from HTTPResume itself */
                  Port=Address()
                  IF Left(Port,11)~='HTTPRESUME.' THEN DO
                      Say 'ERROR:  PORT argument was not followed by a = and a string with no spaces between (eg."PORT=HTTPResume.1"), and the host enviroment was not already HTTPResume!'
                      Exit 20
                  END
              END
          END

          WHEN Left(Sw,7)='BASEURL' THEN DO
              /* the value is cut out of the original word so that its case survives */
              ValPos=Pos('BASEURL=',Sw)
              IF ValPos>0 THEN
                  BaseURLDir=SubStr(Word(RawSwitches,SwNum),ValPos+8)
              ELSE
                  BaseURLDir=''
              IF BaseURLDir='' THEN DO
                  Say 'ERROR:  BASEURL argument was not followed by a = and a string with no spaces between (eg."BASEURL=www.amiga.com")!'
                  Exit 20
              END
              /* links are compared against this prefix in full "http://..." form,
                 so give it the same treatment as MainURL */
              IF Left(BaseURLDir,7)~='http://' THEN BaseURLDir='http://'||BaseURLDir
              IF Right(BaseURLDir,1)~='/' THEN BaseURLDir=BaseURLDir||'/'
          END

          OTHERWISE NOP
      END
  END

  /* no BASEURL given: use the directory part of the start URL */
  IF BaseURLDir='' THEN DO
      Parse VALUE Reverse(MainURL) WITH . '/' BaseURLDir
      IF Length(BaseURLDir)<8 THEN BaseURLDir = Reverse(MainURL)     /*check for cases like FileURL="http://www.kosh.net" - i.e.no end slash*/
      BaseURLDir=Reverse(BaseURLDir)||'/'
  END

  If SwResume=1 THEN Say 'NOTE:  Resume mode activated!'
  If SwBroken=1 THEN Say 'NOTE:  Broken-link detection mode activated!'

  IF Port='' THEN DO
       /* run HTTPResume & set-up related stuff; OVERWRITE cause problems (restart from scratch if fails in the middle) */
      Address Command 'Run >Nil: '||HTTPResume||' GUI NODATECHECK AUTORESUME STARTICONIFIED QUICKQUIT NOERRREQ RXPORTFILE='||TempFile /*NOENV removed*/
      Say 'Waiting for HTTPResume...'
      /* Delay() must be CALLed: as a bare clause its return value would be sent to the host as a command */
      DO UNTIL Exists(TempFile)
          Call Delay(25)
      END
      Call Delay(100)
      IF ~Open(.port, TempFile, 'READ') THEN DO
          Say 'ERROR:  Could not open "'||TempFile||'"!'
          Exit 20
      END
      Port=ReadLn(.port)
      Call Close(.port)
      Call Delete(TempFile)
      IF Port='***' THEN DO
          Say 'ERROR:  HTTPResume could not open it''s ARexx port!'
          Exit 20
      END
      Address(Port)
   END
  ELSE DO
      Address(Port)
      /* commands are quoted so that no variable of the same name can change them */
  /*    'SET OVERWRITE' */
      'SET NODATECHECK'
      'SET AUTORESUME'
      'SET QUICKQUIT'
      'SET NOERRREQ'
  END

  /* ---- init set-up: the list of pages to visit starts with just the main URL ---- */
  Root.0=1
  Root.1=MainURL
  Root.1.HTML=1

  ModemOnLine=0
  LastSuffix=''    /*record of suffix of last user confirmed file download - so semi-intelligent! */

  /* ---- get all pages recursively ---- */
  Say 'Downloading & scanning pages...'
  CALL DownloadList('Root.',DestDir,BaseURLDir,SwNoAsk,SwArc,SwPic,SwResume,SwDepth,SwNoPause,SwTerse,SwBroken,SwNoBaseIndex)

  Say 'Finished.'
  IF SwPort=0 THEN 'QUIT'    /* only shut the downloader down when no PORT switch was given */
  Exit
  147.  
  DownloadList: PROCEDURE EXPOSE Root. Resume. ModemOnLine LastSuffix ExtDir
      /* DownloadList(URLList,DestDir,BaseURLDir,SwNoAsk,SwArc,SwPic,SwResume,SwDepth,SwNoPause,SwTerse,SwBroken,SwNoBaseIndex)

         Works through the list of URLs held under the stem named by URLList
         (a string such as 'Root.' or 'Root.1.'; entry 0 holds the count).
         Each entry is saved below DestDir - or in ExtDir when it is flagged
         as an external link - and, when flagged as HTML, is scanned with
         GetURLs and its own list handled by a recursive call.
         Nothing is done once the NAME of the list is longer than SwDepth.
         Resume. records files already dealt with; in RESUME mode files that
         already exist are re-scanned instead of being fetched again.
      */
      Parse ARG URLList, DestDir, BaseURLDir, SwNoAsk, SwArc, SwPic, SwResume, SwDepth, SwNoPause, SwTerse, SwBroken, SwNoBaseIndex

      INTERPRET 'ListSize='||URLList||'0'

      /* too deep, or nothing in the list */
      IF Length(URLList)>SwDepth THEN Return
      IF ~(ListSize>0) THEN Return

      DO Num=1 TO ListSize
          /* fetch this entry's URL and its two flags */
          Entry=URLList||Num
          INTERPRET 'URL='||Entry
          INTERPRET 'HTMLfile='||Entry||'.HTML'
          INTERPRET 'ExternalLink='||Entry||'.EXT'
          IF ExternalLink~=1 THEN ExternalLink=0    /* flag may never have been set */

          /* path of the file relative to the base URL; external links keep only their file name */
          IF ExternalLink=0 THEN
              Parse VAR URL (BaseURLDir) PathFile
          ELSE DO
              Parse VALUE Reverse(URL) WITH PathFile '/' .
              PathFile=Reverse(PathFile)
          END

          /* pages without a file name get one, and are always scanned */
          GuessedURL=(Right(PathFile,1)='/')|(PathFile='')
          IF GuessedURL THEN DO
              PathFile=PathFile||'InDeX.hTmL'
              HTMLfile=1
          END

          /* directory part = everything before the last slash */
          Parse VALUE Reverse(PathFile) WITH . '/' Path
          Path=Reverse(Path)

          /* make each directory of that path in turn below DestDir */
          CurPath=DestDir
          PathLeft=Path
          DO WHILE PathLeft~=''
              Parse VAR PathLeft NewDir '/' PathLeft
              IF NewDir~=='' THEN DO
                  CALL MakeDir(CurPath||NewDir)
                  CurPath=CurPath||NewDir||'/'
              END
              ELSE IF SwTerse=0 THEN DO
                  Msg='WARNING:  Empty dir name in URL "'||URL||'"'
                  IF SwNoPause=0 THEN DO
                      Say Msg||' (press <return>)'
                      Pull Input
                  END
                  ELSE
                      Say Msg
              END
          END

          IF ExternalLink=0 THEN
              DownloadFile=DestDir||PathFile
          ELSE
              DownloadFile=ExtDir||PathFile

          /* ---- RESUME mode: walk over what is already on disk ---- */
          IF SwResume~=0 THEN DO
              Key=DownloadFile
              IF Resume.Key~=1 THEN DO    /* if visited this page before then pass! */
                  SELECT
                      WHEN Exists(DownloadFile) THEN DO
                          Resume.Key=1
                          IF HTMLfile=1 THEN DO
                              /* re-read the saved page and follow its links */
                              CALL GetURLs(Entry||'.',DownloadFile,BaseURLDir,URL,SwNoAsk,SwArc,SwPic,SwNoPause,SwTerse,SwBroken,SwNoBaseIndex)
                              CALL DownloadList(Entry||'.',DestDir,BaseURLDir,SwNoAsk,SwArc,SwPic,SwResume,SwDepth,SwNoPause,SwTerse,SwBroken,SwNoBaseIndex)
                          END
                      END
                      WHEN ExternalLink=0 THEN DO
                          /* first missing file: reached point did last time, now continue as before */
                          SwResume=0
                          IF ModemOnLine=0 THEN DO    /* hack to ensure only halt for input once (so can leave alone) */
                              Say 'NOTE:  Reached point where left off! (press <return>)'
                              Pull Input
                          END
                          ModemOnLine=1
                      END
                      OTHERWISE NOP
                  END
              END
          END

          /* ---- normal mode (external links are handled here in either mode) ---- */
          IF (SwResume=0)|(ExternalLink=1) THEN DO
              IF ~Exists(DownloadFile) THEN DO    /* if visited this page before then pass! */
                  IF ExternalLink=0 THEN DO
                      CALL GetHTML(URL,DownloadFile)

                      IF Exists(DownloadFile) THEN DO
                          /* it arrived - scan it and follow its links if asked to */
                          IF HTMLfile=1 THEN DO
                              CALL GetURLs(Entry||'.',DownloadFile,BaseURLDir,URL,SwNoAsk,SwArc,SwPic,SwNoPause,SwTerse,SwBroken,SwNoBaseIndex)
                              CALL DownloadList(Entry||'.',DestDir,BaseURLDir,SwNoAsk,SwArc,SwPic,SwResume,SwDepth,SwNoPause,SwTerse,SwBroken,SwNoBaseIndex)
                          END
                      END
                      ELSE DO
                          /* say so, unless TERSE or the file name was only a guess */
                          IF (SwTerse=0)&(GuessedURL~=1) THEN DO
                              Msg='WARNING:  Couldn''t download file "'||DownloadFile||'"'
                              IF SwNoPause=0 THEN DO
                                  Say Msg||' (press <return>)'
                                  Pull Input
                              END
                              ELSE
                                  Say Msg
                          END

                          /* if not downloaded then place empty 'fake' file to stop RESUME stopping to early */
                          Call Open(.file, DownloadFile, 'WRITE')
                          Call Close(.file)
                      END
                  END
                  ELSE DO
                      /* external link: fetch once only (hijacked how RESUME checks), report if missing, keep nothing */
                      Key=DownloadFile
                      IF Resume.Key~=1 THEN DO
                          Resume.Key=1

                          /* one attempt plus up to two re-tries, incase 'freak' connect failure */
                          CALL GetHTML(URL,DownloadFile)
                          DO Retry=1 TO 2 WHILE ~Exists(DownloadFile)
                              CALL Delay(50)
                              CALL GetHTML(URL,DownloadFile)
                          END

                          IF Exists(DownloadFile) THEN
                              CALL Delete(DownloadFile)
                          ELSE DO
                              /* list name minus its last dot names the entry whose list this is */
                              Parse VALUE Reverse(URLList) WITH . '.' BrokePage
                              INTERPRET 'BrokePage='||Reverse(BrokePage)
                              Say 'Found BROKEN LINK to "'||URL||'" in "'||BrokePage||'"'
                          END
                      END
                  END
              END
          END
      END
  Return
  330.  
  GetURLs: PROCEDURE EXPOSE Root. LastSuffix
      /* GetURLs(URLList,DownloadFile,BaseURLDir,FileURL,SwNoAsk,SwArc,SwPic,SwNoPause,SwTerse,SwBroken,SwNoBaseIndex)

         Reads the saved page DownloadFile (which came from FileURL) line by
         line, picks out every HREF="..." and SRC="..." value and stores the
         links that are to be fetched under the stem named by URLList
         (a string such as 'Root.1.'):
             <URLList>0       number of links stored
             <URLList>n       full URL of link n
             <URLList>n.HTML  1 when the file should itself be scanned
             <URLList>n.EXT   1 when the link lies outside BaseURLDir
         Links outside BaseURLDir are stored only when SwBroken=1.
         SwNoAsk/SwArc/SwPic decide about files that are not HTML; the user
         is asked when they do not, and LastSuffix remembers the suffix of
         the last download the user agreed to.
         Only values enclosed in double quotes are recognised.
      */
       /* get args */
      URLList=Arg(1)
      DownloadFile=Arg(2)
      BaseURLDir=Arg(3)
      FileURL=Arg(4)
      SwNoAsk=Arg(5)
      SwArc=Arg(6)
      SwPic=Arg(7)
      SwNoPause=Arg(8)
      SwTerse=Arg(9)
      SwBroken=Arg(10)
      SwNoBaseIndex=Arg(11)

      INTERPRET URLList||'0 = 0'    /* start with an empty list */

       /* directory of the page itself - relative links are expanded against it */
      Parse VALUE Reverse(FileURL) WITH . '/' LocalURLDir
      IF Length(LocalURLDir)<8 THEN LocalURLDir = Reverse(FileURL)    /*check for cases like FileURL="http://www.kosh.net" - i.e.no end slash*/
      LocalURLDir=Reverse(LocalURLDir)||'/'

       /* parse (possibly) downloaded HTML file for URLs */
      IF Open(.file, DownloadFile, 'READ') THEN DO
          DO WHILE ~EOF(.file)
              Line=ReadLn(.file)
              ULine=Upper(Line)

              /* Mode 0 walks through every HREF= on the line, Mode 1 then
                 through every SRC=; NewPos=-1 ends the line */
              NewPos=0; Mode=0
              DO UNTIL NewPos<0
                   /* non-frame search */
                  IF Mode=0 THEN DO
                      NewPos=Pos('HREF=',ULine,NewPos+1)    /*finds "<AREA HREF" "<A HREF" "<A/nHREF" ...*/
                      IF NewPos=0 THEN DO
                          Mode=1
                          NewPos=0
                      END
                  END
                   /* frame/image search */
                  IF Mode=1 THEN DO
                       /* "SRC=" occurs for both in frames & images */
                      NewPos=Pos('SRC=',ULine,NewPos+1)
                      IF NewPos=0 THEN NewPos=-1
                  END

                   /* expand URL to full path, remove non-file parts & store only if inside parameters */
                  IF NewPos>0 THEN DO
                      Parse VAR Line =NewPos '="' URL '"'
                      IF URL='' THEN Parse VAR Line =NewPos '=\"' URL '"'    /*javascripts precede "s by a slash*/
                      IF URL~=='' THEN DO
                          Parse UPPER VAR URL URLDev ':' URLRest
                          Download=1
                          IF (URLRest~=='')&(URLDev~='HTTP') THEN DO
                               /* found e.g. "mailto:" */
                              IF SwTerse=0 THEN Say 'Found non-http link "'||URL||'"'
                              Download=0
                          END
                          IF URLDev=Upper(URL) THEN DO
                              /* no ":" at all, so this is a local reference */
                              IF Left(URL,1)='/' THEN URL=SubStr(URL,2)    /* remove pre-slash */

                              /*handle "../"s - each becomes "//", which is resolved further down*/
                              newURL=URL
                              DO UNTIL URL=newURL
                                  URL=newURL
                                  Parse VAR URL newURLpre '../' newURLpost
                                  IF newURLpost~='' THEN newURL=newURLpre||'//'||newURLpost
                              END
                              URL=newURL

                              /*handle "./"s - simply dropped*/
                              newURL=URL
                              DO UNTIL URL=newURL
                                  URL=newURL
                                  Parse VAR URL newURLpre './' newURLpost
                                  IF newURLpost~='' THEN newURL=newURLpre||newURLpost
                              END
                              URL=newURL

                              IF Left(URL,1)='/' THEN URL=SubStr(URL,2)    /* remove 1st spurious pre-slash (otherwise path wrongly interpreted) */
                              URL=LocalURLDir||URL                /* local reference -> expand to full */

                               /* if have double-slashes (go down dir), then remove relevant dirs */
                              Done=0
                              DO Until Done=1            /* an algorithm with a bit of magic! */
                                  URLLen=Length(URL)
                                  EndDPos=Index(URL,'//',8)                /* first '//' after the one in "http://" */
                                  IF EndDPos>0 THEN DO
                                      StartDPos=Index(Reverse(URL),'/',URLLen-EndDPos+2)
                                      IF StartDPos>0 THEN DO
                                          StartDPos=URLLen-StartDPos+1        /* marks 1st slash before '//' */
                                          URL=Left(URL,StartDPos)||SubStr(URL,EndDPos+2)
                                       END
                                      ELSE
                                          Done=1
                                   END
                                  ELSE
                                      Done=1
                              END
                           END
                          ELSE DO
                              IF URLRest=='' THEN Download=0        /* nothing after ":" */
                          END

                          IF Download=1 THEN DO
                               /* remove "#search" from "http:path/file#search" */
                              IF Index(URL,'#')~=0 THEN DO
                                  Parse VALUE Reverse(URL) WITH . '#' URL
                                  URL=Reverse(URL)
                              END

                               /* used to check for suffix & if it is not part of e.g. www.amiga.com */
                              Parse VALUE Reverse(URL) WITH URLFile '/' .
                              Parse VAR URLFile Suffix '.' .
                              URLFile=Reverse(URLFile)
                              Suffix=Reverse(Suffix)
                              GotSuffix=0
                              IF Suffix~=URLFile THEN DO
                                  GotSuffix=1
                                  /* last "/" is one of those in "http://": the dot belongs to the host name */
                                  IF Index(Reverse(URL),'/')>(Length(URL)-7) THEN GotSuffix=0
                              END

                              IF (GotSuffix=0)&(Right(URL,1)~='/')&(Index(URL,'?')=0) THEN
                                  URL2=URL||'/'    /* MAY need to add implicit slash */
                              ELSE
                                  URL2=''

                              /* Must be reset for EVERY link: otherwise one external
                                 link marks all later links of the page as external. */
                              ExternalLink=0
                              IF Left(URL,Length(BaseURLDir))~==BaseURLDir THEN DO
                                  IF SwBroken=0 THEN
                                      Download=0    /* don't download pages below initial dir */
                                  ELSE
                                      ExternalLink=1    /* do download but no further */
                               END
                              ELSE DO
                                   /* check if URL is BaseURL's index (ie."baseurl/" or "baseurl/index.html") */
                                  IF SwNoBaseIndex=1 THEN DO
                                      AboveBaseURLDir=Upper(Right(URL,Length(URL)-Length(BaseURLDir)))
                                      /* BaseURLDir already ends in "/", so "baseurl/" itself leaves nothing over */
                                      IF (AboveBaseURLDir=='')|(AboveBaseURLDir='/')|(Left(AboveBaseURLDir,6)='INDEX.') THEN Download=0
                                  END
                              END

                               /* check if should download this file-type */
                              HTMLfile=0
                              /* plain AND here - in ARexx "&&" is exclusive-or */
                              IF (GotSuffix=1)&(ExternalLink=0) THEN DO    /*never consider external links*/
                                  Suffix=Upper(Left(Suffix,3,' '))
                                   /* as well as always downloading HTML files, also intelligently downloads if suffix same as last user-confirmed download */
                                  IF (Suffix~='HTM')&(Suffix~='SHT')&(Suffix~='SH ')&(Suffix~=LastSuffix) THEN DO
                                      Ask=1
                                      Arc=0; Pic=0
                                      IF (Suffix='LZX')|(Suffix='LHA')|(Suffix='ZIP')|(Suffix='LZH')|(Suffix='ZOO') THEN Arc=1
                                      IF (Suffix='GIF')|(Suffix='JPG')|(Suffix='JPE')|(Suffix='PNG')|(Suffix='JFI') THEN Pic=1

                                      IF (Arc=1)&(SwArc=1) THEN Ask=0
                                      IF (Pic=1)&(SwPic=1) THEN Ask=0

                                      IF Download=1 THEN DO
                                          IF Ask=1 THEN DO
                                              IF SwNoAsk=1 THEN
                                                  Download=0
                                              ELSE DO
                                                  Say 'QUERY:  Download file "'||URL||'"?'
                                                  DO Until Input~=''
                                                      Pull Input
                                                  END
                                                  IF Left(Input,1)='N' THEN
                                                      Download=0
                                                  ELSE
                                                      LastSuffix=Suffix
                                              END
                                          END
                                      END
                                    END
                                  ELSE
                                      HTMLfile=1
                              END

                              IF FileURL=URL  THEN Download=0        /* avoid self-referencing infinite loops */

                              IF Download=1 THEN DO
                                   /* store URL in list */
                                  URL=Strip(URL,'T')
                                  INTERPRET 'URLListSize='||URLList||'0 + 1'
                                  INTERPRET URLList||'0 = URLListSize'
                                  INTERPRET URLList||URLListSize||' = URL'

                                  INTERPRET URLList||URLListSize||'.HTML = HTMLfile'    /* record whether file should be scanned! */
                                  INTERPRET URLList||URLListSize||'.EXT = ExternalLink'    /* record is external link */

                                   /* add 2nd possible interpretation of URL to list*/
                                  IF URL2~='' THEN DO
                                      URL2=Strip(URL2,'T')
                                      INTERPRET 'URLListSize='||URLList||'0 + 1'
                                      INTERPRET URLList||'0 = URLListSize'
                                      INTERPRET URLList||URLListSize||' = URL2'

                                      INTERPRET URLList||URLListSize||'.HTML = HTMLfile'    /* record whether file should be scanned! */
                                      IF ExternalLink=1 THEN INTERPRET URLList||URLListSize||'.EXT = 1' /* record is external link */
                                  END
                              END
                          END
                       END
                  END
              END
          END
          CALL Close(.file)
      END
  Return
  587.  
  GetHTML: PROCEDURE
      /* GetHTML(TheURL,File)
         Tells the current host to fetch TheURL into File, then checks
         "QUERY FINISHED" every Delay(50) until the value that comes back
         is no longer greater than 0.
      */
      Parse ARG TheURL, File

      /* hand the job over */
      'SET OUTFILE' File
      'SET URL' TheURL
      'START'

      /* wait for it to end */
      DO FOREVER
          CALL Delay(50)
          'QUERY FINISHED'
          IF ~(Result>0) THEN LEAVE
      END
  Return
  605.