home *** CD-ROM | disk | FTP | other *** search
/ Dream 52 / Amiga_Dream_52.iso / Linux / Magazine / wwwoffle-2.1.tar.gz / wwwoffle-2.1 / html.l < prev    next >
Text File  |  1998-02-26  |  14KB  |  532 lines

/* W: the whitespace characters that are accepted inside a tag. */
W               [ \t\r\n]
/* F: the characters that are accepted as part of a URL in an attribute value. */
/* NOTE(review): only lower-case letters are listed here and in the tag names below; */
/* presumably the scanner is generated case-insensitively (flex -i) - confirm in the Makefile. */
F               [-a-z0-9$_.!*(),%;/?:@&=+~|]

/* Exclusive start conditions: the header, comments, generic tags and one group per examined tag. */
%x HEADER
%x COMMENT
%x ANGLE_START ANGLE
%x IMAGE IMAGE_SRC
%x BODY BODY_BACK
%x ANCHOR ANCHOR_HREF
%x LINK LINK_HREF
%x FRAME FRAME_SRC
%x META META_REFRESH META_REFRESH_URL
  13.  
  14. %{
  15. /***************************************
  16.   $Header: /home/amb/wwwoffle/RCS/html.l 2.13 1998/02/26 20:04:43 amb Exp $
  17.  
  18.   WWWOFFLE - World Wide Web Offline Explorer - Version 2.1.
  19.   Parse the HTML and look for the images, links and end of body.
  20.   ******************/ /******************
  21.   Written by Andrew M. Bishop
  22.  
  23.   This file Copyright 1997 Andrew M. Bishop
  24.   It may be distributed under the GNU Public License, version 2, or
  25.   any higher version.  See section COPYING of the GNU Public license
  26.   for conditions under which this file may be redistributed.
  27.   ***************************************/
  28.  
  29.  
  30. #include <stdlib.h>
  31. #include <string.h>
  32. #include <ctype.h>
  33.  
  34. #include <unistd.h>
  35.  
  36. #include "wwwoffle.h"
  37. #include "misc.h"
  38.  
  39.  
  40. extern int yylex(void);
  41. #define yywrap() 1
  42.  
  43. /*+ The type of reference that has been found. +*/
  44. typedef enum _RefType
  45. {
  46.  Image,                         /*+ An inlined image. +*/
  47.  Link,                          /*+ A link to another page. +*/
  48.  Frame                          /*+ The contents of a frame. +*/
  49. }
  50. RefType;
  51.  
  52. static void free_list(RefType type);
  53. static char **get_list(RefType type);
  54. static void add_reference(char* name,RefType type);
  55. static void canonicalise_references(RefType type);
  56.  
/*+ The file descriptor that we are reading from (used by YY_INPUT); -1 until ParseHTML() sets it. +*/
static int yyfd=-1;

/*+ If the file is parsed as an HTML file; seeded by ParseHTML() and set by the header rules. +*/
static int is_html=0;

/*+ The list of images; malloc'd strings, NULL terminated by ParseHTML() after scanning. +*/
static char **images=NULL;

/*+ The number of slots used in the list of images, the NULL terminator included. +*/
static int nimages=0;

/*+ The list of links; malloc'd strings, NULL terminated by ParseHTML() after scanning. +*/
static char **links=NULL;

/*+ The number of slots used in the list of links, the NULL terminator included. +*/
static int nlinks=0;

/*+ The list of frames; malloc'd strings, NULL terminated by ParseHTML() after scanning. +*/
static char **frames=NULL;

/*+ The number of slots used in the list of frames, the NULL terminator included. +*/
static int nframes=0;

/*+ The refresh content of a Meta tag; a malloc'd copy of the URL or NULL if none was found. +*/
static char *meta_refresh=NULL;

/*+ Just before the end of body tag (or the end of html tag (or end of file));
    a count of the bytes scanned after the header, set when the scanner reaches end of file. +*/
static int body_or_html_end=0;

/*+ The URL that this is referenced from; used to make relative references absolute. +*/
static URL *refUrl=NULL;
  89.  
  90.  
  91. /*++++++++++++++++++++++++++++++++++++++
  92.   Parse the HTML and look for references to image/links/frames.
  93.  
  94.   int ParseHTML Returns 1 if it was parsed as an HTML file.
  95.  
  96.   int fd The file descriptor of the file to parse.
  97.  
  98.   URL *Url The reference URL to use.
  99.  
  100.   int html For files without a header, force html selection.
  101.   ++++++++++++++++++++++++++++++++++++++*/
  102.  
  103. int ParseHTML(int fd,URL *Url,int html)
  104. {
  105.  refUrl=Url;
  106.  
  107.  is_html=html;
  108.  body_or_html_end=0;
  109.  
  110.  free_list(Image);
  111.  free_list(Link);
  112.  free_list(Frame);
  113.  
  114.  meta_refresh=NULL;
  115.  
  116.  yyfd=fd;
  117.  yyrestart(NULL);
  118.  yylex();
  119.  
  120.  add_reference(NULL,Image);
  121.  add_reference(NULL,Link);
  122.  add_reference(NULL,Frame);
  123.  
  124.  return(is_html);
  125. }
  126.  
  127.  
  128. /*++++++++++++++++++++++++++++++++++++++
  129.   Return the list of references.
  130.  
  131.   char **ListImages Returns a null terminated list of images.
  132.   ++++++++++++++++++++++++++++++++++++++*/
  133.  
  134. char **ListImages(void)
  135. {
  136.  return(get_list(Image));
  137. }
  138.  
  139.  
  140. /*++++++++++++++++++++++++++++++++++++++
  141.   List the links found in the HTML file.
  142.  
  143.   char **ListLinks Returns a null terminated list of links.
  144.   ++++++++++++++++++++++++++++++++++++++*/
  145.  
  146. char **ListLinks(void)
  147. {
  148.  return(get_list(Link));
  149. }
  150.  
  151.  
  152. /*++++++++++++++++++++++++++++++++++++++
  153.   List the frames found in the HTML file.
  154.  
  155.   char **ListFrames Returns a null terminated list of frames.
  156.   ++++++++++++++++++++++++++++++++++++++*/
  157.  
  158. char **ListFrames(void)
  159. {
  160.  return(get_list(Frame));
  161. }
  162.  
  163.  
  164. /*++++++++++++++++++++++++++++++++++++++
  165.   Return the URL from the Meta Refresh tag if there is one.
  166.  
  167.   char *MetaRefresh Returns the new URL or NULL if none.
  168.   ++++++++++++++++++++++++++++++++++++++*/
  169.  
  170. char *MetaRefresh(void)
  171. {
  172.  return(meta_refresh);
  173. }
  174.  
  175.  
  176. /*++++++++++++++++++++++++++++++++++++++
  177.   Return the position of the end of the html in the document.
  178.  
  179.   int GetHTMLEnd Returns the position of the end of the html.
  180.   ++++++++++++++++++++++++++++++++++++++*/
  181.  
  182. int GetHTMLEnd(void)
  183. {
  184.  return(body_or_html_end);
  185. }
  186.  
  187.  
  188. /*++++++++++++++++++++++++++++++++++++++
  189.   Free up a list of references.
  190.  
  191.   RefType type The type of list that is to be freed.
  192.   ++++++++++++++++++++++++++++++++++++++*/
  193.  
  194. static void free_list(RefType type)
  195. {
  196.  char ***list=NULL;
  197.  int *nlist=0;
  198.  
  199.  switch(type)
  200.    {
  201.    case Image:
  202.     list=&images;
  203.     nlist=&nimages;
  204.     break;
  205.    case Link:
  206.     list=&links;
  207.     nlist=&nlinks;
  208.     break;
  209.    case Frame:
  210.     list=&frames;
  211.     nlist=&nframes;
  212.     break;
  213.    }
  214.  
  215.  if(*list)
  216.    {
  217.     int i;
  218.     for(i=0;(*list)[i];i++)
  219.        free((*list)[i]);
  220.     free(*list);
  221.    }
  222.  
  223.  *list=NULL;
  224.  *nlist=0;
  225. }
  226.  
  227.  
  228. /*++++++++++++++++++++++++++++++++++++++
  229.   Get a list of the references of the specified type.
  230.  
  231.   char **get_list Returns the list of URLs.
  232.  
  233.   RefType type The type of list that is required.
  234.   ++++++++++++++++++++++++++++++++++++++*/
  235.  
  236. static char **get_list(RefType type)
  237. {
  238.  char ***list=NULL;
  239.  
  240.  switch(type)
  241.    {
  242.    case Image:
  243.     list=&images;
  244.     break;
  245.    case Link:
  246.     list=&links;
  247.     break;
  248.    case Frame:
  249.     list=&frames;
  250.     break;
  251.    }
  252.  
  253.  if(*list)
  254.     canonicalise_references(type);
  255.  
  256.  return(*list);
  257. }
  258.  
  259.  
  260. /*++++++++++++++++++++++++++++++++++++++
  261.   A function to add a name to the list of references.
  262.  
  263.   char* name The name to add.
  264.  
  265.   RefType type The type of reference.
  266.   ++++++++++++++++++++++++++++++++++++++*/
  267.  
  268. static void add_reference(char* name,RefType type)
  269. {
  270.  char ***list=NULL;
  271.  int *nlist=0;
  272.  
  273.  if(name && !strncmp("mailto:",name,7))
  274.     return;
  275.  
  276.  switch(type)
  277.    {
  278.    case Image:
  279.     list=&images;
  280.     nlist=&nimages;
  281.     break;
  282.    case Link:
  283.     list=&links;
  284.     nlist=&nlinks;
  285.     break;
  286.    case Frame:
  287.     list=&frames;
  288.     nlist=&nframes;
  289.     break;
  290.    }
  291.  
  292.  if((*nlist)==0)
  293.     (*list)=(char**)malloc(16*sizeof(char*));
  294.  else if(((*nlist)%16)==0)
  295.     (*list)=(char**)realloc((*list),((*nlist)+16)*sizeof(char*));
  296.  
  297.  if(name)
  298.    {
  299.     (*list)[(*nlist)]=(char*)malloc(strlen(name)+1);
  300.     strcpy((*list)[(*nlist)],name);
  301.    }
  302.  else
  303.     (*list)[(*nlist)]=NULL;
  304.  
  305.  (*nlist)++;
  306. }
  307.  
  308.  
  309. /*++++++++++++++++++++++++++++++++++++++
  310.   Fix the list up with canonical URLs not the relative ones.
  311.  
  312.   RefType type The type of reference.
  313.   ++++++++++++++++++++++++++++++++++++++*/
  314.  
  315. static void canonicalise_references(RefType type)
  316. {
  317.  char ***list=NULL;
  318.  int *nlist=0;
  319.  int i,j;
  320.  
  321.  switch(type)
  322.    {
  323.    case Image:
  324.     list=&images;
  325.     nlist=&nimages;
  326.     break;
  327.    case Link:
  328.     list=&links;
  329.     nlist=&nlinks;
  330.     break;
  331.    case Frame:
  332.     list=&frames;
  333.     nlist=&nframes;
  334.     break;
  335.    }
  336.  
  337.  for(i=0;(*list)[i];i++)
  338.    {
  339.     char **item=&(*list)[i];
  340.     char *name=*item;
  341.     char *colon=strchr(name,':');
  342.     char *slash=strchr(name,'/');
  343.  
  344.     if(colon && slash && colon<slash)
  345.        ;
  346.     else
  347.       {
  348.        if(*name=='/')
  349.          {
  350.           *item=(char*)malloc(strlen(refUrl->proto)+strlen(refUrl->host)+strlen(name)+8);
  351.           sprintf(*item,"%s://%s%s",refUrl->proto,refUrl->host,name);
  352.          }
  353.        else
  354.          {
  355.           int j;
  356.           char *path=(char*)malloc(strlen(refUrl->path)+strlen(name)+2);
  357.           char *match;
  358.  
  359.           strcpy(path,refUrl->path);
  360.  
  361.           for(j=strlen(path);j>0;j--)
  362.              if(path[j]=='/')
  363.                 break;
  364.           path[j]=0;
  365.  
  366.           strcat(path,"/");
  367.           strcat(path,name);
  368.  
  369.           while((match=strstr(path,"/../")))
  370.             {
  371.              char *prev=match; match+=3;
  372.              while(prev>path && *--prev!='/');
  373.              while((*prev++=*match++));
  374.             }
  375.  
  376.           while((match=strstr(path,"/./")))
  377.             {
  378.              char *prev=match; match+=2;
  379.              while((*prev++=*match++));
  380.             }
  381.  
  382.           while((match=strstr(path,"//")))
  383.             {
  384.              char *prev=match; match++;
  385.              while((*prev++=*match++));
  386.             }
  387.  
  388.           match=&path[strlen(path)-2];
  389.           if(match>=path && !strcmp(match,"/."))
  390.              *match=0;
  391.  
  392.           match=&path[strlen(path)-3];
  393.           if(match>=path && !strcmp(match,"/.."))
  394.              if(match==path)
  395.                 *match=0;
  396.              else
  397.                 while(match>path && *--match!='/')
  398.                    *match=0;
  399.  
  400.           *item=(char*)malloc(strlen(refUrl->proto)+strlen(refUrl->host)+strlen(path)+8);
  401.           sprintf(*item,"%s://%s%s",refUrl->proto,refUrl->host,path);
  402.           free(path);
  403.          }
  404.  
  405.        free(name);
  406.       }
  407.    }
  408.  
  409.  /* remove the duplicates */
  410.  
  411.  for(i=0;(*list)[i];i++)
  412.    {
  413.     for(j=i+1;(*list)[j];j++)
  414.        if(!strcmp((*list)[i],(*list)[j]))
  415.           break;
  416.  
  417.     if((*list)[j])
  418.       {
  419.        free((*list)[j]);
  420.        do
  421.          {
  422.           (*list)[j]=(*list)[j+1];
  423.          }
  424.        while((*list)[j++]);
  425.        i--;
  426.        (*nlist)--;
  427.       }
  428.    }
  429. }
  430.  
  431. #define YY_INPUT(buf,result,max_size) \
  432.         if((result=read_data(yyfd,buf,max_size))==-1) \
  433.            result=0;
  434.  
  435. %}
  436.  
  437. %%
 /* Per call state: the length of the last "<" match, the bytes scanned so far (the */
 /* HEADER rules do not count) and the positions noted for "</html>" and "</body>". */
 int open_angle=0,position=0,html_end=0,body_end=0;
 /* Data already declared to be HTML skips the header, otherwise the header lines are read first. */
 if(is_html)
    BEGIN(INITIAL);
 else
    BEGIN(HEADER);

 /* An empty line ends the header: carry on only if a text/html Content-Type line was seen, */
 /* otherwise give up (is_html is still 0).  All other header lines are ignored. */
<HEADER>\r*\n                                           { if(is_html) BEGIN(INITIAL); else return(EOF); }
<HEADER>"Content-Type:"[ \t]+"text/html"[ \t]*\r*\n     { is_html=1; }
<HEADER>.+\r*\n                                         { }
  447.  
  448.  
 /* Outside of tags: skip text, enter COMMENT on "<!--" and ANGLE_START on any other "<". */
 /* open_angle remembers how long the "<" match was (with the whitespace after it). */
[^<]+                                   { position+=yyleng; }
"<!--"                                  { position+=yyleng; BEGIN(COMMENT); }
"<"{W}*                                 { position+=yyleng; BEGIN(ANGLE_START); open_angle=yyleng; }


 /* Inside a comment: everything is skipped until "-->" is seen. */
<COMMENT>"-"                            { position+=yyleng; }
<COMMENT>">"                            { position+=yyleng; }
<COMMENT>"-->"                          { position+=yyleng; BEGIN(INITIAL); }
<COMMENT>[^->]+                         { position+=yyleng; }
  458.  
  459.  
 /* Just after "<": pick the state for the tags that are examined.  For "/body" and "/html" */
 /* the position of the "<" that opened the tag is noted (position has already been advanced */
 /* over the tag name, hence the subtraction).  Any other tag is skipped in ANGLE. */
<ANGLE_START>"img"{W}                   { position+=yyleng; BEGIN(IMAGE); }
<ANGLE_START>"body"{W}                  { position+=yyleng; BEGIN(BODY); }
<ANGLE_START>"a"{W}                     { position+=yyleng; BEGIN(ANCHOR); }
<ANGLE_START>"link"{W}                  { position+=yyleng; BEGIN(LINK); }
<ANGLE_START>"frame"{W}                 { position+=yyleng; BEGIN(FRAME); }
<ANGLE_START>"meta"{W}                  { position+=yyleng; BEGIN(META); }
<ANGLE_START>"/body"                    { position+=yyleng; BEGIN(ANGLE); body_end=position-yyleng-open_angle; }
<ANGLE_START>"/html"                    { position+=yyleng; BEGIN(ANGLE); html_end=position-yyleng-open_angle; }
<ANGLE_START>">"                        { position+=yyleng; BEGIN(INITIAL); }
<ANGLE_START>.|\n                       { position+=yyleng; BEGIN(ANGLE); }


 /* Inside a tag that is of no interest: skip up to and including the closing ">". */
<ANGLE>">"                              { position+=yyleng; BEGIN(INITIAL); }
<ANGLE>[^>]+                            { position+=yyleng; }
  474.  
  475.  
 /* The five groups below share one pattern: inside the tag, ">" returns to INITIAL and the */
 /* attribute name followed by "=" (and any quotes) enters the value state; in the value state */
 /* a run of URL characters is added as a reference, anything else is skipped, and the */
 /* scanner goes back to the tag state in both cases. */

 /* <img src=...> gives an Image. */
<IMAGE>">"                              { position+=yyleng; BEGIN(INITIAL); }
<IMAGE>"src"{W}*"="{W}*["']*            { position+=yyleng; BEGIN(IMAGE_SRC); }
<IMAGE>.|\n                             { position+=yyleng; }
<IMAGE_SRC>{F}+                         { position+=yyleng; BEGIN(IMAGE); add_reference(yytext,Image); }
<IMAGE_SRC>.|\n                         { position+=yyleng; BEGIN(IMAGE); }


 /* <body background=...> gives an Image. */
<BODY>">"                               { position+=yyleng; BEGIN(INITIAL); }
<BODY>"background"{W}*"="{W}*["']*      { position+=yyleng; BEGIN(BODY_BACK); }
<BODY>.|\n                              { position+=yyleng; }
<BODY_BACK>{F}+                         { position+=yyleng; BEGIN(BODY); add_reference(yytext,Image); }
<BODY_BACK>.|\n                         { position+=yyleng; BEGIN(BODY); }


 /* <a href=...> gives a Link. */
<ANCHOR>">"                             { position+=yyleng; BEGIN(INITIAL); }
<ANCHOR>"href"{W}*"="{W}*["']*          { position+=yyleng; BEGIN(ANCHOR_HREF); }
<ANCHOR>.|\n                            { position+=yyleng; }
<ANCHOR_HREF>{F}+                       { position+=yyleng; BEGIN(ANCHOR); add_reference(yytext,Link); }
<ANCHOR_HREF>.|\n                        { position+=yyleng; BEGIN(ANCHOR); }


 /* <link href=...> gives a Link. */
<LINK>">"                               { position+=yyleng; BEGIN(INITIAL); }
<LINK>"href"{W}*"="{W}*["']*            { position+=yyleng; BEGIN(LINK_HREF); }
<LINK>.|\n                              { position+=yyleng; }
<LINK_HREF>{F}+                         { position+=yyleng; BEGIN(LINK); add_reference(yytext,Link); }
<LINK_HREF>.|\n                         { position+=yyleng; BEGIN(LINK); }


 /* <frame src=...> gives a Frame. */
<FRAME>">"                              { position+=yyleng; BEGIN(INITIAL); }
<FRAME>"src"{W}*"="{W}*["']*            { position+=yyleng; BEGIN(FRAME_SRC); }
<FRAME>.|\n                             { position+=yyleng; }
<FRAME_SRC>{F}+                         { position+=yyleng; BEGIN(FRAME); add_reference(yytext,Frame); }
<FRAME_SRC>.|\n                         { position+=yyleng; BEGIN(FRAME); }
  509.  
  510.  
  511. <META>{W}*"HTTP-EQUIV=\"Refresh\""{W}*  { position+=yyleng; BEGIN(META_REFRESH); }
  512. <META>.|\n                              { position+=yyleng; BEGIN(ANCHOR); }
  513. <META_REFRESH>"URL="                    { position+=yyleng; BEGIN(META_REFRESH_URL); }
  514. <META_REFRESH>[^>]                      { position+=yyleng; }
  515. <META_REFRESH>">"                       { position+=yyleng; BEGIN(INITIAL); }
  516. <META_REFRESH_URL>{F}+                  { position+=yyleng; BEGIN(META_REFRESH);
  517.                                           meta_refresh=(char*)malloc(strlen(yytext)+1); strcpy(meta_refresh,yytext); }
  518. <META_REFRESH_URL>.|\n                  { position+=yyleng; BEGIN(META_REFRESH); }
  519.  
  520.  
 /* At the end of the input choose the position that GetHTMLEnd() reports: */
 /* - the "</body>" position if "</html>" starts less than 16 bytes after it and less than */
 /*   16 bytes lie between "</html>" and the end of the input; */
 /* - else the "</body>" position if less than 16 bytes lie between it and the end; */
 /* - else the "</html>" position if less than 16 bytes lie between it and the end; */
 /* - else the end of the input.  A position of 0 counts as "tag not seen". */
<<EOF>>                                 { if(body_end && html_end && (html_end-body_end)<16 && (position-html_end)<16)
                                             body_or_html_end=body_end;
                                          else if(body_end && (position-body_end)<16)
                                             body_or_html_end=body_end;
                                          else if(html_end && (position-html_end)<16)
                                             body_or_html_end=html_end;
                                          else
                                             body_or_html_end=position;
                                          return(EOF); }
  530.  
  531. %%
  532.