home *** CD-ROM | disk | FTP | other *** search
/ The C Users' Group Library 1994 August / wc-cdrom-cusersgrouplibrary-1994-08.iso / vol_300 / 360_01 / dctcvt.c < prev    next >
Text File  |  1992-02-16  |  4KB  |  200 lines

  1. /*    dctcvt - convert spelling dictionary and build index    */
  2.  
  3. #include <fcntl.h>
  4. #include <sys/types.h>
  5. #include <sys/stat.h>
  6.  
  7. #define GRANULE    16    /* index granularity in words    */
  8.  
  9. struct idx
  10.     {
  11.     char    *key;        /* address of key in main mem */
  12.     long    addr;        /* file pointer where a word is stored */
  13.     };
  14.  
  15. struct suffix
  16.     {
  17.     char    *value;        /* suffix value            */
  18.     int    length;        /* length of suffix value    */
  19.     };
  20.  
  21. struct    suffix    suffix[]=
  22.     {
  23.     {"ers",3},
  24.     {"ing",3},
  25.     {"ed",2},
  26.     {"er",2},
  27.     {"es",2},
  28.     {"ly",2},
  29.     {"d",1},
  30.     {"s",1},
  31.     0
  32.     };
  33.  
  34. char    *malloc();
  35.  
  36. main(argc,argv)
  37.     int argc;
  38.     char **argv;
  39.     {
  40.     struct    stat    stats;
  41.     long    addr;
  42.     int    first=1;
  43.     int    size, idxsize, shortsize, dsize, dsize2, fileno, pos, words;
  44.     int    i, suflen;
  45.     char    *dict, *dict2, *dictend, *cp1, *cp2, *idx, *shortaddr;
  46.     char    *root[30], *flag[30];
  47.     char    c;
  48.  
  49.     printf("reading source dictionary\n");
  50.     fileno=open("wpdict.nl", O_RDONLY);
  51.     if (fileno<0)
  52.         {
  53.         printf("cant open old dictionary\n");
  54.         exit();
  55.         }
  56.  
  57.     fstat(fileno,&stats);
  58.     dsize=stats.st_size;
  59.     dict=malloc(dsize);
  60.     dictend=dict+dsize;
  61.     read(fileno,dict,dsize);
  62.     close(fileno);
  63.     printf("read %d\n",dsize);
  64.     printf("converting dictionary\n");
  65.     words=0;
  66.     for(cp1=dict; cp1<dictend; cp1++)
  67.         if (*cp1==10)
  68.             {
  69.             *cp1=0;
  70.             words++;
  71.             }
  72.  
  73.     printf("%d words processed\n",words);
  74.     for(i=0; i<sizeof(root)/sizeof(root[0]); root[i++]="");
  75.     dict2=malloc(dsize+words);
  76.     words=0;
  77.     for(cp1=dict, cp2=dict2; cp1<dictend; cp1++)
  78.         {
  79.         size=strlen(cp1);
  80.         for (i=0, suflen=0; suffix[i].length && !suflen; i++)
  81.             {
  82.             if (size>suffix[i].length
  83.                 && strcmp(cp1+size-suffix[i].length,
  84.                     suffix[i].value)==0
  85.                 && strcmp(cp1+size-suffix[i].length,
  86.                     suffix[i].value)==0
  87.                 && strncmp(cp1,
  88.                     root[size-suffix[i].length],
  89.                     size-suffix[i].length)==0)
  90.                 {
  91.                 suflen=suffix[i].length;
  92.                 *flag[size-suflen]=(*flag[size-suflen])|(1<<i);
  93.                 cp1+=size;
  94.                 }
  95.             }
  96.  
  97.         if (!suflen)
  98.             {
  99.             root[size]=cp1;
  100.             for (pos=1; *cp1; cp1++, pos++)
  101.                 {
  102.                 c=*cp1;
  103.                 if (c=='\'')
  104.                     c=1;
  105.                 else
  106.                     c=(c&31)+1;
  107.  
  108.                 if (pos==1)
  109.                     {
  110.                     *cp2=c<<3;
  111.                     *(cp2+1)=0;
  112.                     }
  113.                 else if (pos==2)
  114.                     {
  115.                     *cp2=*cp2|(c>>2);
  116.                     cp2++;
  117.                     *cp2=c<<6;
  118.                     *(cp2+1)=0;
  119.                     }
  120.                 else
  121.                     {
  122.                     *cp2=*cp2|c;
  123.                     cp2++;
  124.                     *cp2=0;
  125.                     pos=0;
  126.                     }
  127.                 }
  128.  
  129.             if (*cp2) *++cp2=0;
  130.             cp2++;
  131.             flag[size]=cp2;
  132.             *cp2++=0;
  133.             words++;
  134.             }
  135.         }
  136.  
  137.     printf("retained %d words\n",words);
  138.     dsize2=cp2-dict2;
  139.     printf("writing target dictionary\n");
  140.     fileno=creat("wpdict.dat",0666);
  141.     if (fileno<0)
  142.         {
  143.         printf("cant open new dictionary\n");
  144.         exit();
  145.         }
  146.  
  147.     write(fileno,dict2,dsize2);
  148.     close(fileno);
  149.     printf("wrote %d\n",dsize2);
  150.     printf("creating index\n");
  151.     dictend=dict2+dsize2;
  152.     cp2=idx=malloc(dsize);
  153.     for(cp1=dict2; cp1<dictend;)
  154.         {
  155.         shortsize=99;
  156.         for (i=0; i<GRANULE && cp1<dictend; i++)
  157.             {
  158.             size=strlen(cp1);
  159.             if (size<shortsize)
  160.                 {
  161.                 shortsize=size;
  162.                 shortaddr=cp1;
  163.                 }
  164.  
  165.             cp1+=size;
  166.             cp1++;
  167.             cp1++;
  168.             }
  169.  
  170.         if (first)
  171.             {
  172.             first=0;
  173.             shortaddr=dict2;
  174.             }
  175.  
  176.         strcpy(cp2,shortaddr);
  177.         cp2+=strlen(cp2);
  178.         cp2++;
  179.         addr=shortaddr-dict2;
  180.         *cp2++=addr%256;
  181.         addr/=256;
  182.         *cp2++=addr%256;
  183.         addr/=256;
  184.         *cp2++=addr;
  185.         }
  186.  
  187.     idxsize=cp2-idx;
  188.     printf("writing index\n");
  189.     fileno=creat("wpdict.idx",0666);
  190.     if (fileno<0)
  191.         {
  192.         printf("cant open new index file\n");
  193.         exit();
  194.         }
  195.  
  196.     write(fileno,idx,idxsize);
  197.     printf("wrote %d\n",idxsize);
  198.     }
  199.  
  200.