home *** CD-ROM | disk | FTP | other *** search
/ Skunkware 5 / Skunkware 5.iso / src / Tools / freeWAIS-sf-1.1 / ir / irverify.c < prev    next >
Encoding:
C/C++ Source or Header  |  1994-05-19  |  15.1 KB  |  544 lines

  1. /* WIDE AREA INFORMATION SERVER SOFTWARE:
  2.    No guarantees or restrictions.  See the readme file for the full standard
  3.    disclaimer.
  4.  
  5.    Brewster@think.com
  6. */
  7.  
  8. /* Copyright (c) CNIDR (see ../COPYRIGHT) */
  9.  
  10.  
  11. /* Change log:
  12.  * $Log: irverify.c,v $
  13.  * Revision 1.2  1994/03/08  21:06:27  pfeifer
  14.  * Patchlevel 04
  15.  *
  16.  * Revision 1.1  1993/02/16  15:05:35  freewais
  17.  * Initial revision
  18.  *
  19.  * Revision 1.6  92/04/01  09:57:36  morris
  20.  * fixed and eof check in readPostings
  21.  * 
  22.  * Revision 1.5  92/03/28  19:48:10  jonathan
  23.  * Fixed Log header.
  24.  * 
  25.  * Revision 1.4  92/02/18  15:36:32  morris
  26.  * made it faster
  27.  * 
  28.  * Revision 1.3  92/02/12  13:32:33  jonathan
  29.  * Added $Log so RCS will put the log message in the header
  30.  * 
  31. */
  32.  
  33. #include "irverify.h"
  34. #include "irfiles.h"
  35. #include "panic.h"
  36. #include "futil.h"
  37.  
  38. #define TEST_READ false
  39.  
  40. /*---------------------------------------------------------------------------*/
  41.  
  42. void
  43. printIndex (db)
  44. database* db;
  45. /* iterate over the index printing the contents */
  46. {
  47.   serialPostingFile* spf = NULL;
  48.   char indexFileName[MAX_FILE_NAME_LEN + 1];
  49.   postingsForATerm* posts = NULL;
  50.   
  51.   spf = initSerialPostingFile(index_filename(indexFileName,db));
  52.   
  53.   while ((posts = getPostingsForNextTerm(spf)) != NULL)
  54.    { printPostingsForATerm(posts);
  55.      /* XXX dispose of them */
  56.    }
  57.    
  58.   disposeSerialPostingFile(spf);
  59. }
  60.  
  61. /*---------------------------------------------------------------------------*/
  62.  
  63. static void 
  64. print_dictionary_block_and_index _AP((unsigned char* block,long size,serialPostingFile* spf));
  65.  
  66. static void 
  67. print_dictionary_block_and_index(block,size,spf)
  68. unsigned char *block;
  69. long size;
  70. serialPostingFile* spf;
  71. /* this prints the contents of a dictionary block */
  72. {
  73.   long i;
  74.   postingsForATerm* posts = NULL;
  75.   
  76.   for(i = 0; i < size; i++)
  77.    {
  78.      char *word = dictionary_block_word(i, block);
  79.      long pos = dictionary_block_position(i, block);
  80.      if(word[0] == '\0')
  81.        break;
  82.      printf("Entry %3ld: %21s %7ld\n", i, word,pos);
  83.      posts = getPostingsAt(spf,pos);
  84.      printPostingsForATerm(posts);
  85.      /* XXX dispose of them postings */
  86.    }
  87. }
  88.  
  89. /*---------------------------------------------------------------------------*/
  90.  
  91. extern long number_of_dictionary_blocks;
  92. extern unsigned char *dictionary_header_block;
  93. extern unsigned char *dictionary_block;
  94.  
  95. void
  96. printIndexUsingDictionary(db)
  97. database* db;
  98. /* use the dictionary to go over the index */
  99. {
  100.   /* prints the contents of a dictionary */
  101.   FILE *dictStream = db->dictionary_stream;
  102.   long i;
  103.   long new_number_of_dictionary_blocks;
  104.   serialPostingFile* spf = NULL;
  105.   char indexFileName[MAX_FILE_NAME_LEN + 1];
  106.  
  107.   spf = initSerialPostingFile(index_filename(indexFileName,db));
  108.  
  109.   if(NULL == dictStream)
  110.     panic("dictionary dictStream is not open");
  111.   s_fseek(dictStream, 0L, SEEK_SET);
  112.   new_number_of_dictionary_blocks = read_bytes(DICTIONARY_HEADER_SIZE, dictStream);
  113.   if(new_number_of_dictionary_blocks > number_of_dictionary_blocks)
  114.     dictionary_header_block = NULL;
  115.   number_of_dictionary_blocks = new_number_of_dictionary_blocks;
  116.   printf("Number of dictionary blocks %ld\n", number_of_dictionary_blocks);
  117.   if(NULL == (dictionary_header_block =
  118.           read_dictionary_block(dictionary_header_block,
  119.                     DICTIONARY_HEADER_SIZE,
  120.                     number_of_dictionary_blocks,
  121.                     dictStream)))
  122.     panic("Could not read dictionary header block");
  123.   printf("The Dictionary Header Block:\n");
  124.   print_dictionary_block(dictionary_header_block, number_of_dictionary_blocks);
  125.   for(i = 0; i < number_of_dictionary_blocks; i++)
  126.   {
  127.     long pos = dictionary_block_position(i, dictionary_header_block);
  128.     if(NULL == (dictionary_block =
  129.         read_dictionary_block(dictionary_block,
  130.                       pos, DICTIONARY_BLOCK_SIZE, dictStream)))
  131.       panic("Could not read dictionary block %ld", pos);
  132.     printf("\n\nDictionary block %ld (position %ld):\n", i, pos);
  133.     print_dictionary_block_and_index(dictionary_block, DICTIONARY_BLOCK_SIZE,spf);
  134.   }
  135.   fseek(dictStream, 0L, SEEK_END);
  136.   disposeSerialPostingFile(spf);
  137. }
  138.  
  139. /*---------------------------------------------------------------------------*/
  140.  
  141. serialPostingFile*
  142. initSerialPostingFile(filename)
  143. char* filename;
  144. /* open an inverted index file create by irn8. return a structure
  145.    maintaining its state
  146.  */
  147. {
  148.   FILE* stream = NULL;
  149.   serialPostingFile* pf = NULL;
  150.  
  151.   stream = s_fopen(filename,"rb");
  152.   if (stream == NULL) /* can't open that file */
  153.     return(NULL);
  154.   s_fseek(stream,INDEX_HEADER_SIZE,SEEK_SET);
  155.  
  156.   pf = (serialPostingFile*)s_malloc((size_t)sizeof(serialPostingFile));
  157.   pf->stream = stream;
  158.   pf->length = file_length(stream);
  159.   pf->current_index_block = INDEX_HEADER_SIZE;
  160.  
  161.   return(pf);
  162. }
  163.  
  164. /*---------------------------------------------------------------------------*/
  165.  
  166. void
  167. disposeSerialPostingFile(pf)
  168. serialPostingFile* pf;
  169. {
  170.   s_fclose(pf->stream);
  171.   s_free(pf);
  172. }
  173.  
  174. /*---------------------------------------------------------------------------*/
  175.  
  176. void 
  177. printPostingsForATerm(pfat)
  178. postingsForATerm* pfat;
  179. {
  180.   long i;
  181.  
  182.   if (pfat->word[0] != '\0')
  183.     printf("word '%s'\n",pfat->word);
  184.     
  185.   for (i = 0; i < pfat->entries; i++)
  186.     printf("\tdoc %ld weight %ld\n",pfat->docs[i],pfat->weights[i]);
  187. }
  188.  
  189. /*---------------------------------------------------------------------------*/
  190.  
  191. postingsForATerm*
  192. getPostingsAt(spf,position)
  193. serialPostingFile* spf;
  194. long position;
  195. /* position better be a valid starting position! */
  196. {
  197.   fseek(spf->stream,position,SEEK_SET);
  198.   spf->current_index_block = position;
  199.   return(getPostingsForNextTerm(spf));
  200. }
  201.  
  202. /*---------------------------------------------------------------------------*/
  203.  
  204. void
  205. disposePostingsForATerm(pfat)
  206. postingsForATerm* pfat;
  207. {
  208.   s_free(pfat->docs);
  209.   s_free(pfat->weights);
  210.   s_free(pfat);
  211. }
  212.  
  213. /*---------------------------------------------------------------------------*/
  214.  
  215. void 
  216. removePostings(pfat,start,run)
  217. postingsForATerm* pfat;
  218. long start;
  219. long run;
  220. /* remove postings start through start + run from the pfat */
  221. {
  222.   void* toPtr = NULL;
  223.   long runLen;
  224.   long toMove;
  225.  
  226.   if (start + run > pfat->entries)
  227.     return; /* this is an error */
  228.  
  229.   toPtr = (void*)(pfat->docs + (start * sizeof(docID)));
  230.   runLen = run * sizeof(docID);
  231.   toMove = ((pfat->entries - start) * sizeof(docID)) - runLen;
  232.   memmove(toPtr,toPtr + runLen,toMove);
  233.  
  234.   toPtr = (void*)(pfat->weights + (start * sizeof(postingWeight)));
  235.   runLen = run * sizeof(docID);
  236.   toMove = ((pfat->entries - start) * sizeof(postingWeight)) - runLen;
  237.   memmove(toPtr,toPtr + runLen,toMove);
  238.  
  239.   pfat->entries -= run;
  240. }
  241.  
  242. /*---------------------------------------------------------------------------*/
  243.  
  244. void
  245. readPostings(spf,posts,not_full_flag)
  246. serialPostingFile* spf;
  247. postingsForATerm* posts;
  248. long not_full_flag;
  249. {
  250.   long count;
  251.   long document_id,weight,number_of_valid_entries;
  252.   long index_block = read_bytes(NEXT_INDEX_BLOCK_SIZE,spf->stream);
  253.   long index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE,spf->stream);
  254.   
  255.   if (EOF == index_block_size) 
  256.    { fprintf(stderr,"reading from the index file failed\n");
  257.      return;
  258.    }
  259.       
  260.   if (not_full_flag == INDEX_BLOCK_NOT_FULL_FLAG) /* not full */
  261.     number_of_valid_entries = index_block / INDEX_ELEMENT_SIZE;
  262.  
  263.   else if (not_full_flag == INDEX_BLOCK_FULL_FLAG) /* full */
  264.    { number_of_valid_entries = 
  265.        (index_block_size - INDEX_BLOCK_HEADER_SIZE) / INDEX_ELEMENT_SIZE;
  266.    }
  267.  
  268.   else
  269.    { /* bad news,file is corrupted.  this should return error
  270.     code rather than panicing XXX */
  271.      panic("Expected the flag in the inverted file to be valid.  it is %lx",
  272.        not_full_flag);
  273.    }
  274.  
  275.   posts->docs = 
  276.     (docID*)s_malloc((size_t)(sizeof(docID) * number_of_valid_entries));
  277.   posts->weights = 
  278.     (postingWeight*)s_malloc((size_t)(sizeof(postingWeight) * 
  279.                       number_of_valid_entries));
  280.  
  281.   for (count = 0; count < number_of_valid_entries; count++)
  282.    { long val;
  283.      posts->docs[count] = read_bytes(DOCUMENT_ID_SIZE,spf->stream);
  284.      s_fseek(spf->stream,WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE,SEEK_CUR);
  285.      val = read_bytes(WEIGHT_SIZE,spf->stream);
  286.      if(EOF == val)
  287.       { fprintf(stderr,"reading from the inverted file failed\n");
  288.         return;
  289.       }
  290.      else
  291.       { posts->weights[count] = val;
  292.         posts->entries++;
  293.       } 
  294.    }
  295. }
  296.  
  297. /*---------------------------------------------------------------------------*/
  298.  
  299. long
  300. readDictionaryIndexBlock(number_of_occurances,word,stream)
  301. long *number_of_occurances;
  302. char *word;
  303. FILE *stream;
  304. /* NOTE - similar to read_dictionary_index_lock */
  305. {
  306.   /* this reads the dictionary index block from the index stream.
  307.      It assumes the stream is positioned at the right after the flag
  308.      returns 0 if it succeeds.
  309.      returns -1 if it is at the end of a file.
  310.      returns -2 if it read something strange.
  311.      Always sets word length to 0 if it fails. */
  312.  
  313.   char temp[MAX_WORD_LENGTH + 2];
  314.   
  315.   word[0] = '\0';
  316.  
  317.   s_fseek(stream,NEXT_INDEX_BLOCK_SIZE+INDEX_BLOCK_SIZE_SIZE,SEEK_CUR);
  318.   *number_of_occurances = read_bytes(NUMBER_OF_OCCURANCES_SIZE,stream);
  319.   fgets(temp,MAX_WORD_LENGTH + 2,stream); /* 2 is for the \n and '\0' */
  320.  
  321.   /* trim the \n */
  322.   if(temp[strlen(temp) - 1] == '\n'){
  323.     temp[strlen(temp) - 1] = '\0';
  324.   }
  325.   strcpy(word, temp);
  326.  
  327.   return(0);
  328. }
  329.  
  330. /*---------------------------------------------------------------------------*/
  331.  
  332. postingsForATerm*
  333. getPostingsForNextTerm(spf)
  334. serialPostingFile* spf;
  335. {
  336.   postingsForATerm* posts = NULL;
  337.   
  338.   posts = (postingsForATerm*)s_malloc((size_t)sizeof(postingsForATerm));
  339.   posts->word[0] = '\0';
  340.   posts->entries = 0;
  341.  
  342.   /* this is really a 2 step process - read the dictonary block, then read
  343.      the postings.  I don't see any reason to unwrap it though */
  344.   while (true)
  345.    { 
  346.      long flag = read_bytes(INDEX_BLOCK_FLAG_SIZE,spf->stream);
  347.  
  348.      if (flag == EOF)
  349.       { return(NULL);
  350.       }
  351.  
  352.      if (flag == INDEX_BLOCK_DICTIONARY_FLAG) /* the dictionary entry */
  353.       { /* read the dictionary part */
  354.     long number_of_occurances;
  355.     if (readDictionaryIndexBlock(&number_of_occurances,
  356.                      posts->word,spf->stream) < 0)
  357.       panic("read dictionary index block failed");
  358.       }
  359.      else /* the posting entry */
  360.       { readPostings(spf,posts,flag);
  361.     break;
  362.       }
  363.    }
  364.  
  365.   return(posts);
  366. }
  367.  
  368. /*---------------------------------------------------------------------------*/
  369.  
  370.  
  371.  
  372.  
  373. #ifdef old
  374.  
  375. these routines are slower than the current ones; keep them around for a 
  376. while until we are sure the new ones work ok
  377.  
  378. /*---------------------------------------------------------------------------*/
  379.  
  380. postingsForATerm*
  381. getPostingsForNextTerm(spf)
  382. serialPostingFile* spf;
  383. {
  384.   postingsForATerm* slow;
  385.   postingsForATerm* fast;
  386. /*
  387.   long filePos = s_ftell(spf->stream);
  388.   slow = getPostingsForNextTermSLOW(spf);
  389.   printf("SLOW:\n");
  390.   printPostingsForATerm(slow);NL();
  391.   s_fseek(spf->stream,filePos,SEEK_SET);
  392. */
  393.   fast = getPostingsForNextTermFAST(spf);
  394. /*  printf("FAST:\n");
  395.   printPostingsForATerm(fast);NL();
  396.   disposePostingsForATerm(slow);
  397. */
  398.   return(fast);
  399. }
  400.  
  401. /*---------------------------------------------------------------------------*/
  402.  
  403. postingsForATerm*
  404. getPostingsForNextTermSLOW(spf)
  405. serialPostingFile* spf;
  406. {
  407.   postingsForATerm* posts = NULL;
  408.   boolean keepGoing = true;
  409.   
  410.   if (spf->current_index_block >= spf->length)
  411.     return(NULL);
  412.  
  413.   posts = (postingsForATerm*)s_malloc((size_t)sizeof(postingsForATerm));
  414.   posts->word[0] = '\0';
  415.   posts->entries = 0;
  416.  
  417.   while (keepGoing) 
  418.    { 
  419.      long flag = read_bytes(INDEX_BLOCK_FLAG_SIZE,spf->stream);
  420.      long next_index_block = read_bytes(NEXT_INDEX_BLOCK_SIZE,spf->stream);
  421.      long index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE,spf->stream);
  422.  
  423.      if (flag == INDEX_BLOCK_DICTIONARY_FLAG)
  424.        { long last_index_block;
  425.      long index_block_size;
  426.      long number_of_occurances;
  427.      char word[MAX_WORD_LENGTH + 1];
  428.      if (0 > read_dictionary_index_block(spf->current_index_block,
  429.                          &last_index_block,
  430.                          &index_block_size,
  431.                          &number_of_occurances,
  432.                          word,
  433.                          spf->stream))
  434.        panic("read dictionary index block failed");
  435.      cprintf(TEST_READ,
  436.          "%ld: size %3ld word '%s',occurances %ld last block %ld\n",
  437.          spf->current_index_block,index_block_size,word,
  438.          number_of_occurances,next_index_block);
  439.      strcpy(posts->word,word);
  440.        }
  441.  
  442.      else if (flag == INDEX_BLOCK_NOT_FULL_FLAG)
  443.        { cprintf(TEST_READ,"%ld: size %3ld Not full,valid entries %ld\n",
  444.          spf->current_index_block,index_block_size,next_index_block);
  445.      readPostings(spf,posts);
  446.      keepGoing = false;
  447.        }
  448.  
  449.      else if (flag == INDEX_BLOCK_FULL_FLAG)
  450.        { cprintf(TEST_READ,"%ld: size %3ld full block,next block %ld\n",
  451.          spf->current_index_block,index_block_size,next_index_block);
  452.      readPostings(spf,posts);
  453.      keepGoing = false;
  454.        }
  455.  
  456.      else 
  457.        panic("bad entry %ld (ftell %ld),flag was %ld",
  458.          spf->current_index_block,ftell(spf->stream),flag);
  459.  
  460.      spf->current_index_block += index_block_size;
  461.      s_fseek(spf->stream,spf->current_index_block,SEEK_SET);
  462.    }
  463.  
  464.   return(posts);
  465. }
  466.  
  467. /*---------------------------------------------------------------------------*/
  468.  
  469. void
  470. readPostings(spf,posts)
  471. serialPostingFile* spf;
  472. postingsForATerm* posts;
  473. {
  474.   long not_full_flag = INDEX_BLOCK_FULL_FLAG;
  475.   long count,index_block_size;
  476.   long document_id,weight,number_of_valid_entries;
  477.   long index_block = spf->current_index_block;
  478.   
  479.   if (index_block >= 0)
  480.     {
  481.       /* read the index block */
  482.       if (0 != fseek(spf->stream,(long)index_block,SEEK_SET))
  483.     { 
  484.       fprintf(stderr,
  485.           "fseek failed into the inverted file to position %ld\n",
  486.           (long)index_block); 
  487.       return;
  488.     }
  489.       
  490.       not_full_flag = read_bytes(INDEX_BLOCK_FLAG_SIZE,spf->stream);
  491.       index_block = read_bytes(NEXT_INDEX_BLOCK_SIZE,spf->stream);
  492.       index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE,spf->stream);
  493.       if (EOF == index_block_size) 
  494.     { fprintf(stderr,"reading from the index file failed\n");
  495.       return;
  496.     }
  497.       
  498.       if (not_full_flag == INDEX_BLOCK_NOT_FULL_FLAG)
  499.     { /* not full */
  500.       number_of_valid_entries = index_block;
  501.     }
  502.       else if (not_full_flag == INDEX_BLOCK_FULL_FLAG)
  503.     { /* full */
  504.       number_of_valid_entries = index_block_size - INDEX_BLOCK_HEADER_SIZE;
  505.     }
  506.       else
  507.     { /* bad news,file is corrupted.  this should return error
  508.          code rather than panicing XXX */
  509.     panic("Expected the flag in the inverted file to be valid.  it is %ld",
  510.           not_full_flag);
  511.         }
  512.  
  513.       cprintf(TEST_READ,"  number of valid bytes: %ld\n",
  514.           number_of_valid_entries);
  515.       
  516.       for (count = 0; count < number_of_valid_entries; 
  517.        count = count + INDEX_ELEMENT_SIZE)
  518.     {
  519.       document_id = read_bytes(DOCUMENT_ID_SIZE,spf->stream);
  520.       (void)read_bytes(WORD_POSITION_SIZE,spf->stream);
  521.       (void)read_bytes(CHARACTER_POSITION_SIZE,spf->stream);
  522.       weight = read_bytes(WEIGHT_SIZE,spf->stream);
  523.       cprintf(TEST_READ,"    entry %ld,Doc_id: %ld,weight %ld\n",
  524.           count % INDEX_ELEMENT_SIZE,document_id,weight);
  525.       if(EOF == weight) 
  526.         { fprintf(stderr,"reading from the doc-id table failed\n");
  527.           return;
  528.         }
  529.       posts->entries++;
  530.       posts->docs = (docID*)s_realloc(posts->docs,
  531.                   (size_t)(sizeof(docID) * posts->entries));
  532.       posts->docs[posts->entries - 1] = document_id;
  533.       posts->weights = (postingWeight*)s_realloc(posts->weights,
  534.                      (size_t)(sizeof(postingWeight) * 
  535.                           posts->entries));
  536.       posts->weights[posts->entries - 1] = weight;
  537.     }
  538.     }
  539. }
  540.  
  541. /*---------------------------------------------------------------------------*/
  542.  
  543. #endif /* def old */
  544.