home *** CD-ROM | disk | FTP | other *** search
-
- {Section 11.3 Phase 1: Splitting the Text into Words}
-
-
- {Exercise E6}
-
- const
- nfiles = 8; {number of temporary files for unprocessed words}
- type
- filecode = 1..nfiles;
- var
- RefFile: array[filecode] of fileref;
- {local files used for auxiliary storage of words from phase 1 to phase 2:
- Normally, a separate file exist for each initial letter; this version uses
- nfiles files due to operating system constraints.}
-
-
- procedure SplitWords;
- var
- outcount: array[filecode] of integer; {counter for word files}
- code: filecode; {Into which file does word go?}
- {The remainder of the local declarations are unchanged.}
-
- begin {procedure SplitWords}
- Initialize; {sets up files, hash table, constants}
- GetWord(w); {Obtain a single word from InText.}
- while not endinput do
- begin
- x := HashAddress(w);
- if w <> hash[x] then
- begin {Not in hash table; put into RefFile.}
- code := FindFile( w[1] );
- outcount[code] := outcount[code] + 1;
- with RefFile[code]^ do {Update the storage file.}
- begin
- wd := w;
- pg := pagecount
- end;
- Put(RefFile[code])
- end;
- GetWord(w)
- end;
- Conclude {writes word counts to output}
- end; {procedure SplitWords}
-
-
- function FindFile( ch: letter): filecode;
- {Uses binary decision tree to select one of nfiles = 8 files depending
- on the letter ch. }
- begin {function FindFile}
- if ch < 'M' then
- if ch < 'E' then
- if ch < 'C' then FindFile := 1
- else FindFile := 2
- else if ch < 'H' then FindFile := 3
- else FindFile := 4
- else if ch < 'S' then
- if ch < 'P' then FindFile := 5
- else FindFile := 6
- else if ch < 'T' then FindFile := 7
- else FindFile := 8
- end; {function FindFile}
-
-
- for ch := A to Z do
- begin
- rewrite( RefFile[ch] );
- outcount[ch] := 0
- end;
-
-
- for i := 1 to nfiles do
- begin
- rewrite( RefFile[i] );
- outcount[i] := 0
- end;
-
-
- procedure Conclude;
- {Writes out counts of various word lists. For some systems, it is
- necessary to close files, which should be done here.}
- var
- i: integer; {loop index}
- begin {procedure Conclude}
- writeln('The total number of words read in is ', wordcount:7);
- writeln;
- writeln('The number of words to process further in the next stage,');
- writeln('in each temporary file, is below.');
- writeln('a - b c - d e - g h - l m - o p - r s t - z');
- for i := 1 to nfiles do
- write(outcount[i]:7);
- writeln;
- writeln
- end; {procedure Conclude}
-
-
-
-
- {Section 11.4 Phase 2: Classifying the Words}
-
-
- {Exercise E3}
-
- procedure ClassifyWords;
- {The references stored in the temporary files are placed into a list,
- the words from the file InIndex are compared with the words in the list
- as they are merged into the file NewIndex.}
- type
- wordtype = (hash, count, index); {ways to process a word}
- pointref = ^reflist;
- reflist = record {list of page references}
- pg: integer;
- next: pointref
- end;
- pointer = ^node;
- node = record {node of list storing words}
- wd: word;
- kind: wordtype;
- ct: integer;
- ref: pointref;
- next: pointer
- end;
- {Cannot use varying types as wordtype is not known upon first reading.}
- list = record
- head: pointer
- end;
- var
- code: filecode; {index used to loop through temporary files}
- NewList: list;
-
-
- procedure Merge(p, q: pointer; var r: pointer);
- {Merges two sorted lists into one, that will begin at r;
- requires that both lists be nonempty. This version is modified
- slightly from the version listed in the text due to a difference
- in the data structures used.}
- var
- s: pointer; {always points to last node of sorted list}
- begin {procedure Merge}
- if (p = nil) or (q = nil) then
- writeln('Merge called with empty list(s).');
- if p^.wd <= q^.wd then {First find the head, r, of the merged list.}
- begin {Note the change from .info.key to .wd. }
- r := p;
- p := p^.next
- end
- else begin
- r := q;
- q := q^.next
- end;
- s := r; {s always points to the last entry of the merged list.}
- while (p <> nil) and (q <> nil) do
- if p^.wd <= q^.wd then {Note the change from .info.key to .wd. }
- begin
- s^.next := p; {Attach the node with the smaller key to the sorted list.}
- s := p;
- p := p^.next {Advance to the next unmerged node.}
- end
- else begin
- s^.next := q;
- s := q;
- q := q^.next
- end;
- if p = nil then {After one list is exhausted, attach the remainder of the other one.}
- s^.next := q
- else
- s^.next := p
- end; {procedure Merge}
-
- {Include the procedures MergeSort and Divide from Chapter 7 here.}
-
- procedure MainMergeSort(var L: list);
- { Main procedure to invoke recursive procedure MergeSort, as listed
- in the text. }
- begin
- MergeSort(L.head)
- end;
- procedure InitializeList(var L: list);
- begin
- L.head := nil
- end;
-
-
- procedure Insert(x: reference; var L: list);
- { Inserts the reference into the hash table of references. }
- var
- done: Boolean;
- p: pointer;
- q: pointref;
- begin {procedure Insert}
- done := false;
- p := L.head;
- while (p <> nil) and (not done) do
- begin
- if p^.wd = x.wd then {The word is already in the list, update its node.}
- begin
- p^.ct := p^.ct + 1;
- new(q);
- q^.pg := x.pg;
- q^.next := p^.ref;
- p^.ref := q;
- done := true
- end
- else
- p := p^.next
- end;
- if not done then
- begin {Insert a new entry if the word is not already in the table.}
- p := nil;
- new(p);
- p^.wd := x.wd;
- p^.ct := 1; {Initialize the count and the page references.}
- new(q);
- q^.pg := x.pg;
- q^.next := nil;
- p^.ref := q;
- p^.next := L.head;
- L.head := p
- end
- end; {procedure Insert}
-
-
- procedure Append(p: pointer; var L: list);
- { Append the nodes pointed to by p to the end of the list L. }
- var
- q: pointer;
- begin {procedure Append}
- q := L.head;
- if q = nil then {The list is empty, make p the beginning of the list.}
- L.head := p
- else begin
- while q^.next <> nil do {Find the end of the list.}
- q := q^.next;
- q^.next := p
- end
- end; {procedure Append}
-
-
- procedure Place(var F: fileref; var L: list);
- { Places the words in file F into the list of words. }
- var
- x: reference;
- temp: pointer;
- begin {procedure Place}
- temp := L.head; {Save the words that have already been processed.}
- L.head := nil;
- reset(F);
- while not eof(F) do {Insert all the words into the list.}
- begin
- x := F^;
- get(F);
- Insert(x, L)
- end;
- MainMergeSort(L); {Sort the list and append the other work to the list.}
- Append(temp, L)
- end; {procedure Place}
-
-
- procedure RemoveFirst(var p: pointer; var L: list);
- { Removes the first node from the list L. }
- begin {procedure RemoveFirst}
- p := L.head;
- if not Empty(L) then {standard list operation}
- begin
- L.head := L.head^.next;
- p^.next := nil
- end
- end; {procedure RemoveFirst}
-
-
- procedure ReadReference(var r: pointer; var F: text);
- { Reads reference from the file F. }
- var
- k: char;
- begin {procedure ReadReference}
- if eof(F) then
- r := nil
- else begin
- ReadWord(F, r^.wd);
- readln(F, k);
- case k of
- 'F', 'f': r^.kind := hash;
- 'C', 'c': begin
- r^.kind := count;
- r^.ct := 0
- end;
- 'I', 'i': begin
- r^.kind := index;
- r^.ref := nil
- end
- end
- end
- end; {procedure ReadReference}
-
-
- procedure WriteReference(p: pointer; var NewIndex, NewHashFile: text);
- {writes a word to the appropriate file with the appropriate associated information}
- var
- q: pointref;
- begin {procedure WriteReference}
- with p^ do
- case kind of
- hash: begin {Write the word to the hash file.}
- WriteWord(NewHashFile, wd);
- writeln(NewHashFile)
- end;
- count:begin {Write the word and its frequency to the new index file.}
- WriteWord(NewIndex, wd);
- write(NewIndex, 'c');
- writeln(NewIndex, ct:5)
- end;
- index:begin {Write the word and its page numbers to the new index file.}
- WriteWord(NewIndex, wd);
- write(NewIndex, 'i');
- q := ref;
- while q <> nil do
- begin
- write(NewIndex, q^.pg:5);
- q := q^.next
- end;
- writeln(NewIndex)
- end
- end
- end; {procedure WriteReference}
-
-
- procedure GetWordType(p: pointer);
- { Request the user to specify the category of the given word. }
- var
- response: char;
- begin {procedure GetWordType}
- with p^ do
- begin
- repeat
- WriteWord(output, wd);
- write(' is (F, C, I)?');
- readln(response)
- until response in ['F', 'f', 'C', 'c', 'I', 'i'];
- case response of
- 'F', 'f': kind := hash;
- 'C', 'c': kind := count;
- 'I', 'i': kind := index
- end
- end
- end; {procedure GetWordType}
-
-
- procedure Delete(var p: pointer);
- { Delete the word p^ as well as all of the page references associated with it. }
- var
- q, r: pointref;
- begin {procedure Delete}
- if p^.kind = index then
- begin
- q := p^.ref;
- while q <> nil do
- begin {Dispose the list of page references associated with the word.}
- r := q^.next;
- dispose(q);
- p^.ref := r;
- q := r
- end
- end;
- dispose(p) {dispose the node itself}
- end; {procedure Delete}
-
-
- procedure CompareAndMerge(var L: list; var InIndex, NewIndex, NewHashFile: text);
- { Compare the list L with InIndex, merge if was found. }
- var
- p, r: pointer;
- begin {procedure CompareAndMerge}
- RemoveFirst(p, L);
- new(r);
- ReadReference(r, InIndex);
- while p <> nil do
- if r = nil then
- begin
- GetWordType(p);
- WriteReference(p, NewIndex, NewHashFile);
- Delete(p); {Remove reference list and node from memory.}
- RemoveFirst(p, L)
- end
- if p^.wd < r^.wd then
- begin
- GetWordType(p);
- WriteReference(p, NewIndex, NewHashFile);
- Delete(p); {Remove reference list and node from memory.}
- RemoveFirst(p, L)
- end
- else if p^.wd > r^.wd then {Do not write a word that is not used to NewIndex.}
- ReadReference(r, InIndex)
- else begin {p^.wd = r^.wd}
- p^.kind := r^.kind;
- WriteReference(p, NewIndex, NewHashFile);
- Delete(p); {Remove reference list and node from memory.}
- RemoveFirst(p, L);
- ReadReference(r, InIndex)
- end
- end; {procedure CompareAndMerge}
-
-
- begin {procedure ClassifyWords}
- reset(InIndex);
- rewrite(NewIndex);
- rewrite(NewHashFile);
- InitializeList(NewList);
- for code := nfiles downto 1 do {Place the words from each file into the list.}
- Place(RefFile[code], NewList);
- if not Empty(NewList) then {standard list operation}
- CompareAndMerge(NewList, InIndex, NewIndex, NewHashFile)
- end; {procedure ClassifyWords}
-
-
-
-
-
- {Exercise E4}
-
- procedure ClassifyWords;
- {The references stored in the temporary files are placed in a new hash table,
- the words from the file InIndex are compared with the words in the new table
- as they are merged into the file NewIndex.}
- const
- RefTableSize = 3023; {Size of the hash table to temporarily store words.}
- RefTableMax = 3022;
- type
- wordtype = (hash, count, index); {ways to process a word}
- pointref = ^reflist;
- reflist = record {list of page references}
- pg: integer;
- next: pointref
- end;
- pointer = ^node;
- node = record {Node of list storing words.}
- wd: word;
- kind: wordtype;
- ct: integer;
- ref: pointref;
- next: pointer
- end;
- {Cannot use varying types as wordtype is not known upon first reading.}
- list = record
- head: pointer
- end;
- RefHashTable = array[0..RefTableMax] of list;
- var
- code: filecode; {index used to loop through temporary files}
- RefTable: RefHashTable; {stores all references in memory}
- NewList: list;
-
-
- procedure InitializeTable(var RefTable: RefHashTable);
- var i: integer;
- begin {procedure InitializeTable}
- for i := 0 to RefTableMax do
- RefTable[i].head := nil
- end; {procedure InitializeTable}
-
-
- function RefTableAddress(x: reference): integer;
- { Returns hashed address of reference. }
- var
- i, h: integer;
- begin {function Hash}
- h := 0;
- with x do
- for i := 1 to maxwd do
- h := h + ord(wd[i]);
- RefTableAddress := h mod RefTableSize
- end; {function Hash}
-
-
- procedure Insert(x: reference; pos: integer; var RefTable: RefHashTable);
- {inserts the reference into the hash table of references}
- var
- done: Boolean;
- p: pointer;
- q: pointref;
- begin {procedure Insert}
- done := false;
- p := RefTable[pos].head;
- while (p <> nil) and (not done) do
- begin {Search for the word, update the reference if it is found.}
- if p^.wd = x.wd then begin
- p^.ct := p^.ct + 1; {Update count and page reference.}
- new(q);
- q^.pg := x.pg;
- q^.next := p^.ref;
- p^.ref := q;
- done := true
- end
- else
- p := p^.next
- end;
- if not done then begin {Insert a new entry if the word is not in the table.}
- p := nil;
- new(p);
- p^.wd := x.wd;
- p^.ct := 1; {Initialize the count and the page references.}
- new(q);
- q^.pg := x.pg;
- q^.next := nil;
- p^.ref := q;
- p^.next := RefTable[pos].head;
- RefTable[pos].head := p
- end
- end; {procedure Insert}
-
-
- procedure Place(var F: fileref; var RefTable: RefHashTable);
- {places the words in file F into the reference table}
- var
- x: reference;
- begin {procedure Place}
- reset(F);
- while not eof(F) do begin
- x := F^;
- get(F);
- Insert(x, RefTableAddress(x), RefTable)
- end
- end; {procedure Place}
-
-
- procedure LinkEntries(var RefTable: RefHashTable; var NewList: list);
- { The references in the hashed table are combined into the list NewList. }
- var
- i: integer;
- p: pointer;
- begin {procedure LinkEntries}
- i := 0;
- while (i < RefTableMax) and Empty(RefTable[i]) do {Find the first entry.}
- i := i + 1;
- if i <= RefTableMax then
- begin
- NewList.head := RefTable[i].head; {Initialize the list to point to the first entry.}
- p := RefTable[i].head;
- if p <> nil then {Find the end of the first chain.}
- while p^.next <> nil do
- p := p^.next;
- while (i < RefTableMax) do {Link remaining entries into the list.}
- begin
- i := i + 1;
- if not Empty(RefTable[i]) then {standard list procedure}
- begin
- p^.next := RefTable[i].head;
- while p^.next <> nil do {Move p to the end of the chain.}
- p := p^.next
- end
- end
- end
- else
- NewList.head := nil
- end; {procedure LinkEntries}
-
-
- {See the previous exercise for the following procedures.}
- procedure RemoveFirst(var p: pointer; var L: list);
- procedure ReadReference(var r: pointer; var F: text);
- procedure WriteReference(p: pointer; var NewIndex, NewHashFile: text);
- procedure GetWordType(p: pointer);
- procedure Delete(var p: pointer);
- procedure CompareAndMerge(var L: list; var InIndex, NewIndex, NewHashFile: text);
- procedure Merge(p, q: pointer; var r: pointer);
- {Include the procedures MergeSort and Divide from Chapter 7 here.}
- procedure MainMergeSort(var L: list);
-
-
- begin {procedure ClassifyWords}
- reset(InIndex);
- rewrite(NewIndex);
- rewrite(NewHashFile);
- InitializeTable(RefTable);
- for code := 1 to nfiles do {Place all the words into the reference hash table.}
- Place(RefFile[code], RefTable);
- LinkEntries(RefTable, NewList); {Link the entries of the table into a list.}
- MainMergeSort(NewList);
- if not Empty(NewList) then
- CompareAndMerge(NewList, InIndex, NewIndex, NewHashFile)
- end; {procedure ClassifyWords}
-
-