home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
ftp.barnyard.co.uk
/
2015.02.ftp.barnyard.co.uk.tar
/
ftp.barnyard.co.uk
/
cpm
/
walnut-creek-CDROM
/
MBUG
/
MBUG043.ARC
/
CMP-DOC.PAS
< prev
next >
Wrap
Pascal/Delphi Source File
|
1979-12-31
|
8KB
|
186 lines
PROGRAM cmp_to_doc (input, dictionary, document, output);
{ Convert Word+ dictionary to text file format. }
{ Reads the compressed dictionary MAINDICT.CMP and writes every word that
  sorts strictly between two bounds typed in by the user to MAINDICT.DOC,
  one word per line. }
{ The format of a Word+ dictionary is as follows :-
Word+_dictionary ::= non_data_part data_part final_part
For reference only the non_data_part consists of 128 bytes apparently as follows :-
non_data_part ::= 3_unspecified_byte index 47_unspecified_byte
(128 bytes)
index ::= 26_index_reference
index_reference ::= logical_block_number logical_sector_offset byte_offset
(3 bytes) (0 ..) (0 .. 128) (0 .. 128)
data_part ::= [ compressed_word ]
(The words in the data part are contained in alphabetical order.)
compressed_word ::= zero_bit explicit_count [ character ] last_letter |
(n bytes) zero_bit compressed_count letter [ character ] last_letter |
one_bit compressed_count letter
explicit_count ::= 2_zero_bit 5_bit
(7 bits)
These values indicate how many of the initial letters of the previous
word to use for the new word, a value of 26 is not allowed.
compressed_count ::= zero_bit one_bit | one_bit zero_bit | one_bit one_bit
(2 bits)
These values indicate,
'use the same first 3 letters as for the previous word',
'use the same first 4 letters as for the previous word', and
'use the same first 5 letters as for the previous word' respectively.
last_letter ::= one_bit 2_zero_bit letter
character ::= zero_bit 2_zero_bit letter
letter ::= 5_bit
(0 .. 25)
These values represent the letters in the word 'A' .. 'Z'.
final_part ::= 1Ah [ 00h ] [ 1Ah ]
The final part consists of an eof byte, the record is then padded
out with nulls, any subsequent records consist of ^Z's. (^Z = eof = 1Ah = 26d)
Re-phrasing all this in English, the data_part consists of a sequence of
words arranged in alphabetical order. Each word occupies one or more
bytes. The last byte of a word is denoted by having bit 7 set. To
conserve space only those letters of a word which differ from those of
the previous word are specified. Thus a count of how many of the
initial letters the word has in common with the previous word is
stored, this is the first entry for a word. Normally this occupies the
low five bits (bits 0 - 4) of the first byte, however for values of 3, 4 or 5 it
only occupies bits 5 and 6 of the first byte. Thus
letters in common | first byte
-------------------+------------
0 | *000 0000
1 | *000 0001
2 | *000 0010
3 | *01- ----
4 | *10- ----
5 | *11- ----
6 | *000 0110
7 | *000 0111
. . .
. . .
31 | *001 1111
* - normally 0, only 1 if this is the last byte of the word
(due to lexical ordering this can only be if 3, 4 or
5 letters are in common (see below)).
Note : 26 letters in common is not allowed as this value
indicates the end of the file.
The letters of a word are represented by a five bit code, A = 00000,
.., Z = 11001 (ie. 0 .. 25). In the case of 3, 4 or 5 letters in common
the first letter can be placed in the first byte, otherwise they will
occupy bits 0 - 4 of successive bytes, bits 5 and 6 being zero. }
CONST
{ Carriage return; written at the end of each progress line in place of a newline. }
cr = ^M;
{ Byte value (1Ah) which, met at the start of a word, marks the end of the data part.
  NOTE : within this program the name hides the standard eof function. }
eof = ^Z;
{ Size of dictionary buffer in records (one record = $80 bytes). }
buff_rec = 8;
{ Size of the dictionary buffer in bytes, and the index of its last byte.
  Both are written out as literals and must be kept in step with buff_rec by hand. }
buff_bytes = { buff_rec * $80 = } $400;
buff_bytes_minus_1 = { buff_bytes - 1 = } $3FF;
TYPE
{ String type used for the word being built and for the two bounds. }
str = string [80];
VAR
{ Compressed Word+ dictionary; untyped, so it is read a record at a time with blockread. }
dictionary : file;
{ Output text file, one extracted word per line. }
document : text;
{ Holds the records most recently read from the dictionary. }
buffer : array [0 .. buff_bytes_minus_1] of byte;
{ Index into buffer of the next byte to be decoded. }
buff_pt : 0 .. buff_bytes;
{ Byte currently being decoded. }
next_byte : byte;
{ word holds word of dictionary being extracted; first and last hold the
  bounds entered by the user. }
word, first, last : str;
{ count     : number of letters to keep from previous word.
  processed : amount of data processed in kilobytes.
  dummy     : receives ioresult after the unchecked blockread; its value is never examined. }
count, processed, dummy : integer;
{ Number of words decoded so far, and number of those written to document. }
words, copied : real;
{ compressed  : the first byte of the current word uses the 2-bit count form.
  new_word    : the next byte read starts a new word.
  end_of_file : the eof byte has been met at the start of a word. }
compressed, new_word, end_of_file : boolean;
FUNCTION bits (i, l, h : integer) : integer;
{ Extract the bit field of i that runs from bit l (lowest) to bit h
  (highest), both inclusive, and return it shifted down so that bit l
  ends up in bit 0. }
VAR
  width : integer;
BEGIN
  { Number of bits in the requested field. }
  width := h - l + 1;
  { (not 0) shl width has zeros in its low width bits; inverting that
    gives a mask of exactly width ones. }
  bits := (i shr l) and not ((not 0) shl width)
END;
PROCEDURE upper_case (VAR s : str);
{ Fold every character of s to upper case, in place. }
VAR
  idx : integer;
BEGIN
  { Each position is converted independently, so the direction of the
    scan does not matter; an empty string is left untouched. }
  FOR idx := length (s) DOWNTO 1 DO
    s [idx] := upcase (s [idx])
END;
{ Main program : ask for the two bounds, then decode MAINDICT.CMP one
  buffer at a time, writing every word that sorts strictly between the
  bounds to MAINDICT.DOC. }
BEGIN
  writeln ('Convert Word+ dictionary to a document format file.');

  { The bounds are folded to upper case because decoded words are built
    only from chr (ord ('A') + 5-bit code). }
  write ('Enter first word within which to copy, eg. AAA : ');
  readln (first);
  upper_case (first);
  write ('Enter last word within which to copy, eg. ZZZ : ');
  readln (last);
  upper_case (last);
  writeln ('Extracting all words between ', first, ' and ', last, '.');

  assign (dictionary, 'MAINDICT.CMP');
  reset (dictionary);
  assign (document, 'MAINDICT.DOC');
  rewrite (document);

  { Skip the non-data part (1 record = 128 bytes). }
  blockread (dictionary, buffer, 1);

  word := '';
  new_word := true;
  end_of_file := false;
  words := 0;
  copied := 0;
  processed := 0;

  { end_of_file starts out false, so the buffer is always filled and
    scanned at least once. }
  REPEAT
    { Refill the buffer.  I/O checking is switched off because the last
      refill may ask for more records than remain; the error code is
      fetched into dummy and discarded. }
    {$I-}
    blockread (dictionary, buffer, buff_rec);
    dummy := ioresult;
    {$I+}

    buff_pt := 0;
    REPEAT
      next_byte := buffer [buff_pt];
      buff_pt := buff_pt + 1;

      IF not new_word THEN
        { Continuation byte : bits 0 - 4 hold the next letter. }
        word := word + chr (ord ('A') + bits (next_byte, 0, 4))
      ELSE
        BEGIN
          { First byte of a word; the eof value here ends the data part. }
          end_of_file := next_byte = ord (eof);
          IF not end_of_file THEN
            BEGIN
              compressed := bits (next_byte, 5, 6) <> 0;
              IF compressed THEN
                BEGIN
                  { Bits 5 - 6 encode a count of 3, 4 or 5 and bits 0 - 4
                    already carry the first new letter. }
                  count := bits (next_byte, 5, 6) + 2;
                  delete (word, count + 1, length (word) - count);
                  word := word + chr (ord ('A') + bits (next_byte, 0, 4))
                END
              ELSE
                BEGIN
                  { Bits 0 - 4 are the count itself; letters follow in
                    later bytes. }
                  count := bits (next_byte, 0, 4);
                  delete (word, count + 1, length (word) - count)
                END
            END
        END;

      { Bit 7 set marks the last byte of a word, so the next byte read
        starts a new one. }
      new_word := (next_byte and $80) <> 0;
      IF new_word THEN
        BEGIN
          words := words + 1;
          { Both comparisons are strict : a word equal to either bound
            is not copied. }
          IF (first < word) and (word < last) THEN
            BEGIN
              copied := copied + 1;
              writeln (document, word)
            END
        END
    UNTIL end_of_file or (buff_pt >= buff_bytes);

    { One full buffer is 1K; the buffer in which eof was met is not counted. }
    IF not end_of_file THEN
      processed := processed + 1;
    write (processed : 3, 'k, ');
    write (words : 5 : 0, ' words processed, ');
    write (copied : 5 : 0, ' words copied.', cr)
  UNTIL end_of_file;

  close (dictionary);
  close (document);
  writeln
END.