home *** CD-ROM | disk | FTP | other *** search
- /*
- ** Really mindless text compression program.
- **
- ** The intended use of the compression is to be able to fit
- ** E-text books (Gutenberg, OBI, Wiretap) into a smaller space
- ** so that they can be read on a machine with very limited store,
- ** such as a handheld computer. One implication of this is that
- ** the form of compression does not need to be completely lossless
- ** as long as the result is still readable.
- **
- ** This program assumes a linear address space at least large enough
- ** to load the whole text. One implication of this is that it won't
- ** be usable as a simple DOS program, but the alternative was to spend
- ** more time on this than I felt I could afford.
- */
- #include <stdlib.h>
- #include <string.h>
- #include <stdio.h>
- #include <assert.h>
-
- #define FILTER /* don't preprocess text if this is */
- /* not defined */
- /* this should give LOSSLESS compression */
- /* useful mainly for testing purposes */
-
- #define NEARLY 10 /* less than this is "nearly" unused */
-
- #define GRAM 2 /* size of strings to scan for */
- #define GRAMS 4000 /* max size of dictionary of scanned strings */
-
- #define LLIM 250 /* max size of uncompressed line in file */
-
- #define FALSE 0
- #define TRUE 1
-
- #define NL '\n'
- #define CR '\r'
-
- static unsigned long character_count[256];
-
- /*
- ** definition
- **
- ** This is a table of definitions for the 256 possible character values.
- ** Note that character 0 is never used for anything, in order to avoid
- ** C string termination problems.
- **
- ** For other characters, the table entry is a pointer to a heap string
- ** which is what that character would expand to if it appeared in the
- ** compressed string.
- **
- ** Note that this is NOT a recursive concept; the definition is a flattened
- ** string of characters. This, if the character "q" only appears before
- ** "u" and is chosen as symbol "x", then the definition of "x" will be
- ** "qu" and "q" itself may then be redefined as something else later on.
- */
- static unsigned char *definition[256];
-
- /*
- ** Text storage
- **
- ** "original_size" is the size, in characters, of the original file.
- ** "text" is a pointer to a heap area of a little bigger than this size,
- ** so that it can be terminated by a couple of special characters and
- ** a zero if required.
- ** "current_size" is the number of characters in "text" which are
- ** currently valid.
- */
- static size_t original_size;
- static size_t current_size;
- static unsigned char *text;
-
- /*
- ** abandon
- **
- ** Single point of exit from the program in the event of problems.
- */
- static void abandon ( const char *s )
- {
- fprintf(stderr, "Abandon: %s\n", s);
- exit(EXIT_FAILURE);
- }
-
- /*
- ** process_char
- **
- ** Called once for each character in the input file. This is a state
- ** machine which weeds out things like redundant white space and
- ** unwanted hyphenation.
- */
- static unsigned char *pp; /* pointer through which process_char writes */
-
- #if defined(FILTER)
- static void process_char ( unsigned char c )
- {
- static enum state {
- END_PARA, /* have just seen end of a paragraph */
- END_LINE, /* have just seen end of a line */
- IN_WORD, /* we are inside a word */
- IN_LINE, /* inside a line, but not inside a word */
- PEND_SPACE, /* in white space */
- PEND_HYPHEN, /* just seen a hyphen, may be at end of word */
- PEND_HYPHEN_NL /* floop-<NL> seen */
- } state = END_PARA;
-
- switch (state) {
-
- line_ended:
- case END_LINE:
- if (c == NL) {
- /*
- ** If we see a second newline after the end of an
- ** ordinary line, we have come to the end of the
- ** paragraph. Issue the second newline and move
- ** into the END_PARA state.
- */
- *pp++ = c;
- state = END_PARA;
- } else if (c == ' ') {
- /*
- ** If we see a leading space after a newline, we have
- ** see the _other_ kind of paragraph-starting convention,
- ** and we must change it to the first kind.
- */
- *pp++ = NL;
- state = END_PARA;
- } else if (isalpha(c)) {
- *pp++ = c;
- state = IN_WORD;
- } else {
- *pp++ = c;
- state = IN_LINE;
- }
- break;
-
- case END_PARA:
- if (c == ' ') {
- /*
- ** Ignore spaces at this point; we know we are at the end of
- ** a paragraph already.
- */
- } else if (c == NL) {
- /*
- ** Ignore extra blank lines
- */
- } else {
- /*
- ** Something interesting: consider in line context
- */
- goto in_line;
- }
- break;
-
- in_line:
- case IN_LINE:
- state = IN_LINE;
- if (c == ' ') {
- state = PEND_SPACE;
- } else if (c == NL) {
- *pp++ = NL;
- state = END_LINE;
- } else if (isalpha(c)) {
- *pp++ = c;
- state = IN_WORD;
- } else {
- *pp++ = c;
- }
- break;
-
- case IN_WORD:
- if (isalpha(c)) {
- *pp++ = c;
- } else if (c == '-') {
- state = PEND_HYPHEN;
- } else {
- goto in_line;
- }
- break;
-
- case PEND_HYPHEN:
- if (isalpha(c)) { /* hyphen within a word */
- *pp++ = '-';
- *pp++ = c;
- state = IN_WORD;
- } else if (c == ' ') {
- /* absorb spaces after in-word hyphens */
- } else if (c == NL) {
- state = PEND_HYPHEN_NL;
- } else {
- *pp++ = '-'; /* flush hyphen */
- goto in_line;
- }
- break;
-
- case PEND_HYPHEN_NL:
- if (isalpha(c)) {
- /*
- ** We have seen a <alpha><-><wsp><NL><wsp><alpha>
- ** sequence in a word. Drop the middle of it and
- ** join the two lines together, which will also
- ** join the two parts of the word together.
- */
- *pp++ = c;
- state = IN_WORD;
- } else if (c == ' ') {
- /*
- ** Absorb spaces at beginning of lines like these
- */
- } else {
- *pp++ = '-';
- *pp++ = NL;
- state = END_LINE;
- goto line_ended;
- }
- break;
-
- case PEND_SPACE:
- if (c == ' ') {
- /* ignore multiple white space */
- } else if (c == NL) {
- /*
- ** White space before line end: take as line end.
- */
- *pp++ = NL;
- state = END_LINE;
- } else {
- *pp++ = ' ';
- goto in_line;
- }
- break;
-
- default:
- fprintf(stderr, "state: %d character: %d\n", state, c);
- abandon("process state machine failure");
- }
- }
- #endif /* FILTER */
-
- static load_file(FILE *fin)
- {
- int c;
-
- /*
- ** Work out the size of the file by reading it all in, character
- ** at a time. Slow but sure.
- */
- original_size = 0;
- for (;;) {
- int c = fgetc(fin);
- if (c == EOF) break;
- original_size++;
- }
- printf("original size: %ld characters\n", (long)original_size);
-
- /*
- ** Allocate some space for the file...
- */
- text = malloc(original_size)+10;
- if (!text) abandon("could not allocate buffer for text");
-
- /*
- ** Reset the file pointer to the start and load the file in.
- */
- fseek(fin, 0, SEEK_SET);
- pp = text;
- for (;;) {
- int c = fgetc(fin);
- if (c == EOF) break;
- #if defined(FILTER)
- process_char(c);
- #else
- *pp++ = c;
- #endif
- }
-
- /*
- ** Purge state machine if we have one.
- */
- #if defined(FILTER)
- process_char(NL);
- process_char(NL);
- process_char(NL);
- #endif
- current_size = pp - text;
- }
-
- /*
- ** pchar
- **
- ** Prints a character in such a way that it is always readable,
- ** if necessary hexifying it.
- */
- static void pchar (int n)
- {
- if (isprint(n)) {
- putchar(n);
- } else {
- printf("<%02X>", n);
- }
- }
-
- /*
- ** pstring
- **
- ** Prints a given string "safely" through pchar.
- */
- static void pstring ( const unsigned char *s )
- {
- while (*s) pchar(*s++);
- }
-
- static void show_nearlies(void)
- {
- int nearlies;
- int c;
- for (c=1, nearlies=0; c<256; c++) {
- int count = character_count[c];
- if (count && (count<NEARLY)) nearlies++;
- }
- printf("nearly empty character slots: %d\n", nearlies);
- if (nearlies) {
- printf(" ");
- for (c=1; c<256; c++) {
- if (character_count[c] == 0) continue;
- if (character_count[c] < NEARLY) pchar(c);
- }
- putchar(NL);
- }
- }
-
- static int count_characters(void)
- {
- int c, empties;
- unsigned char *p = text;
-
- /*
- ** First, zero the count array.
- */
- for (c=1; c<256; c++)
- character_count[c] = 0;
-
- /*
- ** Make sure we don't by accident use anything as a symbol
- ** that the Psion IOREAD function will regard as significant.
- **
- ** (1000 is large enough to prevent these turning up as "nearlies").
- */
- character_count[26] = 1000;
- character_count[NL] = 1000;
- character_count[CR] = 1000;
-
- while (p != (text+current_size)) {
- character_count[*p]++;
- p++;
- }
-
- for (c=1, empties=0; c<256; c++) {
- int count = character_count[c];
- if (count == 0) empties++;
- }
-
- printf("empty character slots: %d\n", empties);
-
- return empties;
- }
-
- struct gram {
- char ch[GRAM];
- unsigned long nrefs;
- };
-
- static struct gram grams[GRAMS];
- static int cur_grams;
- static unsigned long dropped_grams;
-
- static void pgram ( const struct gram *g )
- {
- int i;
- for (i=0; i<GRAM; i++) pchar(g->ch[i]);
- }
-
- static int sort_nrefs ( const void *L, const void *R )
- {
- const struct gram *LL = L;
- const struct gram *RR = R;
- if (LL->nrefs < RR->nrefs) return -1;
- if (LL->nrefs > RR->nrefs) return 1;
- return 0;
- }
-
- static int sort_grams ( const void *L, const void *R )
- {
- const struct gram *LL = L;
- const struct gram *RR = R;
- return strncmp(LL->ch, RR->ch, GRAM);
- }
-
- static void count_gram ( const char *g )
- {
- struct gram gram;
- struct gram *q;
- memcpy(gram.ch, g, GRAM);
- q = bsearch(&gram, grams, cur_grams, sizeof(struct gram), sort_grams);
- if (q) {
- q->nrefs++;
- } else if (cur_grams < GRAMS) {
- /*
- ** A gram we haven't seen before, but which we do have room
- ** for in the table.
- */
- gram.nrefs = 1;
- #if 0
- /*
- ** add this gram in at the end of the table
- */
- grams[cur_grams] = gram;
- #else
- /*
- ** Add this gram in at the start of the table.
- ** This is likely to be statistically close to where it
- ** will be going, particularly if it is a late arrival.
- */
- if (cur_grams) memmove(grams+1, grams, cur_grams*sizeof(struct gram));
- grams[0] = gram;
- #endif
- cur_grams++;
- qsort(grams, cur_grams, sizeof(struct gram), sort_grams);
- } else {
- dropped_grams++;
- }
- }
-
- /*
- ** valid_gram
- **
- ** True if the string is an allowed n-gram. False if it
- ** contains NL and is therefore prohibited.
- */
- static int valid_gram ( const unsigned char *p )
- {
- int i;
- for (i=0; i<GRAM; i++)
- if (*p++ == NL) return FALSE;
- return TRUE;
- }
-
- /*
- ** count_grams
- **
- ** Run through the current text counting each n-gram
- ** as we go.
- */
- static void count_grams(void)
- {
- char *p = text;
- char *plim = text+current_size-GRAM+1;
- dropped_grams = 0;
- while (p != plim) {
- if (valid_gram(p))
- count_gram(p);
- p++;
- }
- }
-
- static void reset_grams(void)
- {
- int cut = cur_grams/100;
- int i;
- if (cut) {
- memmove(grams, grams+cut, (cur_grams-cut)*sizeof(struct gram));
- cur_grams -= cut;
- }
- for (i=0; i<cur_grams; i++)
- grams[i].nrefs = 0;
- }
-
- static void replace_gram(int gramno, struct gram *g)
- {
- char *from = text;
- char *to = text;
- char *org_limit = text + current_size;
- char *limit = text+current_size-GRAM+1;
- current_size = 0;
- while (from < limit) {
- if (!strncmp(from, g->ch, GRAM)) {
- *to++ = gramno;
- from += GRAM;
- } else {
- *to++ = *from++;
- }
- current_size++;
- }
- while (from != org_limit) {
- *to++ = *from++;
- current_size++;
- }
- }
-
- /*
- ** replace_string
- **
- ** Replace an arbitrary string with a shorter string replacement
- ** throughout the current version of the text.
- */
- static int replace_string ( const unsigned char *s,
- const unsigned char *r )
- {
- int n = 0;
- unsigned char *beg = text;
- int rlen = strlen(r);
- int cut = strlen(s)-strlen(r); /* number of characters to lose */
- text[current_size] = 0; /* terminate current text */
- while (beg = strstr(beg, s)) {
- memcpy(beg, r, rlen); /* perform replacement */
- n++;
- if (cut) memmove(beg+rlen, beg+rlen+cut, strlen(beg+rlen+cut)+1);
- current_size -= cut;
- }
- return n;
- }
-
- static void expand_gram ( unsigned char *buf, struct gram *g )
- {
- int i;
- *buf = 0;
- for (i=0; i<GRAM; i++)
- strcat(buf, definition[g->ch[i]]);
- }
-
- static void compute_grams(void)
- {
- unsigned char ch;
- for (ch=255; ch>=1; ch--) {
- struct gram *g;
- if (!character_count[ch]) {
- /*
- ** We can use this character value as a gram definition.
- */
- unsigned char x[100];
-
- printf("size now %ld (down %2.1f%%) ",
- (long)current_size,
- 100.0*(original_size-current_size)/(double)original_size);
- fflush(stdout);
-
- count_grams();
-
- qsort(grams, cur_grams, sizeof(struct gram), sort_nrefs);
- g = &grams[cur_grams-1];
-
- printf("top %d-gram is \"", GRAM);
- pgram(g);
- printf("\" (ref %ld, drop %lu)\n", g->nrefs, dropped_grams);
-
- replace_gram(ch, g);
-
- expand_gram(x, g);
-
- reset_grams();
- cur_grams--; /* we no longer have this one */
-
- free(definition[ch]);
- definition[ch] = strdup(x);
- printf(" character value %d now expands to \"", ch);
- pstring(definition[ch]);
- putchar('"');
- putchar(NL);
- }
- }
- }
-
- static void dump_table ( FILE *fout )
- {
- int ch;
- for (ch=0; ch<=255; ch++) {
- const unsigned char *def = definition[ch];
- if (strlen(def)==1 && *def == ch) {
- /*
- ** If the character is defined as itself, just put out
- ** a blank line.
- */
- fputc(NL, fout);
- } else {
- /*
- ** Otherwise, put out the definition of the character
- ** on a line.
- */
- fprintf(fout, "%s\n", definition[ch]);
- }
- }
- }
-
- /*
- ** initialise_definitions
- **
- ** Set up the definition array with pointers to heap strings
- ** defining the symbol as equivalent to itself.
- */
- static void initialise_definitions(void)
- {
- int c;
- for (c=1; c<256; c++) {
- unsigned char x[2];
- x[0] = c; x[1] = 0;
- definition[c] = strdup(x);
- if (!definition[c]) abandon("can't initialise definition");
- }
- }
-
- /*
- ** optimise_lines
- **
- ** This function scans the text and re-formats all the paragraphs
- ** to give maximal line sizes. The effect of this is to allow the
- ** compression algorithms to work slightly more effectively because
- ** word endings are more often seen as with a space (which can be
- ** involved in compression) than before (NL can not be part of
- ** a compression symbol).
- **
- ** Note that any such change leaves the text exactly the same
- ** size as it was before.
- */
- #if defined(FILTER)
- static void optimise_lines(void)
- {
- unsigned char *p;
-
- /*
- ** Terminate the current text.
- */
- text[current_size] = 0;
-
- /*
- ** Phase 1
- **
- ** Make each paragraph (where paragraphs are defined as being
- ** separated by blank lines) be a single line with spaces
- ** separating words.
- */
- p = text;
- while (p = strchr(p, NL)) {
- if (p[1] == NL) {
- /*
- ** THIS newline is the end of the paragraph, the NEXT
- ** one is the required blank line. We don't want to
- ** touch this, and THEN we want to start at the start
- ** of the next paragraph.
- */
- p += 2;
- } else if (p[1] == 0) {
- /*
- ** Last newline in the file.
- */
- break;
- } else {
- /*
- ** Simple end of line, but not end of paragraph.
- */
- *p++ = ' ';
- }
- }
-
- /*
- ** Phase 2
- **
- ** For each paragraph/line in the file, reduce it to lines shorter
- ** than or equal to LLIM characters by converting spaces to NLs.
- */
- p = text;
- while (*p) {
- unsigned char *end; /* points to end of this line (NL character) */
-
- /*
- ** "p" points to the start of the current line.
- ** If this is a blank line, just skip over it.
- */
- if (*p == NL) {
- p++;
- continue;
- }
-
- /*
- ** We are at the start of a non-empty line.
- ** Locate it's NL character and thus figure out
- ** how long the line is.
- */
- end = strchr(p, NL);
-
- /*
- ** Reduce this line, while it is still too large.
- */
- while ( (end-p) > LLIM ) {
- unsigned char *brk = p+LLIM; /* candidate breakpoint */
- while (*brk != ' ') brk--; /* run back to previous space */
- *brk = NL; /* overwrite with newline */
- p = brk+1; /* new beginning of line */
- }
-
- p = end+1; /* move to start of next line */
- }
- }
- #endif
-
- /*
- ** main
- */
- int main(int argc, char *argv[])
- {
- FILE *fin, *fout;
- int n;
-
- if (argc != 3) abandon("two arguments (in, out) required");
-
- fin = fopen(argv[1], "r");
- if (!fin) abandon("could not open input file");
-
- load_file(fin);
- fclose(fin);
-
- /*
- ** Perform some text replacements.
- **
- ** For "Terminal Compromise":
- ** ^[ ^H ^\ -> e-grave
- ** ^[ ^B ^\ -> e-acute
- ** <130> -> e-acute
- **
- ** General textual:
- ** Spaced out ellipsis ". . ." to "..."
- ** Compact lines of stars
- */
- #if defined(FILTER)
- n = replace_string("\x1B\x08\x1C", "\x8A");
- if (n) printf("e-grave replacements: %d\n", n);
- n = replace_string("\x1B\x02\x1C", "\x82");
- n += replace_string("<130>", "\x82");
- if (n) printf("e-acute replacements: %d\n", n);
- n = replace_string(". .", "..");
- if (n) printf("ellipsis compressions: %d\n", n);
- n = replace_string("* *", "**");
- if (n) printf("star compressions: %d\n", n);
- #endif
-
- /*
- ** Reformat paragraphs
- */
- #if defined(FILTER)
- optimise_lines();
- #endif
-
- initialise_definitions();
-
- while (count_characters())
- compute_grams();
-
- show_nearlies();
-
- /*
- ** Create the output file
- */
- fout = fopen(argv[2], "w");
- if (!fout) abandon("could not open output file");
- definition[0] = "!!Compressed!!";
- dump_table(fout);
- fwrite(text, 1, current_size, fout);
- fclose(fout);
-
- return EXIT_SUCCESS;
- }
-