PDA Software Library

home *** CD-ROM | disk | FTP | other *** search

/ PDA Software Library / pdasoftwarelib.iso / PSION / MISC / READER / ZVR.ZIP / ZVR.C < prev next >

Wrap

C/C++ Source or Header | 1995-04-11 | 17.7 KB | 770 lines

/* ** Really mindless text compression program. ** ** The intended use of the compression is to be able to fit ** E-text books (Gutenberg, OBI, Wiretap) into a smaller space ** so that they can be read on a machine with very limited store, ** such as a handheld computer. One implication of this is that ** the form of compression does not need to be completely lossless ** as long as the result is still readable. ** ** This program assumes a linear address space at least large enough ** to load the whole text. One implication of this is that it won't ** be usable as a simple DOS program, but the alternative was to spend ** more time on this than I felt I could afford. */ #include <stdlib.h> #include <string.h> #include <stdio.h> #include <assert.h> #define FILTER /* don't preprocess text if this is */ /* not defined */ /* this should give LOSSLESS compression */ /* useful mainly for testing purposes */ #define NEARLY 10 /* less than this is "nearly" unused */ #define GRAM 2 /* size of strings to scan for */ #define GRAMS 4000 /* max size of dictionary of scanned strings */ #define LLIM 250 /* max size of uncompressed line in file */ #define FALSE 0 #define TRUE 1 #define NL '\n' #define CR '\r' static unsigned long character_count[256]; /* ** definition ** ** This is a table of definitions for the 256 possible character values. ** Note that character 0 is never used for anything, in order to avoid ** C string termination problems. ** ** For other characters, the table entry is a pointer to a heap string ** which is what that character would expand to if it appeared in the ** compressed string. ** ** Note that this is NOT a recursive concept; the definition is a flattened ** string of characters. This, if the character "q" only appears before ** "u" and is chosen as symbol "x", then the definition of "x" will be ** "qu" and "q" itself may then be redefined as something else later on. */ static unsigned char *definition[256]; /* ** Text storage ** ** "original_size" is the size, in characters, of the original file. ** "text" is a pointer to a heap area of a little bigger than this size, ** so that it can be terminated by a couple of special characters and ** a zero if required. ** "current_size" is the number of characters in "text" which are ** currently valid. */ static size_t original_size; static size_t current_size; static unsigned char *text; /* ** abandon ** ** Single point of exit from the program in the event of problems. */ static void abandon ( const char *s ) { fprintf(stderr, "Abandon: %s\n", s); exit(EXIT_FAILURE); } /* ** process_char ** ** Called once for each character in the input file. This is a state ** machine which weeds out things like redundant white space and ** unwanted hyphenation. */ static unsigned char *pp; /* pointer through which process_char writes */ #if defined(FILTER) static void process_char ( unsigned char c ) { static enum state { END_PARA, /* have just seen end of a paragraph */ END_LINE, /* have just seen end of a line */ IN_WORD, /* we are inside a word */ IN_LINE, /* inside a line, but not inside a word */ PEND_SPACE, /* in white space */ PEND_HYPHEN, /* just seen a hyphen, may be at end of word */ PEND_HYPHEN_NL /* floop-<NL> seen */ } state = END_PARA; switch (state) { line_ended: case END_LINE: if (c == NL) { /* ** If we see a second newline after the end of an ** ordinary line, we have come to the end of the ** paragraph. Issue the second newline and move ** into the END_PARA state. */ *pp++ = c; state = END_PARA; } else if (c == ' ') { /* ** If we see a leading space after a newline, we have ** see the _other_ kind of paragraph-starting convention, ** and we must change it to the first kind. */ *pp++ = NL; state = END_PARA; } else if (isalpha(c)) { *pp++ = c; state = IN_WORD; } else { *pp++ = c; state = IN_LINE; } break; case END_PARA: if (c == ' ') { /* ** Ignore spaces at this point; we know we are at the end of ** a paragraph already. */ } else if (c == NL) { /* ** Ignore extra blank lines */ } else { /* ** Something interesting: consider in line context */ goto in_line; } break; in_line: case IN_LINE: state = IN_LINE; if (c == ' ') { state = PEND_SPACE; } else if (c == NL) { *pp++ = NL; state = END_LINE; } else if (isalpha(c)) { *pp++ = c; state = IN_WORD; } else { *pp++ = c; } break; case IN_WORD: if (isalpha(c)) { *pp++ = c; } else if (c == '-') { state = PEND_HYPHEN; } else { goto in_line; } break; case PEND_HYPHEN: if (isalpha(c)) { /* hyphen within a word */ *pp++ = '-'; *pp++ = c; state = IN_WORD; } else if (c == ' ') { /* absorb spaces after in-word hyphens */ } else if (c == NL) { state = PEND_HYPHEN_NL; } else { *pp++ = '-'; /* flush hyphen */ goto in_line; } break; case PEND_HYPHEN_NL: if (isalpha(c)) { /* ** We have seen a <alpha><-><wsp><NL><wsp><alpha> ** sequence in a word. Drop the middle of it and ** join the two lines together, which will also ** join the two parts of the word together. */ *pp++ = c; state = IN_WORD; } else if (c == ' ') { /* ** Absorb spaces at beginning of lines like these */ } else { *pp++ = '-'; *pp++ = NL; state = END_LINE; goto line_ended; } break; case PEND_SPACE: if (c == ' ') { /* ignore multiple white space */ } else if (c == NL) { /* ** White space before line end: take as line end. */ *pp++ = NL; state = END_LINE; } else { *pp++ = ' '; goto in_line; } break; default: fprintf(stderr, "state: %d character: %d\n", state, c); abandon("process state machine failure"); } } #endif /* FILTER */ static load_file(FILE *fin) { int c; /* ** Work out the size of the file by reading it all in, character ** at a time. Slow but sure. */ original_size = 0; for (;;) { int c = fgetc(fin); if (c == EOF) break; original_size++; } printf("original size: %ld characters\n", (long)original_size); /* ** Allocate some space for the file... */ text = malloc(original_size)+10; if (!text) abandon("could not allocate buffer for text"); /* ** Reset the file pointer to the start and load the file in. */ fseek(fin, 0, SEEK_SET); pp = text; for (;;) { int c = fgetc(fin); if (c == EOF) break; #if defined(FILTER) process_char(c); #else *pp++ = c; #endif } /* ** Purge state machine if we have one. */ #if defined(FILTER) process_char(NL); process_char(NL); process_char(NL); #endif current_size = pp - text; } /* ** pchar ** ** Prints a character in such a way that it is always readable, ** if necessary hexifying it. */ static void pchar (int n) { if (isprint(n)) { putchar(n); } else { printf("<%02X>", n); } } /* ** pstring ** ** Prints a given string "safely" through pchar. */ static void pstring ( const unsigned char *s ) { while (*s) pchar(*s++); } static void show_nearlies(void) { int nearlies; int c; for (c=1, nearlies=0; c<256; c++) { int count = character_count[c]; if (count && (count<NEARLY)) nearlies++; } printf("nearly empty character slots: %d\n", nearlies); if (nearlies) { printf(" "); for (c=1; c<256; c++) { if (character_count[c] == 0) continue; if (character_count[c] < NEARLY) pchar(c); } putchar(NL); } } static int count_characters(void) { int c, empties; unsigned char *p = text; /* ** First, zero the count array. */ for (c=1; c<256; c++) character_count[c] = 0; /* ** Make sure we don't by accident use anything as a symbol ** that the Psion IOREAD function will regard as significant. ** ** (1000 is large enough to prevent these turning up as "nearlies"). */ character_count[26] = 1000; character_count[NL] = 1000; character_count[CR] = 1000; while (p != (text+current_size)) { character_count[*p]++; p++; } for (c=1, empties=0; c<256; c++) { int count = character_count[c]; if (count == 0) empties++; } printf("empty character slots: %d\n", empties); return empties; } struct gram { char ch[GRAM]; unsigned long nrefs; }; static struct gram grams[GRAMS]; static int cur_grams; static unsigned long dropped_grams; static void pgram ( const struct gram *g ) { int i; for (i=0; i<GRAM; i++) pchar(g->ch[i]); } static int sort_nrefs ( const void *L, const void *R ) { const struct gram *LL = L; const struct gram *RR = R; if (LL->nrefs < RR->nrefs) return -1; if (LL->nrefs > RR->nrefs) return 1; return 0; } static int sort_grams ( const void *L, const void *R ) { const struct gram *LL = L; const struct gram *RR = R; return strncmp(LL->ch, RR->ch, GRAM); } static void count_gram ( const char *g ) { struct gram gram; struct gram *q; memcpy(gram.ch, g, GRAM); q = bsearch(&gram, grams, cur_grams, sizeof(struct gram), sort_grams); if (q) { q->nrefs++; } else if (cur_grams < GRAMS) { /* ** A gram we haven't seen before, but which we do have room ** for in the table. */ gram.nrefs = 1; #if 0 /* ** add this gram in at the end of the table */ grams[cur_grams] = gram; #else /* ** Add this gram in at the start of the table. ** This is likely to be statistically close to where it ** will be going, particularly if it is a late arrival. */ if (cur_grams) memmove(grams+1, grams, cur_grams*sizeof(struct gram)); grams[0] = gram; #endif cur_grams++; qsort(grams, cur_grams, sizeof(struct gram), sort_grams); } else { dropped_grams++; } } /* ** valid_gram ** ** True if the string is an allowed n-gram. False if it ** contains NL and is therefore prohibited. */ static int valid_gram ( const unsigned char *p ) { int i; for (i=0; i<GRAM; i++) if (*p++ == NL) return FALSE; return TRUE; } /* ** count_grams ** ** Run through the current text counting each n-gram ** as we go. */ static void count_grams(void) { char *p = text; char *plim = text+current_size-GRAM+1; dropped_grams = 0; while (p != plim) { if (valid_gram(p)) count_gram(p); p++; } } static void reset_grams(void) { int cut = cur_grams/100; int i; if (cut) { memmove(grams, grams+cut, (cur_grams-cut)*sizeof(struct gram)); cur_grams -= cut; } for (i=0; i<cur_grams; i++) grams[i].nrefs = 0; } static void replace_gram(int gramno, struct gram *g) { char *from = text; char *to = text; char *org_limit = text + current_size; char *limit = text+current_size-GRAM+1; current_size = 0; while (from < limit) { if (!strncmp(from, g->ch, GRAM)) { *to++ = gramno; from += GRAM; } else { *to++ = *from++; } current_size++; } while (from != org_limit) { *to++ = *from++; current_size++; } } /* ** replace_string ** ** Replace an arbitrary string with a shorter string replacement ** throughout the current version of the text. */ static int replace_string ( const unsigned char *s, const unsigned char *r ) { int n = 0; unsigned char *beg = text; int rlen = strlen(r); int cut = strlen(s)-strlen(r); /* number of characters to lose */ text[current_size] = 0; /* terminate current text */ while (beg = strstr(beg, s)) { memcpy(beg, r, rlen); /* perform replacement */ n++; if (cut) memmove(beg+rlen, beg+rlen+cut, strlen(beg+rlen+cut)+1); current_size -= cut; } return n; } static void expand_gram ( unsigned char *buf, struct gram *g ) { int i; *buf = 0; for (i=0; i<GRAM; i++) strcat(buf, definition[g->ch[i]]); } static void compute_grams(void) { unsigned char ch; for (ch=255; ch>=1; ch--) { struct gram *g; if (!character_count[ch]) { /* ** We can use this character value as a gram definition. */ unsigned char x[100]; printf("size now %ld (down %2.1f%%) ", (long)current_size, 100.0*(original_size-current_size)/(double)original_size); fflush(stdout); count_grams(); qsort(grams, cur_grams, sizeof(struct gram), sort_nrefs); g = &grams[cur_grams-1]; printf("top %d-gram is \"", GRAM); pgram(g); printf("\" (ref %ld, drop %lu)\n", g->nrefs, dropped_grams); replace_gram(ch, g); expand_gram(x, g); reset_grams(); cur_grams--; /* we no longer have this one */ free(definition[ch]); definition[ch] = strdup(x); printf(" character value %d now expands to \"", ch); pstring(definition[ch]); putchar('"'); putchar(NL); } } } static void dump_table ( FILE *fout ) { int ch; for (ch=0; ch<=255; ch++) { const unsigned char *def = definition[ch]; if (strlen(def)==1 && *def == ch) { /* ** If the character is defined as itself, just put out ** a blank line. */ fputc(NL, fout); } else { /* ** Otherwise, put out the definition of the character ** on a line. */ fprintf(fout, "%s\n", definition[ch]); } } } /* ** initialise_definitions ** ** Set up the definition array with pointers to heap strings ** defining the symbol as equivalent to itself. */ static void initialise_definitions(void) { int c; for (c=1; c<256; c++) { unsigned char x[2]; x[0] = c; x[1] = 0; definition[c] = strdup(x); if (!definition[c]) abandon("can't initialise definition"); } } /* ** optimise_lines ** ** This function scans the text and re-formats all the paragraphs ** to give maximal line sizes. The effect of this is to allow the ** compression algorithms to work slightly more effectively because ** word endings are more often seen as with a space (which can be ** involved in compression) than before (NL can not be part of ** a compression symbol). ** ** Note that any such change leaves the text exactly the same ** size as it was before. */ #if defined(FILTER) static void optimise_lines(void) { unsigned char *p; /* ** Terminate the current text. */ text[current_size] = 0; /* ** Phase 1 ** ** Make each paragraph (where paragraphs are defined as being ** separated by blank lines) be a single line with spaces ** separating words. */ p = text; while (p = strchr(p, NL)) { if (p[1] == NL) { /* ** THIS newline is the end of the paragraph, the NEXT ** one is the required blank line. We don't want to ** touch this, and THEN we want to start at the start ** of the next paragraph. */ p += 2; } else if (p[1] == 0) { /* ** Last newline in the file. */ break; } else { /* ** Simple end of line, but not end of paragraph. */ *p++ = ' '; } } /* ** Phase 2 ** ** For each paragraph/line in the file, reduce it to lines shorter ** than or equal to LLIM characters by converting spaces to NLs. */ p = text; while (*p) { unsigned char *end; /* points to end of this line (NL character) */ /* ** "p" points to the start of the current line. ** If this is a blank line, just skip over it. */ if (*p == NL) { p++; continue; } /* ** We are at the start of a non-empty line. ** Locate it's NL character and thus figure out ** how long the line is. */ end = strchr(p, NL); /* ** Reduce this line, while it is still too large. */ while ( (end-p) > LLIM ) { unsigned char *brk = p+LLIM; /* candidate breakpoint */ while (*brk != ' ') brk--; /* run back to previous space */ *brk = NL; /* overwrite with newline */ p = brk+1; /* new beginning of line */ } p = end+1; /* move to start of next line */ } } #endif /* ** main */ int main(int argc, char *argv[]) { FILE *fin, *fout; int n; if (argc != 3) abandon("two arguments (in, out) required"); fin = fopen(argv[1], "r"); if (!fin) abandon("could not open input file"); load_file(fin); fclose(fin); /* ** Perform some text replacements. ** ** For "Terminal Compromise": ** ^[ ^H ^\ -> e-grave ** ^[ ^B ^\ -> e-acute ** <130> -> e-acute ** ** General textual: ** Spaced out ellipsis ". . ." to "..." ** Compact lines of stars */ #if defined(FILTER) n = replace_string("\x1B\x08\x1C", "\x8A"); if (n) printf("e-grave replacements: %d\n", n); n = replace_string("\x1B\x02\x1C", "\x82"); n += replace_string("<130>", "\x82"); if (n) printf("e-acute replacements: %d\n", n); n = replace_string(". .", ".."); if (n) printf("ellipsis compressions: %d\n", n); n = replace_string("* *", "**"); if (n) printf("star compressions: %d\n", n); #endif /* ** Reformat paragraphs */ #if defined(FILTER) optimise_lines(); #endif initialise_definitions(); while (count_characters()) compute_grams(); show_nearlies(); /* ** Create the output file */ fout = fopen(argv[2], "w"); if (!fout) abandon("could not open output file"); definition[0] = "!!Compressed!!"; dump_table(fout); fwrite(text, 1, current_size, fout); fclose(fout); return EXIT_SUCCESS; }