Amiga MA Magazine 1998 #6

home *** CD-ROM | disk | FTP | other *** search

/ Amiga MA Magazine 1998 #6 / amigamamagazinepolishissue1998.iso / packery / bwt / src / unbwt.cpp < prev next >

Wrap

C/C++ Source or Header | 1996-06-17 | 8KB | 238 lines

// // UNBWT.CPP // // Mark Nelson // March 8, 1996 // http://web2.airmail.net/markn // // DESCRIPTION // ----------- // // This program performs an inverse Burrows-Wheeler transform on an input // file or stream, and sends the result to an output file or stream. // // While this program can be compiled in 16 bit mode, it might not // be able to invert transforms created by a 32 bit program. The // 16 bit program has to drop its block size quite a bit. However, // rewriting the code to use a huge buffer under MS-DOS might solve // that problem. // // This program takes two arguments: an input file and an output // file. You can leave off one argument and send your output to // stdout. Leave off two arguments and read your input from stdin // as well. You can also specify "-d" as the first argument to get // a debug dump as well. // // The UNBWT input consists of a series of blocks that look like this: // // long byte_count | ...data... | long first | long last // // The byte_count refers to the number of data bytes. The data // itself is the "L" column from the sorted data. "first" is the // index where the first character from the buffer appears in the // sorted output. "last" is where the end-of-buffer special byte // appears in the output buffer. These blocks are repeated until // an EOF indicates that everything is done. // // This program accompanies my article "Data Compression with the // Burrows-Wheeler Transform." There is one major deviation from // the text of the article in this implementation. The BWT code // creates a buffer of N+1 characters out of an input string of // N charactes. The last character is a special end-of-buffer // character that makes sorting the input easier. See BWT.CPP for // details on how and why this works. // // The end of buffer character has special implications for UNBWT.CPP, // because we have to take note of its appearance in the data stream. // The "last" index sent after the block indicates whhich position in // the transformed buffer contains the virtual end-of-buffer character. // All of the calculations I do in the program have to take into account // the possible appearance of this virtual character in the input // stream. // // Build Instructions // ------------------ // // Define the constant unix for UNIX or UNIX-like systems. The // use of this constant turns off the code used to force the MS-DOS // file system into binary mode. g++ already does this, your UNIX // C++ compiler might also. // // Borland C++ 4.5 16 bit : bcc -w -ml unbwt.cpp // Borland C++ 4.5 32 bit : bcc32 -w unbwt.cpp // Microsoft Visual C++ 1.5 : cl /W4 /AL unbwt.cpp // Microsoft Visual C++ 4.0 : cl /W4 unbwt.cpp // g++ : g++ -o unbwt unbwt.cpp // // Typical Use // ----------- // // unari < compressed-file | unrle | unmtf | unbwt | unrle > raw-file // #define unix #include <stdio.h> #include <stdlib.h> #include <ctype.h> #include <limits.h> #include <fcntl.h> #if !defined( unix ) #include <io.h> #endif #include <string.h> #if ( INT_MAX == 32767 ) #define BLOCK_SIZE 20000 #else #define BLOCK_SIZE 200000 #endif unsigned char buffer[ BLOCK_SIZE + 1 ]; unsigned int T[ BLOCK_SIZE + 1 ]; long buflen; unsigned int Count[ 257 ]; unsigned int RunningTotal[ 257 ]; main( int argc, char *argv[] ) { int debug = 0; if ( argc > 1 && strcmp( argv[ 1 ], "-d" ) == 0 ) { debug = 1; argv++; argc--; } fprintf( stderr, "Performing Inverse BWT on " ); if ( argc > 1 ) { freopen( argv[ 1 ], "rb", stdin ); fprintf( stderr, "%s", argv[ 1 ] ); } else fprintf( stderr, "stdin" ); fprintf( stderr, " to " ); if ( argc > 2 ) { freopen( argv[ 2 ], "wb", stdout ); fprintf( stderr, "%s", argv[ 2 ] ); } else fprintf( stderr, "stdout" ); fprintf( stderr, "\n" ); #if !defined( unix ) setmode( fileno( stdin ), O_BINARY ); setmode( fileno( stdout ), O_BINARY ); #endif for ( ; ; ) { if ( fread( (char *) &buflen, sizeof( long ), 1, stdin ) == 0 ) break; fprintf( stderr, "Processing %ld bytes\n", buflen ); if ( buflen > ( BLOCK_SIZE + 1 ) ) { fprintf( stderr, "Buffer overflow!\n" ); abort(); } if ( fread( (char *)buffer, 1, (size_t) buflen, stdin ) != (size_t) buflen ) { fprintf( stderr, "Error reading data\n" ); abort(); } unsigned long first; fread( (char *) &first, sizeof( long ), 1, stdin ); unsigned long last; fread( (char *)&last, sizeof( long ), 1, stdin ); fprintf( stderr, "first = %lu, " "last = %lu\n", first, last ); // // To determine a character's position in the output string given // its position in the input string, we can use the knowledge about // the fact that the output string is sorted. Each character 'c' will // show up in the output stream in in position i, where i is the sum // total of all characters in the input buffer that precede c in the // alphabet, plus the count of all occurences of 'c' previously in the // input stream. // // The first part of this code calculates the running totals for all // the characters in the alphabet. That satisfies the first part of the // equation needed to determine where each 'c' will go in the output // stream. Remember that the character pointed to by 'last' is a special // end-of-buffer character that is supposed to be larger than any char // in the alphabet. // unsigned int i; for ( i = 0 ; i < 257 ; i++ ) Count[ i ] = 0; for ( i = 0 ; i < (unsigned int) buflen ; i++ ) if ( i == last ) Count[ 256 ]++; else Count[ buffer[ i ] ]++; if ( debug ) { fprintf( stderr, "Byte Counts:\n" ); for ( i = 0 ; i <= 256 ; i++ ) if ( Count[ i ] ) { if (isprint( i ) ) fprintf( stderr, "%c : ", i ); else fprintf( stderr, "%3d: ", i ); fprintf( stderr, "%u\n", Count[ i ] ); } } int sum = 0; if ( debug ) fprintf( stderr, "Running totals:\n" ); for ( i = 0 ; i < 257 ; i++ ) { RunningTotal[ i ] = sum; sum += Count[ i ]; if ( debug && Count[ i ] ) { if (isprint( i ) ) fprintf( stderr, "%c : ", i ); else fprintf( stderr, "%3d: ", i ); fprintf( stderr, "%u\n", (long) RunningTotal[ i ] ); } Count[ i ] = 0; } // // Now that the RunningTotal[] array is filled in, I have half the // information needed to position each 'c' in the input buffer. The // next piece of information is simply the number of characters 'c' // that appear before this 'c' in the input stream. I keep track of // that informatin in the Counts[] array as I go. By adding those // two number together, I get the destination of each character in // the input buffer, and that allows me to fill in all the positions // in the transformation vector, T[]. // for ( i = 0 ; i < (unsigned int) buflen ; i++ ) { int index; if ( i == last ) index = 256; else index = buffer[ i ]; T[ Count[ index ] + RunningTotal[ index ] ] = i; Count[ index ]++; } if ( debug ) { fprintf( stderr, "T = " ); for ( i = 0 ; i < (unsigned int) buflen ; i++ ) fprintf( stderr, "%u ", T[ i ] ); fprintf( stderr, "\n" ); } // // Once the transformation vector is in place, writing the // output is just a matter of following the indices. Note // that I don't have to watch for the end-of-buffer character // here. Since I am only outputting N characters, and I know // it will show up at position N+1, I know it won't pop up // unsigned int j; i = (unsigned int) first; for ( j = 0 ; j < (unsigned int) ( buflen - 1 ) ; j++ ) { putc( buffer[ i ], stdout ); i = T[ i ]; } } return 1; }