home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Power-Programmierung
/
CD2.mdf
/
doc
/
mir
/
sort2.c
< prev
next >
Wrap
Text File
|
1992-07-02
|
13KB
|
381 lines
/*
* Usage - sort2 [/r] [/+n] from_file to_file key[s]
*
* sort2 Sorts large ASCII files using the memory-bound DOS SORT
* routine in multiple passes. /r signifies reverse order.
* /+n specifies a starting column, 1-999. A key is 1 to 3
* characters, used as a dividing point. The program separates
* the input file into a series of temporary files, depending on
* the byte(s) at the starting column. For n dividing points,
* the program makes n+1 temporary files, and reports the size
* of each. If all are under 60k characters, they are sorted
* and placed together in the output file. If a run fails, add
* another dividing point mid-way in the range that fails (that
* is, the file that is too big), and try again. NOTE: The DOS
* SORT starts column count at 1, converts all lower to upper case!
*
* input: Line oriented printable ASCII text.
*
* output: Same file, sorted.
*
* writeup: MIR TUTORIAL ONE, topic 5
*
* Written: Douglas Lowry Mar 06 91
* Modified: Douglas Lowry Feb 21 92
* Copyright (C) 1992 Marpex Inc.
*
* The MIR (Mass Indexing and Retrieval) Tutorials explain detailed
* usage and co-ordination of the MIR family of programs to analyze,
* prepare and index databases (small through gigabyte size), and
* how to build integrated retrieval software around the MIR search
* engine. The fifth of the five MIR tutorial series explains how
* to extend indexing capability into leading edge search-related
* technologies. For more information, GO IBMPRO on CompuServe;
* MIR files are in the DBMS library. The same files are on the
* Canada Remote Systems BBS. A diskette copy of the Introduction
* is available by mail ($10 US... check, Visa or Mastercard);
* diskettes with Introduction, Tutorial ONE software and the
* shareware Tutorial ONE text cost $29. Shareware registration
* for a tutorial is also $29.
*
* E-mail...
* Compuserve 71431,1337
* Internet doug.lowry%canrem.com
* UUCP canrem!doug.lowry
* Others: doug.lowry@canrem.uucp
*
* FAX... 416 963-5677
*
* "Snail mail"... Douglas Lowry, Ph.D.
* Marpex Inc.
* 5334 Yonge Street, #1102
* North York, Ontario
* Canada M2N 6M2
*
* Related database consultation and preparation services are
* available through:
* Innotech Inc., 2001 Sheppard Avenue E., Suite #118,
* North York, Ontario Canada M2J 4Z7
* Tel. 416 492-3838 FAX 416 492-3843
*
* This program is free software; you may redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* (file 05LICENS) along with this program; if not, write to the
* Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
* USA.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <dos.h>
#include <ctype.h>
#include <direct.h>
#include <process.h>
#include <errno.h>
#define MAX_BYTES 1024
#define MAX_DIV 25
#define repeat for(;;)
typedef enum _bool
{ FALSE = 0, TRUE = 1 } Bool;
/*
* Forward declarations (old-style, without parameter lists) of the
* functions that main() calls and that are defined after it.
*/
void Usage_(), process();
/* Cmdname_ - name of this program as shown in the usage message. */
char *
Cmdname_()
{
    char *program_name = "sort2";

    return program_name;
}
/* Input file stream: opened by main(), rewound and re-read by process() on each pass. */
FILE *fp_in;
/*
 * MAIN
 *
 * usage: sort2 [/r] [/+n] from_file to_file key[s]
 *
 * Parses the options, opens the input file, builds a table of unique,
 * upper-cased dividing keys in ascending order, removes any existing
 * output file and calls process() to do the work.
 *
 * Every command-line error ends the program through Usage_(), which
 * prints the help text and exits with status 1.  Returns 0 otherwise.
 */
int
main( argc, argv )
int argc;
char **argv;
{
    unsigned char dividers[ MAX_DIV ][ 4 ],     /* keys: ascending, unique  */
                  uc[4] ;                       /* key currently being added */
    Bool    reverse;        /* perform reverse sort */
    int     used_args,      /* option arguments identified so far */
            div_ct,         /* count of dividers */
            start_col,      /* start sorting at column */
            test,           /* evaluate a comparison */
            key_len,        /* length of the key argument */
            ar, i, j ;
    long    col ;           /* column number as parsed, before range check */
    char    *end ;          /* first character strtol() did not consume */

    /* Check argc first: argv[1] is NULL when no argument was given. */
    if( argc < 4 || argv[1][0] == '-' )
        Usage_();

    start_col = 1 ;
    used_args = div_ct = 0 ;
    reverse = FALSE ;

    /* Up to two leading option arguments; argc >= 4 so argv[1..2] exist. */
    for( i = 1 ; i < 3 ; i++ )
    {
        if( argv[i][0] != '/' )
            break ;
        if( argv[i][1] == 'r' || argv[i][1] == 'R' )
        {
            used_args++ ;
            reverse = TRUE ;
        }
        else if( argv[i][1] == '+' )
        {
            used_args++ ;
            /*
             * The column indexes the line buffer in process(), so reject
             * anything that is not a plain number in the documented
             * range 1-999.
             */
            col = strtol( &argv[i][2], &end, 10 ) ;
            if( end == &argv[i][2] || *end != '\0' || col < 1 || col > 999 )
            {
                fprintf( stderr, "\nStarting column must be 1 to 999\n\n" );
                Usage_();
            }
            start_col = ( int ) col ;
        }
        else
        {
            fprintf( stderr, "\nUnrecognized argument with /\n\n" );
            Usage_();
        }
    }

    /*
     * After the options there must be from_file, to_file and at least
     * one key.  Checked before any of those arguments is touched.
     */
    if( argc < used_args + 4 )
        Usage_();

    if(( fp_in = fopen( argv[ used_args + 1 ], "rb" )) == NULL )
    {
        fprintf( stderr, "FATAL... Unable to open file %s\n",
            argv[ used_args + 1 ] );
        Usage_();
    }

    for( ar = used_args + 3 ; ar < argc ; ar++ )
    {
        key_len = ( int ) strlen( argv[ ar ] ) ;
        if( key_len < 1 || key_len > 3 )
            Usage_();

        /* Upper-case the key; process() upper-cases line keys the same way. */
        for( i = 0 ; i < key_len ; i++ )
            uc[i] = ( unsigned char ) toupper( ( unsigned char ) argv[ar][i] ) ;
        uc[i] = 0 ;

        /* Find the first table entry that is not below the new key. */
        test = 1 ;
        for( i = 0 ; i < div_ct ; i++ )
        {
            test = strcmp( ( char * ) uc, ( char * ) dividers[ i ] ) ;
            if( test <= 0 )
                break ;
        }
        if( !test )
            continue ;                  /* a repetition */

        /* Stop here rather than write past the end of the table. */
        if( div_ct >= MAX_DIV - 1 )
        {
            fprintf( stderr, "RECOMPILE... over %d dividers.\n",
                MAX_DIV - 1 );
            exit( 1 );
        }

        /* Open a gap at i by moving the tail up one row, last row first. */
        for( j = div_ct ; j > i ; j-- )
            strcpy( ( char * ) dividers[ j ], ( char * ) dividers[ j - 1 ] );
        strcpy( ( char * ) dividers[ i ], ( char * ) uc ) ;
        div_ct++ ;
    }

    /*
     * Remove any old output file only once every argument has been
     * accepted; the batch file written by process() appends to it.
     */
    unlink( argv[ used_args + 2 ] );

    process( reverse, dividers, div_ct, start_col, argv[ used_args + 2 ] );
    fclose( fp_in );
    return( 0 );
}
/*
 * Usage
 *
 * Prints the command synopsis (with the program name from Cmdname_())
 * and the explanatory text to stderr, then ends the program with exit
 * status 1.  Does not return to the caller.
 */
void
Usage_()
{
    /* Help text that follows the synopsis, emitted piece by piece. */
    static char *remainder[] =
    {
        " characters, used as a dividing point. The program separates\n"
        "the input file into a series of temporary files, depending on\n"
        "the byte(s) at the starting column. For n dividing points,\n"
        "the program makes n+1 temporary files, and reports the size\n",

        " of each. If all are under 60k characters, they are sorted\n"
        "and placed together in the output file. If a run fails, add\n"
        "another dividing point mid-way in the range that fails (that\n"
        "is, the file that is too big), and try again. NOTE: The DOS\n",

        " SORT starts column count at 1, converts all lower to upper case!\n\n"
        "input: Line oriented printable ASCII text.\n\n"
        "output: Same file, sorted.\n\n"
        "writeup: MIR TUTORIAL ONE, topic 5\n\n",

        NULL
    };
    char **piece;

    fprintf( stderr,
        "usage: %s [/r] [/+n] from_file to_file key[s]\n\n"
        "Sorts large ASCII files using the memory-bound DOS SORT\n"
        "routine in multiple passes. /r signifies reverse order.\n"
        "/+n specifies a starting column, 1-999. A key is 1 to 3\n",
        Cmdname_() );

    for( piece = remainder ; *piece != NULL ; piece++ )
        fputs( *piece, stderr );

    exit( 1 ) ;
}
/*
 * PROCESS
 *
 * Makes div_ct + 1 passes over the input stream fp_in.  Each pass copies
 * the lines whose key (up to 3 upper-cased bytes starting at start_col)
 * falls in one range between neighbouring dividers into a temporary file
 * sortNN.tmp and reports that file's size on stderr.  For every pass a
 * "del" line is added to the batch file sort2tmp.bat, preceded, when the
 * temporary file is not empty, by a "sort" line that appends the sorted
 * temporary file to outnam.
 *
 * When no temporary file exceeds 60000 bytes the batch file is run with
 * spawnl(); otherwise it is not run.  In both cases sort2tmp.bat is then
 * removed, so after an oversize run the sortNN.tmp files stay on disk
 * (their "del" commands never executed).
 *
 * reverse    TRUE: walk the ranges from highest to lowest and pass /r
 * dividers   upper-cased keys, expected in ascending order
 * div_ct     number of rows of dividers in use (0 gives a single range)
 * start_col  1-based column where the key starts; below 1 is treated as 1
 * outnam     name of the output file the batch file appends to
 *
 * Failure to create the batch file or a temporary file ends the program
 * through Usage_().
 */
void
process( reverse, dividers, div_ct, start_col, outnam )
unsigned char dividers[ MAX_DIV ][ 4 ] ;
Bool reverse;           /* perform reverse sort */
int div_ct,             /* count of dividers */
    start_col;          /* start sorting at column */
char outnam[32];        /* name of output file */
{
    FILE    *fp_tmp, *fp_bat ;
    char    fname[ 32 ];            /* name of this pass's temporary file */
    char    buf[ MAX_BYTES ];       /* one input line (or part of a long one) */
    unsigned char uc[4],            /* key of the current line */
                  from[4],          /* inclusive lower bound of the range */
                  to[4] ;           /* exclusive upper bound, if bounded */
    Bool    too_big ;               /* Won't be able to sort subset. */
    Bool    bounded ;               /* range has an upper bound */
    long int tmp_size ;             /* size of temporary file */
    int     pass,
            hi,                     /* index of the divider above the range */
            spawn_rc,               /* result of spawnl() */
            len, i ;

    /* A column below 1 would index in front of the line buffer. */
    if( start_col < 1 )
        start_col = 1 ;

    unlink( "sort2tmp.bat" );
    if(( fp_bat = fopen( "sort2tmp.bat", "w" )) == NULL )
    {
        fprintf( stderr, "FATAL... Unable to open sort2tmp.bat\n" );
        Usage_();
    }

    too_big = FALSE ;
    for( pass = 0 ; pass <= div_ct ; pass++ )
    {
        for( i = 0 ; i < 4 ; i++ )
            from[i] = to[i] = uc[i] = 0 ;
        if( pass )
            rewind( fp_in );

        /*
         * Range number "hi" lies between dividers[hi - 1] and
         * dividers[hi].  A forward sort takes the ranges bottom-up, a
         * reverse sort top-down, so the batch file appends the pieces
         * in final output order.  The lowest range starts at the empty
         * key; the highest has no upper bound at all, so no byte value
         * has to be reserved as a sentinel.
         */
        hi = reverse ? div_ct - pass : pass ;
        if( hi > 0 )
            strcpy( ( char * ) from, ( char * ) dividers[ hi - 1 ] ) ;
        bounded = ( hi < div_ct ) ? TRUE : FALSE ;
        if( bounded )
            strcpy( ( char * ) to, ( char * ) dividers[ hi ] ) ;

        sprintf( fname, "sort%02d.tmp", pass );
        if(( fp_tmp = fopen( fname, "wb" )) == NULL )
        {
            fprintf( stderr,
                "FATAL... Unable to open %s\n", fname );
            Usage_();
        }

        tmp_size = 0 ;
        while( fgets( buf, MAX_BYTES, fp_in ) != NULL )
        {
            len = ( int ) strlen( buf ) ;

            /*
             * Key = up to 3 upper-cased bytes from start_col, stopping
             * at the end of the line.  Lines shorter than start_col get
             * the empty key and so land in the lowest range.
             */
            uc[0] = uc[1] = uc[2] = 0 ;
            if( len >= start_col )
            {
                for( i = 0 ; i < 3 && buf[ start_col - 1 + i ] ; i++ )
                    uc[i] = ( unsigned char )
                        toupper( ( unsigned char ) buf[ start_col - 1 + i ] );
            }

            if( strcmp( ( char * ) uc, ( char * ) from ) < 0 )
                continue ;
            if( bounded && strcmp( ( char * ) uc, ( char * ) to ) >= 0 )
                continue ;

            fputs( buf, fp_tmp );
            tmp_size += len ;
        }

        if( bounded )
            fprintf( stderr, "Setting up to %s, size %ld bytes.\n",
                ( char * ) to, tmp_size );
        else    /* for the top range "from" is the last divider, or empty */
            fprintf( stderr,"Setting beyond %s, size %ld bytes.\n",
                ( char * ) from, tmp_size );

        if( tmp_size > 60000 )
            too_big = TRUE ;
        fclose( fp_tmp );

        /* Write the command line: "sort [/r ][/+n ] < tmp >> outnam" */
        if( tmp_size )
        {
            fprintf( fp_bat, "sort " );
            if( reverse )
                fprintf( fp_bat, "/r " );
            if( start_col > 1 )
                fprintf( fp_bat, "/+%d ", start_col );
            fprintf( fp_bat, " < %s >> %s\n", fname, outnam );
        }
        fprintf( fp_bat, "del %s\n", fname );
    }
    fclose( fp_bat );

    if( !too_big )
    {
        /* mode 0 is P_WAIT in Microsoft C's <process.h>: wait for the child */
        spawn_rc = spawnl( 0, "sort2tmp.bat", "sort2tmp.bat", NULL );
        if( spawn_rc )
            fprintf( stderr, "spawnl error # %d\n", spawn_rc );
    }
    unlink( "sort2tmp.bat" );
    return ;
}