home *** CD-ROM | disk | FTP | other *** search
- /* File : nsort.c
- Author : Richard A. O'Keefe
- Updated: 1988
- Purpose: Fast sorting command for smallish files.
-
- This program has no copyright notice. You can do anything you please
- with it, except blame me if it doesn't work.
- */
-
- #ifndef lint
- static char SCCSid[] = "%Z%%E% %M% %I%";
- #endif/*lint*/
-
- /* nsort <input >output
-
- is a very simple sorting program which has been written to be as fast
- as possible. It is equivalent to sort with no arguments. That is,
- it sorts its standard input to its standard output, and compares
- entire lines.
-
- It uses a natural merge, so if the input is already in order it takes
- linear time, and it reads the entire file into memory, using a single
- read() if the standard input is a regular file.
-
- Sorting random permutations of /usr/dict/words on a Sun-3/50:
- sort(1) : 15 seconds nsort : 8 seconds (with SCSI disc)
- 17 seconds 11 seconds (NFS over Ethernet)
- */
-
- #include <stdio.h>
-
- /* The following values are taken from <sysexits.h>, but are copied here
- in case you lack that BSD header file.
- */
- #define EX_OK 0 /* nothing went wrong */
- #define EX_USAGE 64 /* something wrong with the command line */
- #define EX_SOFTWARE 70 /* internal error (here, memory ran out) */
- #define EX_IOERR 71 /* transput error (here, read() trouble) */
-
- #define STDIN 0
-
- extern char * malloc();
- extern char * strcpy();
- extern int strcmp();
- extern int strlen();
- extern void perror();
- extern int lseek();
- extern int read();
-
- typedef struct item_rec *item_ptr;
-
- struct item_rec
- {
- item_ptr next;
- char* data;
- };
-
- #define IN_ORDER(x, y) (strcmp((x)->data, (y)->data) <= 0)
-
-
- item_ptr nat_sort(list)
- item_ptr list;
- {
- item_ptr stack[30];
- item_ptr *sp = stack;
- int runs = 0; /* total number of runs processed */
- register item_ptr p, q, r;
- struct item_rec header;
- int k;
-
- while (p = list) {
- /* pick up a run from the front of list, setting */
- /* p = (pointer to beginning of run), list = (rest of list) */
- if (q = p->next) {
- list = q->next;
- if (!IN_ORDER(p, q)) r = q, q = p, p = r;
- p->next = q;
- for (r = list; r && IN_ORDER(q, r); r = r->next)
- q->next = r, q = r;
- q->next = (item_ptr)0;
- list = r;
- } else {
- list = (item_ptr)0;
- }
- runs++;
- /* merge this run with 0 or more runs off the top of the stack */
- for (k = runs; 1 &~ k; k >>= 1) {
- q = *--sp;
- r = &header;
- while (q && p)
- if (IN_ORDER(q, p)) {
- r->next = q;
- r = q, q = q->next;
- } else {
- r->next = p;
- r = p, p = p->next;
- }
- r->next = q ? q : p;
- p = header.next;
- }
- /* push the merged run onto the stack */
- *sp++ = p;
- }
- if (sp == stack) return (item_ptr)0;
- /* merge all the runs on the stack */
- p = *--sp;
- while (sp != stack) {
- q = *--sp;
- r = &header;
- while (q && p)
- if (IN_ORDER(q, p)) {
- r->next = q;
- r = q, q = q->next;
- } else {
- r->next = p;
- r = p, p = p->next;
- }
- r->next = q ? q : p;
- p = header.next;
- }
- return p;
- }
-
- item_ptr alloc_item(data)
- char *data;
- {
- register item_ptr result;
-
- result = (item_ptr)malloc(sizeof *result + strlen(data) + 1);
- if (result) {
- result->next = (item_ptr)0;
- result->data = strcpy((char*)result + sizeof *result, data);
- }
- return result;
- }
-
- /* What we really want to do is to slurp the entire file in with one call
- to read(). In order to do that, we need to know how much there is. A
- file's size can be determined either by calling fstat() or by using an
- lseek() to the end. The number you get from fstat() doesn't mean much
- for pipes and terminals, so lseek() appears to be the better approach.
- Note that even when stdin is connected to a file, part of it may have
- been read already. In that case, we should not include the part which
- has been read. So we do
- i = lseek(STDIN, 0, 1)
- to discover the current position (and simultaneously to find out if we
- can use lseek() at all on this file descriptor). One problem remains;
- the size of the file may change while we are reading it. In that case
- we'll stop early if we have to, but if the file grows we'll miss stuff
- added at the end.
- */
- main(argc, argv)
- int argc;
- char **argv;
- {
- struct item_rec header;
- item_ptr p, q;
- char *progname = argc >= 1 ? argv[0] : "nsort";
- int i;
-
- if (argc != 1) {
- fprintf(stderr, "Usage: %s <unordered >sorted\n", progname);
- exit(EX_USAGE);
- }
- i = lseek(STDIN, 0, 1); /* current position in stream */
- if (i < 0) { /* can't find out the size by seeking */
- char buffer[1024];
-
- for (p = &header
- ; fgets(buffer, sizeof buffer, stdin)
- ; p->next = q, p = q) {
- i = strlen(buffer);
- if (i > 0 && buffer[i-1] == '\n') buffer[i-1] = '\0';
- if (!(q = alloc_item(buffer))) {
- fprintf(stderr, "%s: ran out of memory\n", progname);
- exit(EX_SOFTWARE);
- }
- } else {
- register char *b, *c;
- register int n;
-
- n = lseek(STDIN, 0, 2) - i; /* part of stdin may have been read */
- (void) lseek(STDIN, i, 0); /* go back to original position */
- if (!(b = malloc(n+1))) {
- fprintf(stderr, "%s: ran out of memory\n", progname);
- exit(EX_SOFTWARE);
- }
- for (c = b; (i = n-(c-b)) > 0; c += i) {
- i = read(STDIN, c, i);
- if (i < 0) {
- perror(progname);
- exit(EX_IOERR);
- } else
- if (i == 0) {
- break;
- }
- }
- /* it is possible that the file may have been truncated */
- /* while we were reading it. */
- n = c-b;
- for (p = &header; n > 0; b = c, p->next = q, p = q) {
- for (c = b; --n >= 0; c++)
- if (*c == '\n') {
- *c++ = '\0';
- break;
- }
- if (n < 0) *c = '\0';
- if (!(q = (item_ptr) malloc(sizeof *q))) {
- fprintf(stderr, "nsort: ran out of memory\n");
- exit(EX_SOFTWARE);
- }
- q->data = b;
- }
- }
- p->next = (item_ptr)0;
- p = header.next;
- for (p = nat_sort(p); p; p = p->next)
- puts(p->data);
- exit(EX_OK);
- }
-
-