home *** CD-ROM | disk | FTP | other *** search
-
- /*
- * `classify': Sort files into groups by content
- * Copyright (C) 1991 Mark-Jason Dominus. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 1, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
- #include <stdio.h>
- #include <ctype.h>
- #include <string.h>
-
- /* Return codes from `compare()' and macros for handling them. */
- #define SAME 1
- #define DIFFERENT 0
- #define BADFILE1 4
- #define BADFILE2 8
- #define BADFILEBOTH (BADFILE1 | BADFILE2)
- #define ERROR(RC) ((RC != SAME) && (RC != DIFFERENT))
-
- /* Allocate a new object and return a pointer to it. */
- #define NEW(type) ((type *) malloc(sizeof(type)))
-
- /* Are strings a and b equal? ('equal', not 'eq') */
- #define STREQ(a,b) (strcmp((a),(b)) == 0)
-
- /* Flags set by command-line options. */
- int foldflag = 0, blankflag = 0, whiteflag = 0;
-
- /* Format option and codes. */
- char formopt = 's';
-
- /* Explanation of data structure used in this program:
-
- Each 'masternode'is a linked list of filenames. The filenames in each
- masternode are the names of files that are identical, modulo some
- whitespace and upper/lower-case distinctions.
-
- The masternodes are linked together in a linked list called `list'.
-
- Example:
-
- list
- |
- V data next next
- masternode------>filenode------->filenode------>NULL
- | | |
- | next | data | data
- | V V
- | filename1 filename2
- |
- V data next
- masternode------>filenode------>NULL
- | |
- | next | data
- | V
- V filename3
- NULL
-
- This would represent three files: filename1, filename2, and filename3,
- of which filename1 and filename2 had the same contents, and filename3
- was different from both.
-
- Note: if j is a pointer to a masternode, then j->data->data is the
- first filename in j's masternode.
- */
-
- typedef struct s_fnode {
- char *data;
- struct s_fnode *next;
- } filenode;
-
- typedef struct s_mnode {
- filenode *data;
- struct s_mnode *next;
- } masternode;
-
- main(argc,argv)
- int argc;
- char *argv[];
- {
- /* Look at these absurd declarations! */
- int i, compare(), match, numfileargs, parseargs();
- void usage(), mappend(), fappend();
- masternode *list, *j, *mnew, *newmasternode();
- filenode *fnew, *k, *newfilenode();
- FILE *checkit;
- /* Didn't anyone ever tell you it wasn't polite to point? */
-
- /* Parse the arguments and obliterate switch options like `-f'. */
- numfileargs = parseargs(argc,argv);
- /* Anything that survives obliteration is assumed to be a filename. */
-
- /* No, no--what is the good of comparing only one file? */
- if (numfileargs < 2) usage(argv[0]);
-
- /* Find the name of the first file on the command line. */
- for (i=1; argv[i] == NULL; i++) ;
-
- /* This program has two essentially separate functions.
- * One is to take a list of files and group identical ones.
- * The other is to see which of files 2...n are identical to
- * file 1.
- *
- * If you specify -m or -M, you get the second
- * functionality. Otherwise, you get the first.
- *
- * What follows right here is the second functionality.
- */
-
- if (formopt == 'm' || formopt == 'M') {
-
- /* The first file the user named is the one to check the others against.
- */
- char *master = argv[i];
-
- /* If the user said '-M', echo the name of the master
- * file; if not, suppress it. */
- if (formopt == 'M')
- printf("%s\n",master);
-
- for (i += 1; i<argc; i++) {
- if (argv[i] == NULL) continue;
-
- /* If for some reason the name of the master file appears more than
- once on the command line, suppress it. (This happens all the time
- in shell scripts.)
- */
- if (STREQ(argv[i], master)) continue;
-
- match = compare(master, argv[i]);
- if (match == SAME)
- printf("%s\n", argv[i]);
- /* If `match' was an error return, `compare()' printed an error */
- /* message for us and we need do nothing special. */
- }
-
- exit(0); /* This part of the program can't fail. */
- }
-
- /* If we're here, then the user didn't select -m or -M, and we
- do the normal thing, which is to look at all the files and sort them
- into groups.
- */
-
- /* This next bit of code catches a peculiar bug: If it were not here, and
- if we couldn't open the first file named on the command line, we would
- put it into the linked list anyway (list->data->data = argv[i]) and
- subsequent files would get checked against it, yielding many error
- messages, much wasted time, and erroneous output--there would be a
- `Class 1' with the bad file alone in it.
-
- Putting in this check allows us to make much simpler
- list-initialization code. I hate writing special-case code for
- starting off linked lists!
- */
- while (((checkit = fopen(argv[i],"r")) == NULL) &&
- i < argc)
- fprintf(stderr, "Couldn't open file %s.\n", argv[i++]);
- fclose(checkit);
-
- if (i == argc) exit(0); /* Couldn't read *any* of the input files. */
-
- /* Initialize linked lists */
- list = newmasternode();
- list->data->data = argv[i];
- /* Wasn't that simple? Told you so. */
-
- for (i += 1; i < argc; i++) { /* Loop through filenames... */
- if (argv[i] == NULL) continue; /* ... skipping nulls ... */
- match = DIFFERENT;
- j=list;
- do {
- /* ... matching the current file with the file at the head of each */
- /* class-list ... */
- match = compare(argv[i], j->data->data);
- if (match == DIFFERENT) j = j->next;
- /* ... until we run out of class lists or find a match or an error. */
- } while (j && (match == DIFFERENT));
-
- /* Now, if there was an error, then... */
- if (ERROR(match)) {
- /* ... I hope it was in the current file--that's no problem; we just
- obliterate it from the list of files to check, and move on, but...
- */
- if ((match & BADFILE1) == BADFILE1) {
- argv[i] = NULL;
- continue;
- }
- /* ... if the problem was with the file in the class list, I am very
- upset, because it _was_ okay when I put it into the list.
- (I have violated Steinbach's Guideline for Systems Programming:
- ``Never test for an error condition you don't know how to
- handle.'' But actually I could handle this; we could delete the
- bogus file from the class-list in which it appears. This is a lot
- of work and it will happen only very rarely and in bizarre
- circumstances, so I choose not to bother. So sue me.
- */
- else if ((match & BADFILE2) == BADFILE2) {
- fprintf(stderr,"WARNING:\tSomething went wrong with file %s\n",
- j->data->data);
- fprintf(stderr,"since the last time I looked at it.\n");
- /* Yes, Virginia, this is correct behavior. */
- }
- }
-
- /* Okay, there was no error, but the current file was *not* like
- any of the ones we've seen so far. Make a new classification and
- put the current filename into it.
- */
- else if (match == DIFFERENT) {
- mnew = newmasternode();
- mnew->data->data = argv[i];
- mappend(list,mnew);
- }
- /* Ah, we found a match--the current file is identical to the ones in */
- /* the classification j->data. */
- else {
- #ifdef DEBUG
- fprintf(stderr, "%s matched %s.\n", argv[i], j->data->data);
- #endif
- fnew = newfilenode();
- fnew->data = argv[i];
- fappend(j->data, fnew);
- }
- } /* for (i += 1; ... ) */
-
- /* We are out of the main loop and all the files have been handled,
- one way or another. Now it is time to spit out the output.
- */
-
- /* `formopt' is '1' if the user selected the `-1' option. It means
- * that the proram should not do the default thing, which is to make a
- * nice long report of who matched whom, but rather should just dump out
- * a list of files each of which represents exactly one of the classes. */
- if (formopt == '1') {
- for (j=list; j; j=j->next)
- printf("%s\n", j->data->data);
- }
- /* `formopt' is 's' if the user selected the '-s' option. That
- * means that the program should make a short, awkable kind of
- * output, with one line per class, filenames separated by a single
- * space. Note that we do not number the lines. (I almost had it
- * number the lines.) The idea is that if the user wanted the lines
- * numbered, they would pipe the output through 'cat -n'. */
- else if (formopt == 's') {
- for (j=list; j; j=j->next) {
- for (k = j->data; k; k=k->next)
- printf("%s ", k->data);
- printf("\n");
- }
- }
- /* Here we make the nice long report. The temptation to add many
- bells and whistles and have the program accept a format-specification
- string and so on is very tempting, but I will not give in to foolish
- creeping featurism. At least, not any more than I already have.
- Actually, a short-form option, the puts the output in the form
- 1 foo.c bar.c baz.c la.c
- 2 la de da oakum yokum
- 3 cruft FOCUS
- 4 adventure
- might be very useful, because as it is you can't really feed this
- program's output to AWK in a reasonable way.
- */
- /* Note added in proof: I gave in to creeping featurism. See the
- * '-s' option. Sigh. At least I did not make it number the lines. */
- else {
- for (j=list, i=1; j; j=j->next, i++) {
- printf("\nClass %d:\n",i);
- for (k = j->data; k; k=k->next) {
- printf("\t%s\n",k->data);
- }
- }
- }
-
- exit(0); /* Au 'voir! */
- }
-
- /* This next `compare' routine is what I used to do, but there are good
- reasons for not using either diff(1) or cmp(1):
-
- 1. Do not use diff(1) because it is too intelligent (intelligent ->
- slow.) Diff tells you where the files differ and that is not what we
- want--we just want to know if they are different or not.
-
- 2. Do not use cmp(1) because we want to use this program for comparing
- things like /etc/rc.local and /etc/motd which are very likely to differ
- only in a few whitespaces, and we want this program to report that such
- files are identical, even though cmp says they're not.
-
- Maybe UNIX needs a nice, simple, flexible file-compare utility? Naah,
- you can always string awk and sed and things onto the front of cmp. But
- that's too slow for us here.
- */
-
- /* Do not do this:
- int
- compare(path1,path2)
- char *path1, *path2;
- {
- char compare[1024];
-
- sprintf(compare,"cmp -s %s %s",path1,path2);
- sprintf(compare,"diff -w %s %s > /dev/null 2>&1",path1,path2);
- return((system(compare) >> 8 == 0) ? SAME : DIFFERENT );
- }
- */
-
- /* So this is what we do instead. */
-
- int
- compare(path1, path2)
- char *path1, *path2;
- {
- FILE *file1, *file2;
- int c1,c2;
-
- if ((file1 = fopen(path1,"r")) == NULL) {
- fprintf(stderr, "Couldn't open file %s.\n", path1);
- return(BADFILE1);
- }
- if ((file2 = fopen(path2,"r")) == NULL) {
- fprintf(stderr, "Couldn't open file %s.\n", path2);
- return(BADFILE2); /* For symmetry, even though this program will become
- quite irate if `compare' ever returns this code.
- */
- }
-
- do {
- do {
- c1 = getc(file1);
- /* You may need to make a Karnaugh map to understand this termination
- condition, but it essentially means to ignore the right white spaces
- if the right option flags are set, and I have tested it for you,
- so you may assume it is doing the thing that the man page says it
- does.
- */
- } while (! ((!blankflag && !whiteflag) ||
- ((c1 != ' ' && c1 != '\t') && (c1 != '\n' || !whiteflag)))
- ) ;
- do {
- c2 = getc(file2);
- } while (! ((!blankflag && !whiteflag) || /* Ditto */
- ((c2 != ' ' && c2 != '\t') && (c2 != '\n' || !whiteflag)))
- ) ;
-
- /* Fold case if requested with `-f' flag. */
- if (foldflag) {
- c1 = (isupper(c1) ? tolower(c1) : c1);
- c2 = (isupper(c2) ? tolower(c2) : c2);
- }
-
- if (c1 != c2) {
- fclose(file1);
- fclose(file2);
- return DIFFERENT;
- }
-
- } while (c1 != EOF && c2 != EOF);
-
- fclose(file1);
- fclose(file2);
-
- /* If we're here, then both files were identical and we tapped out at */
- /* least one of them. If we tapped out both, they really are identical. */
- /* If, on the other hand, only one is finished, then it is a strict */
- /* prefix of the other and so the two files are *not* the same. */
- if (c1 == EOF && c2 == EOF)
- return SAME;
- else
- return DIFFERENT;
- }
-
- /* Nyahh nyah! User is a big stupid-head! */
- void
- usage(progname)
- char *progname;
- {
- char *tail;
- tail = strrchr(progname,'/');
-
- if (tail) progname = tail+1;
- fprintf(stderr,"Usage:\t %s [-1 | -s | -l | -m | -M] [-f] [-b | -w]\n",progname);
- fprintf(stderr,"\tfile1 file2 [...]\n");
- fprintf(stderr,"\n\nTry %s -h\t for help.\n", progname);
- exit(-1);
- }
-
- /* I put this here 'cause I didn't want to write a man page. Duuhhhhh. */
- void
- help()
- {
- fprintf(stderr,"Classify: Examine and group identical files.\n\n");
- fprintf(stderr,"Flags:\n\t-f\tFold case in file comparisions.\n");
- fprintf(stderr,"\t-b\tIgnore blanks and TABs in file comparisions.\n");
- fprintf(stderr,"\t-w\tIgnore all whitespace in file comparisions.\n");
- fprintf(stderr,"\t-1\tPrint the name of only one file from each class.\n");
- fprintf(stderr,"\t-l\tPrint long-format output (default).\n");
- fprintf(stderr,"\t-s\tPrint short-format output.\n");
- fprintf(stderr,"\t-M\tPrint only names of files that match first file named.\n");
- fprintf(stderr,"\t-m\tLike -M, but suppress first filename.\n");
- return;
- }
-
- /* Parse the args and set the flags.
- We want the argument list to be free-form so you can mix filenames and
- options. That is because I am a masochist. So to save trouble, we just
- obliterate the flag arguments by setting them to NULL, and then we have
- the main routine ignore NULL arguments if it sees any. Programmers who
- say `but then you can't tell when you've reached the end of the arg list
- because it is supposed to be a NULL-terminated array!' get a boot to the
- head.
-
- Returns the number of non-flag arguments.
- */
-
- int
- parseargs(argc,argv)
- int argc;
- char *argv[];
- {
- int i, j, numnonflags = argc-1;
- void usage(), help();
-
- for (i=1; i<argc; i++) {
- if (argv[i][0] != '-') continue;
- numnonflags -= 1;
- if (argv[i][1] == '\0') { /* If flag is "-", stop parsing args */
- /* Probably `-' should mean to read the stdin. I will put in
- that feature three days after next tishabov.
-
- (Translation for gentiles: I will put the feature in on the
- fourth Thursday of next week. )
- */
- argv[i] = NULL;
- return numnonflags;
- }
- for (j=1; argv[i][j]; j++) {
- switch (argv[i][j]) {
- case '-': /* If flag is "--", stop parsing args */
- if (j==1) {
- argv[i] = NULL;
- return;
- } /* Else we got a flag like -f-w, so ignore the second "-" sign. */
- break;
- case 'f':
- foldflag = 1;
- break;
- case 'b':
- blankflag = 1;
- break;
- case 'w':
- whiteflag = 1;
- break;
- case 'l':
- formopt = 'l';
- break;
- case 's':
- formopt = 's';
- break;
- case '1':
- formopt = '1';
- break;
- case 'h':
- help(); /* ``Why does this function return?''
- `` `Cause you're an idiot.''
- ``Oh yeah. I forgot.''
- */
- exit(0);
- break;
- case 'm':
- formopt = 'm';
- break;
- case 'M':
- formopt = 'M';
- break;
- default:
- fprintf(stderr, "Unknown option: -%c.\n", argv[i][j]);
- usage(argv[0]);
- }
- }
- if (argv[i][0] == '-') argv[i] = NULL; /* Obliterate flag arguments. */
- }
- return numnonflags;
- }
-
- /* Manufacture a new masternode whose car is a new filenode. Return a */
- /* pointer to the new masternode. */
- masternode *
- newmasternode()
- {
- masternode *foo;
- filenode *newfilenode();
-
- foo = NEW(masternode);
- foo->next = NULL;
- foo->data = newfilenode();
-
- return(foo);
- }
-
- /* Manufacture a new filenode whose car is the null string. Return a */
- /* pointer to the new filenode. */
- filenode *
- newfilenode()
- {
- filenode *foo;
-
- foo = NEW(filenode);
- foo->next = NULL;
- foo->data = NULL;
-
- return(foo);
- }
-
- /* head and tail are pointers to masternodes. (i.e., they are linked lists */
- /* of masternodes.) Append tail to the end of head. (LISP pepole would */
- /* call this operation `nconc'. I can't say the word `nconc' without */
- /* bursting out laughing, so I called it `mappend' instead.) */
- void
- mappend(head,tail)
- masternode *head, *tail;
- {
- masternode *i;
-
- /* Find the end of the linked list `head' */
- for (i=head; i->next; i = i->next) ;
-
- /* Concatenate. */
- i->next = tail;
-
- return;
- }
-
- /* This is the same as mappend, except it works on filenode-lists instead */
- /* of masternode-lists. Big deal. */
- void
- fappend(head,tail)
- filenode *head, *tail;
- {
- filenode *i;
-
- for (i=head; i->next; i = i->next) ;
-
- /* nconc! nconc! nconc! hahahaha! */
- i->next = tail;
-
- return;
- }
-
-
-