home *** CD-ROM | disk | FTP | other *** search
- From: davidsen@crdos1.crd.ge.com (Bill Davidsen)
- Newsgroups: comp.sources.misc
- Subject: v17i013: finddup - Find duplicate files, Part01/01
- Message-ID: <1991Feb24.041419.20094@sparky.IMD.Sterling.COM>
- Date: 24 Feb 91 04:14:19 GMT
- Approved: kent@sparky.imd.sterling.com
- X-Checksum-Snefru: 82482af9 35a55990 18851344 93d80b3e
-
- Submitted-by: Bill Davidsen <davidsen@crdos1.crd.ge.com>
- Posting-number: Volume 17, Issue 13
- Archive-name: finddup/part01
- Supersedes: finddup: Volume 16, Issue 95
-
- #!/bin/sh
- # shar: Shell Archiver (v1.29)
- #
- # Run the following text with /bin/sh to create:
- # finddup.c
- # finddup.1C
- # makefile
- # getopt.c
- #
- echo "x - extracting finddup.c (Text)"
- sed 's/^X//' << 'SHAR_EOF' > finddup.c &&
- X/****************************************************************\
- X| finddup.c - the one true find duplicate files program
- X|----------------------------------------------------------------
- X| Bill Davidsen, last hacked 2/22/91
- X| Copyright (c) 1991 by Bill Davidsen, all rights reserved. This
- X| program may be freely used and distributed in its original
- X| form. Modified versions must be distributed with the original
- X| unmodified source code, and redistribution of the original code
- X| or any derivative program may not be restricted.
- X|----------------------------------------------------------------
- X| Calling sequence:
- X| finddup [-l] checklist
- X|
- X| where checklist is the name of a file containing filenames to
- X| be checked, such as produced by "find . -type f -print >file"
- X| returns a list of linked and duplicated files.
- X|
- X| If the -l option is used the hard links will not be displayed.
- X\***************************************************************/
- X
- X#include <stdio.h>
- X#include <sys/types.h>
- X#include <sys/stat.h>
- X#include <malloc.h>
- X
- X/* parameters */
- X#define MAXFN 120 /* max filename length */
- X
- X/* constants */
- X#define EOS ((char) '\0') /* end of string */
- X#define FL_CRC 0x0001 /* flag if CRC valid */
- X#define FL_DUP 0x0002 /* files are duplicates */
- X#define FL_LNK 0x0004 /* file is a link */
- X
- X/* macros */
- X#ifdef DEBUG
- X#define debug(X) if (DebugFlg) printf X
- X#define OPTSTR "lhd"
- X#else
- X#define debug(X)
- X#define OPTSTR "lh"
- X#endif
- X#define SORT qsort((char *)filelist, n_files, sizeof(filedesc), comp1);
- X#define GetFlag(x,f) ((filelist[x].flags & (f)) != 0)
- X#define SetFlag(x,f) (filelist[x].flags |= (f))
- X
- Xtypedef struct {
- X off_t length; /* file length */
- X unsigned long crc32; /* CRC for same length */
- X dev_t device; /* physical device # */
- X ino_t inode; /* inode for link detect */
- X off_t nameloc; /* name loc in names file */
- X char flags; /* flags for compare */
- X} filedesc;
- X
- Xfiledesc *filelist; /* master sorted list of files */
- Xlong n_files = 0; /* # files in the array */
- Xlong max_files = 0; /* entries allocated in the array */
- Xint linkflag = 1; /* show links */
- Xint DebugFlg = 0; /* inline debug flag */
- XFILE *namefd; /* file for names */
- Xextern int
- X opterr, /* error control flag */
- X optind; /* index for next arg */
- X
- X/* help message, in a table format */
- Xstatic char *HelpMsg[] = {
- X "Calling sequence:",
- X "",
- X " finddup [options] list",
- X "",
- X "where list is a list of files to check, such as generated",
- X "by \"find . -type f -print > file\"",
- X "",
- X "Options:",
- X " -l - don't list hard links",
- X#ifdef DEBUG
- X " -d - debug (must compile with DEBUG)"
- X#endif /* ?DEBUG */
- X};
- Xstatic HelpLen = sizeof(HelpMsg)/sizeof(char *);
- X
- X#ifndef lint
- Xstatic char *SCCSid[] = {
- X "@(#)finddup.c v1.13 - 2/22/91",
- X "Copyright (c) 1991 by Bill Davidsen, all rights reserved"
- X};
- X#endif
- X
- Xint comp1(); /* compare two filedesc's */
- Xvoid scan1(); /* make the CRC scan */
- Xvoid scan2(); /* do full compare if needed */
- Xvoid scan3(); /* print the results */
- Xunsigned long get_crc(); /* get crc32 on a file */
- Xchar *getfn(); /* get a filename by index */
- X
- Xmain(argc, argv)
- Xint argc;
- Xchar *argv[];
- X{
- X char curfile[MAXFN];
- X struct stat statbuf;
- X int ch;
- X int firsterr = 0; /* flag on 1st error for format */
- X int firsttrace = 0; /* flag for 1st trace output */
- X off_t loc; /* location of name in the file */
- X int zl_hdr = 1; /* need header for zero-length files list */
- X filedesc *curptr; /* pointer to current storage loc */
- X
- X /* parse options, if any */
- X opterr = 0;
- X while ((ch = getopt(argc, argv, OPTSTR)) != EOF) {
- X switch (ch) {
- X case 'l': /* set link flag */
- X linkflag = 0;
- X break;
- X#ifdef DEBUG
- X case 'd': /* debug */
- X ++DebugFlg;
- X break;
- X#endif /* ?DEBUG */
- X case 'h': /* help */
- X case '?':
- X for (ch = 0; ch < HelpLen; ++ch) {
- X printf("%s\n", HelpMsg[ch]);
- X }
- X exit(1);
- X }
- X }
- X
- X /* correct for the options */
- X argc -= (optind-1);
- X argv += (optind-1);
- X
- X /* check for filename given, and open it */
- X if (argc != 2) {
- X fprintf(stderr, "Needs name of file with filenames\n");
- X exit(1);
- X }
- X namefd = fopen(argv[1], "r");
- X if (namefd == NULL) {
- X perror("Can't open names file");
- X exit(1);
- X }
- X
- X /* start the list of name info's */
- X filelist = (filedesc *) malloc(50 * sizeof(filedesc));
- X if (filelist == NULL) {
- X perror("Can't start files vector");
- X exit(1);
- X }
- X /* finish the pointers */
- X max_files = 50;
- X debug(("First vector allocated @ %08lx, size %d bytes\n",
- X (long) filelist, 50*sizeof(filedesc)
- X ));
- X fprintf(stderr, "build list...");
- X
- X /* this is the build loop */
- X while (loc = ftell(namefd), fgets(curfile, MAXFN, namefd) != NULL) {
- X /* check for room in the buffer */
- X if (n_files == max_files) {
- X /* allocate more space */
- X max_files += 50;
- X filelist =
- X (filedesc *) realloc(filelist, (max_files)*sizeof(filedesc));
- X if (filelist == NULL) {
- X perror("Out of memory!");
- X exit(1);
- X }
- X debug(("Got more memory!\n"));
- X }
- X curfile[strlen(curfile)-1] = EOS;
- X
- X /* add the data for this one */
- X if (stat(curfile, &statbuf)) {
- X fprintf(stderr, "%c %s - ",
- X (firsterr++ == 0 ? '\n' : '\r'), curfile
- X );
- X perror("ignored");
- X continue;
- X }
- X
- X /* check for zero length files */
- X if ( statbuf.st_size == 0) {
- X if (zl_hdr) {
- X zl_hdr = 0;
- X printf("Zero length files:\n\n");
- X }
- X printf("%s\n", curfile);
- X continue;
- X }
- X
- X curptr = filelist + n_files++;
- X curptr->nameloc = loc;
- X curptr->length = statbuf.st_size;
- X curptr->device = statbuf.st_dev;
- X curptr->inode = statbuf.st_ino;
- X curptr->flags = 0;
- X debug(("%cName[%d] %s, size %ld, inode %d\n",
- X (firsttrace++ == 0 ? '\n' : '\r'), n_files, curfile,
- X (long) statbuf.st_size, statbuf.st_ino
- X ));
- X }
- X
- X /* sort the list by size, device, and inode */
- X fprintf(stderr, "sort...");
- X SORT;
- X
- X /* make the first scan for equal lengths */
- X fprintf(stderr, "scan1...");
- X scan1();
- X
- X /* make the second scan for dup CRC also */
- X fprintf(stderr, "scan2...");
- X scan2();
- X
- X fprintf(stderr, "done\n");
- X
- X#ifdef DEBUG
- X for (loc = 0; DebugFlg > 1 && loc < n_files; ++loc) {
- X curptr = filelist + loc;
- X printf("%8ld %08lx %6d %6d %02x\n",
- X curptr->length, curptr->crc32,
- X curptr->device, curptr->inode,
- X curptr->flags
- X );
- X }
- X#endif
- X
- X /* now scan and output dups */
- X scan3();
- X
- X exit(0);
- X}
- X
- X/* comp1 - compare two values */
- Xint
- Xcomp1(p1, p2)
- Xchar *p1, *p2;
- X{
- X register filedesc *p1a = (filedesc *)p1, *p2a = (filedesc *)p2;
- X register int retval;
- X
- X (retval = p1a->length - p2a->length) ||
- X (retval = p1a->crc32 - p2a->crc32) ||
- X (retval = p1a->device - p2a->device) ||
- X (retval = p1a->inode - p2a->inode);
- X
- X return retval;
- X}
- X
- X/* scan1 - get a CRC32 for files of equal length */
- X
- Xvoid
- Xscan1() {
- X FILE *fp;
- X int ix, needsort = 0;
- X
- X for (ix = 1; ix < n_files; ++ix) {
- X if (filelist[ix-1].length == filelist[ix].length) {
- X /* get a CRC for each */
- X if (! GetFlag(ix-1, FL_CRC)) {
- X filelist[ix-1].crc32 = get_crc(ix-1);
- X SetFlag(ix-1, FL_CRC);
- X }
- X if (! GetFlag(ix, FL_CRC)) {
- X filelist[ix].crc32 = get_crc(ix);
- X SetFlag(ix, FL_CRC);
- X }
- X needsort = 1;
- X }
- X }
- X
- X if (needsort) SORT;
- X}
- X
- X/* scan2 - full compare if CRC is equal */
- X
- Xvoid
- Xscan2() {
- X int ix, ix2, lastix;
- X int inmatch; /* 1st filename has been printed */
- X int need_hdr = 1; /* Need a hdr for the hard link list */
- X int lnkmatch; /* flag for matching links */
- X register filedesc *p1, *p2;
- X filedesc wkdesc;
- X
- X /* mark links and output before dup check */
- X for (ix = 0; ix < n_files; ix = ix2) {
- X p1 = filelist + ix;
- X for (ix2 = ix+1, p2 = p1+1, inmatch = 0;
- X ix2 < n_files
- X && p1->device == p2->device
- X && p1->inode == p2->inode;
- X ++ix2, ++p2
- X ) {
- X SetFlag(ix2, FL_LNK);
- X if (linkflag) {
- X if (need_hdr) {
- X need_hdr = 0;
- X printf("\n\nHard link summary:\n\n");
- X }
- X
- X if (!inmatch) {
- X inmatch = 1;
- X printf("\nFILE: %s\n", getfn(ix));
- X }
- X printf("LINK: %s\n", getfn(ix2));
- X }
- X }
- X }
- X debug(("\nStart dupscan"));
- X
- X /* now really scan for duplicates */
- X for (ix = 0; ix < n_files; ix = lastix) {
- X p1 = filelist + ix;
- X for (lastix = ix2 = ix+1, p2 = p1+1, lnkmatch = 1;
- X ix2 < n_files
- X && p1->length == p2->length
- X && p1->crc32 == p2->crc32;
- X ++ix2, ++p2
- X ) {
- X if ((GetFlag(ix2, FL_LNK) && lnkmatch)
- X || fullcmp(ix, ix2) == 0
- X ) {
- X SetFlag(ix2, FL_DUP);
- X /* move if needed */
- X if (lastix != ix2) {
- X int n1, n2;
- X
- X debug(("\n swap %d and %d", lastix, ix2));
- X wkdesc = filelist[ix2];
- X for (n1 = ix2; n1 > lastix; --n1) {
- X filelist[n1] = filelist[n1-1];
- X }
- X filelist[lastix++] = wkdesc;
- X }
- X lnkmatch = 1;
- X }
- X else {
- X /* other links don't match */
- X lnkmatch = 0;
- X }
- X }
- X }
- X}
- X
- X/* scan3 - output dups */
- X
- Xvoid
- Xscan3()
- X{
- X register filedesc *p1, *p2;
- X int ix, ix2, inmatch, need_hdr = 1;
- X char *fnptr; /* pointer to the filename for sups */
- X char headfn[132]; /* temp holder of the filename */
- X
- X /* now repeat for duplicates, links or not */
- X for (ix = 0; ix < n_files; ++ix) {
- X if (GetFlag(ix, FL_DUP)) {
- X /* put out a header if you haven't */
- X if (!inmatch) {
- X strcpy(headfn, getfn(ix-1));
- X fnptr = headfn;
- X }
- X inmatch = 1;
- X if (linkflag || !GetFlag(ix, FL_LNK)) {
- X /* header on the very first */
- X if (need_hdr) {
- X need_hdr = 0;
- X printf("\n\nList of files with duplicate contents");
- X if (linkflag) printf(" (includes hard links)");
- X putchar('\n');
- X }
- X
- X /* 1st filename if any dups */
- X if (fnptr != NULL) {
- X printf("\nFILE: %s\n", fnptr);
- X fnptr = NULL;
- X }
- X printf("DUP: %s\n", getfn(ix));
- X }
- X }
- X else inmatch = 0;
- X }
- X}
- X
- X/* get_crc - get a CRC32 for a file */
- X
- Xunsigned long
- Xget_crc(ix)
- Xint ix;
- X{
- X FILE *fp;
- X register unsigned long val1 = 0x90909090, val2 = 0xeaeaeaea;
- X register int carry;
- X int ch;
- X char fname[MAXFN];
- X
- X /* open the file */
- X fseek(namefd, filelist[ix].nameloc, 0);
- X fgets(fname, MAXFN, namefd);
- X fname[strlen(fname)-1] = EOS;
- X debug(("\nCRC start - %s ", fname));
- X if ((fp = fopen(fname, "r")) == NULL) {
- X fprintf(stderr, "Can't read file %s\n", fname);
- X exit(1);
- X }
- X
- X /* build the CRC values */
- X while ((ch = fgetc(fp)) != EOF) {
- X carry = (val1 & 0x8000000) != 0;
- X val1 = (val1 << 1) ^ ch + carry;
- X val2 += ch << (ch & 003);
- X }
- X fclose(fp);
- X debug(("v1: %08lx v2: %08lx ", val1, val2));
- X
- X return ((val1 & 0xffff) << 12) ^ (val2 && 0xffffff);
- X}
- X
- X/* getfn - get filename from index */
- X
- Xchar *
- Xgetfn(ix)
- Xoff_t ix;
- X{
- X static char fnbuf[MAXFN];
- X
- X fseek(namefd, filelist[ix].nameloc, 0);
- X fgets(fnbuf, MAXFN, namefd);
- X fnbuf[strlen(fnbuf)-1] = EOS;
- X
- X return fnbuf;
- X}
- X
- X/* fullcmp - compare two files, bit for bit */
- X
- Xint
- Xfullcmp(v1, v2)
- Xint v1, v2;
- X{
- X FILE *fp1, *fp2;
- X char filename[MAXFN];
- X register int ch;
- X
- X /* open the files */
- X strcpy(filename, getfn(v1));
- X fp1 = fopen(filename, "r");
- X if (fp1 == NULL) {
- X fprintf(stderr, "%s: ", filename);
- X perror("can't access for read");
- X exit(1);
- X }
- X debug(("\nFull compare %s\n and", filename));
- X
- X strcpy(filename, getfn(v2));
- X fp2 = fopen(filename, "r");
- X if (fp2 == NULL) {
- X fprintf(stderr, "%s: ", filename);
- X perror("can't access for read");
- X exit(1);
- X }
- X debug(("%s", filename));
- X
- X /* now do the compare */
- X while ((ch = getc(fp1)) != EOF) {
- X if (ch - getc(fp2)) break;
- X }
- X
- X /* close files and return value */
- X fclose(fp1);
- X fclose(fp2);
- X debug(("\n return %d", !(ch == EOF)));
- X return (!(ch == EOF));
- X}
- SHAR_EOF
- chmod 0444 finddup.c || echo "restore of finddup.c fails"
- echo "x - extracting finddup.1C (Text)"
- sed 's/^X//' << 'SHAR_EOF' > finddup.1C &&
- X.TH FINDDUP 1 LOCAL
- X.SH NAME
- Xfinddup - find duplicate files in a list
- X.SH SYNOPSIS
- Xfinddup [options] filename
- X.SH DESCRIPTION
- X.ds fd \fBfinddup\fP
- X\*(fd reads a list of filenames from the named file and scans them,
- Xbuilding a list of duplicate files and hard links. These are then
- Xwritten to stdout for the user's information. This can be used to reduce
- Xdisk usage, etc.
- X.SS OPTIONS
- X -l - don't show info on hard links
- X -d - debug. May be used more than once for more info
- X.SS How it works
- X\*(fd stats each name and saves the file length, device, and inode. It
- Xthen sorts the list and builds a CRC for each file which has the same
- Xlength as another file. For files which have the same length and CRC, a
- Xbyte by byte comparison is done to be sure that they are duplicates.
- X.sp
- XThe CRC step for N files of size S bytes requires reading n*S total
- Xbytes, while the byte by byte check must be done for every file against
- Xevery other, and read S*N*(N-1) bytes. Thus the CRC is a large timesaver
- Xin most cases.
- X.SH EXAMPLES
- X $ find /u -type f -print > file.list.tmp
- X $ finddup file.list.tmp
- X.SH FILES
- XOnly the file with the filenames.
- X.SH SEE ALSO
- Xfind(1).
- X.SH DIAGNOSTICS
- XIf files are named in the specification file but not present they will
- Xbe ignored. If an existing file can not be read the program will
- Xterminate rather than generate an incomplete list of duplicates.
- X.SH LIMITATIONS
- XAn option to generate a partial list could be added when a file can't be
- Xaccessed. An option to list only duplites which are not hard links could
- Xbe added.
- X.SH AUTHOR
- XBill Davidsen, davidsen@crdos1.crd.ge.com
- X.SH Copyright
- XCopyright (c) 1991 by Bill Davidsen, all rights reserved. This
- Xprogram may be freely used and distributed in its original
- Xform. Modified versions must be distributed with the original
- Xunmodified source code, and redistribution of the original code
- Xor any derivative program may not be restricted.
- SHAR_EOF
- chmod 0666 finddup.1C || echo "restore of finddup.1C fails"
- echo "x - extracting makefile (Text)"
- sed 's/^X//' << 'SHAR_EOF' > makefile &&
- X# makefile for finddup
- X
- X# ================> custom here
- XCC = /bin/cc
- XCFLAGS = -O
- XLFLAGS =
- XLIBS =
- X# if you don't have getopt on your library, add getopt.o here
- XOPBLST = finddup.o
- X# your favorite shar prog
- XSHAR = shar
- XSHARLST = finddup.c finddup.1C makefile getopt.c
- XBACKLST = s.finddup.c finddup.1C makefile getopt.c
- X# ================> custom ends
- X
- Xfinddup: $(OBJLST)
- X $(CC) -o finddup $(LFLAGS) $(OBJLST) -s
- X
- Xdebug: rfinddup
- X
- X# note that this does NOT leave an object file
- Xrfinddup: finddup.c
- X $(CC) -o rfinddup -g -DDEBUG finddup.c
- X
- Xshar: finddup.shar
- X
- Xfinddup.shar: $(SHARLST)
- X $(SHAR) $(SHARLST) > finddup.shar
- X
- Xclean:
- X if [ -f p.finddup.c -o ! -f s.finddup.c ]; then exit 1; fi
- X rm -f finddup.[co] finddup rfinddup finddup.shar
- X
- Xbackup: clean finddup.zoo
- X
- Xfinddup.zoo: $(BACKLST)
- X zoo aunM finddup $(BACKLST)
- SHAR_EOF
- chmod 0644 makefile || echo "restore of makefile fails"
- echo "x - extracting getopt.c (Text)"
- sed 's/^X//' << 'SHAR_EOF' > getopt.c &&
- X/*
- X** @(#)getopt.c 2.5 (smail) 9/15/87
- X*/
- X
- X/*
- X * Here's something you've all been waiting for: the AT&T public domain
- X * source for getopt(3). It is the code which was given out at the 1985
- X * UNIFORUM conference in Dallas. I obtained it by electronic mail
- X * directly from AT&T. The people there assure me that it is indeed
- X * in the public domain.
- X *
- X * There is no manual page. That is because the one they gave out at
- X * UNIFORUM was slightly different from the current System V Release 2
- X * manual page. The difference apparently involved a note about the
- X * famous rules 5 and 6, recommending using white space between an option
- X * and its first argument, and not grouping options that have arguments.
- X * Getopt itself is currently lenient about both of these things White
- X * space is allowed, but not mandatory, and the last option in a group can
- X * have an argument. That particular version of the man page evidently
- X * has no official existence, and my source at AT&T did not send a copy.
- X * The current SVR2 man page reflects the actual behavor of this getopt.
- X * However, I am not about to post a copy of anything licensed by AT&T.
- X */
- X
- X
- X/*LINTLIBRARY*/
- X#define NULL 0
- X#define EOF (-1)
- X#define ERR(s, c) if(opterr){\
- X extern int write();\
- X char errbuf[2];\
- X errbuf[0] = c; errbuf[1] = '\n';\
- X (void) write(2, argv[0], (unsigned)strlen(argv[0]));\
- X (void) write(2, s, (unsigned)strlen(s));\
- X (void) write(2, errbuf, 2);}
- X
- Xextern char *strchr();
- X
- Xint opterr = 1;
- Xint optind = 1;
- Xint optopt;
- Xchar *optarg;
- X
- Xint
- Xgetopt(argc, argv, opts)
- Xint argc;
- Xchar **argv, *opts;
- X{
- X static int sp = 1;
- X register int c;
- X register char *cp;
- X
- X if(sp == 1)
- X if(optind >= argc ||
- X argv[optind][0] != '-' || argv[optind][1] == '\0')
- X return(EOF);
- X else if(strcmp(argv[optind], "--") == NULL) {
- X optind++;
- X return(EOF);
- X }
- X optopt = c = argv[optind][sp];
- X if(c == ':' || (cp=strchr(opts, c)) == NULL) {
- X ERR(": illegal option -- ", c);
- X if(argv[optind][++sp] == '\0') {
- X optind++;
- X sp = 1;
- X }
- X return('?');
- X }
- X if(*++cp == ':') {
- X if(argv[optind][sp+1] != '\0')
- X optarg = &argv[optind++][sp+1];
- X else if(++optind >= argc) {
- X ERR(": option requires an argument -- ", c);
- X sp = 1;
- X return('?');
- X } else
- X optarg = argv[optind++];
- X sp = 1;
- X } else {
- X if(argv[optind][++sp] == '\0') {
- X sp = 1;
- X optind++;
- X }
- X optarg = NULL;
- X }
- X return(c);
- X}
- SHAR_EOF
- chmod 0644 getopt.c || echo "restore of getopt.c fails"
- exit 0
- --
- bill davidsen (ibmbin-request@crdgw1.crd.ge.com, uunet!crdgw1!ibmbin-request)
- moderator c.b.i.p newsgroup and 386users mailing list.
- binary submissions to ibmbin@crdgw1.crd.ge.com (uunet!crdgw1!ibmbin)
- "Stupidity, like virtue, is its own reward" -me
-
- exit 0 # Just in case...
- --
- Kent Landfield INTERNET: kent@sparky.IMD.Sterling.COM
- Sterling Software, IMD UUCP: uunet!sparky!kent
- Phone: (402) 291-8300 FAX: (402) 291-4362
- Please send comp.sources.misc-related mail to kent@uunet.uu.net.
-