home *** CD-ROM | disk | FTP | other *** search
- From: davidsen@crdos1.crd.ge.com (Bill Davidsen)
- Newsgroups: comp.sources.misc
- Subject: v16i095: finddup - Find duplicate files, Part01/01
- Message-ID: <1991Feb7.185240.16151@sparky.IMD.Sterling.COM>
- Date: 7 Feb 91 18:52:40 GMT
- Approved: kent@sparky.imd.sterling.com
- X-Checksum-Snefru: d288c996 f18e7c14 36b46d00 0ca5a73c
-
- Submitted-by: davidsen@crdos1.crd.ge.com (Bill Davidsen)
- Posting-number: Volume 16, Issue 95
- Archive-name: finddup/part01
-
- This is a program I wrote to help a friend sort through 300MB of GIF
- files, many of which were duplicates with different names, like
- "redwood.gif" and "bigtree.gif."
-
- It does a fairly fast and very complete scan for duplicates, and
- identifies hard links as well. Files with duplicate length are checked
- by CRC, and then if they still match a byte by byte compare is done.
- This seems to be the minimum which will find all duplicates while
- avoiding any possibility of a false match.
-
- Bill
- ---------------- Cut here --------------------
- #!/bin/sh
- # shar: Shell Archiver (v1.29)
- #
- # Run the following text with /bin/sh to create:
- # finddup.c
- # finddup.1C
- # makefile
- #
- echo "x - extracting finddup.c (Text)"
- sed 's/^X//' << 'SHAR_EOF' > finddup.c &&
- X/*****************************************************************
- X | finddup.c - the one true find duplicate files program
- X |----------------------------------------------------------------
- X | Bill Davidsen, last hacked 2/5/91
- X | Copyright (c) 1991 by Bill Davidsen, all rights reserved. This
- X | program may be freely used and distributed in its original
- X | form. Modified versions must be distributed with the original
- X | unmodified source code, and redistribution of the original code
- X | or any derivative program may not be restricted.
- X |----------------------------------------------------------------
- X | Calling sequence:
- X | finddup checklist
- X |
- X | where checklist is the name of a file containing filenames to
- X | be checked, such as produced by "find . -type f -print >file"
- X | returns a list of linked and duplicated files.
- X ****************************************************************/
- X
- X#include <stdio.h>
- X#include <sys/types.h>
- X#include <sys/stat.h>
- X#include <malloc.h>
- X
- X/* parameters */
- X#define MAXFN 120 /* max filename length */
- X
- X/* constants */
- X#define EOS ((char) '\0') /* end of string */
- X#define FL_CRC 0x0001 /* flag if CRC valid */
- X#define FL_DUP 0x0002 /* files are duplicates */
- X#define FL_LNK 0x0004 /* file is a link */
- X
- X/* macros */
- X#ifdef DEBUG
- X#define debug(X) printf X
- X#else
- X#define debug(X)
- X#endif
- X#define SORT qsort((char *)filelist, n_files, sizeof(filedesc), comp1);
- X#define GetFlag(x,f) ((filelist[x].flags & (f)) != 0)
- X#define SetFlag(x,f) (filelist[x].flags |= (f))
- X
- Xtypedef struct {
- X off_t length; /* file length */
- X unsigned long crc32; /* CRC for same length */
- X dev_t device; /* physical device # */
- X ino_t inode; /* inode for link detect */
- X off_t nameloc; /* name loc in names file */
- X char flags; /* flags for compare */
- X} filedesc;
- X
- Xfiledesc *filelist; /* master sorted list of files */
- Xlong n_files = 0; /* # files in the array */
- Xlong max_files = 0; /* entries allocated in the array */
- XFILE *namefd; /* file for names */
- X#ifndef lint
- Xstatic char *SCCSid[] = {
- X "@(#)finddup.c v1.10 - 2/5/91",
- X "Copyright (c) 1991 by Bill Davidsen, all rights reserved"
- X};
- X#endif
- X
- Xint comp1(); /* compare two filedesc's */
- Xvoid scan1(); /* make the CRC scan */
- Xvoid scan2(); /* do full compare if needed */
- Xvoid scan3(); /* print the results */
- Xunsigned long get_crc(); /* get crc32 on a file */
- Xchar *getfn(); /* get a filename by index */
- X
- Xmain(argc, argv)
- Xint argc;
- Xchar *argv[];
- X{
- X char curfile[MAXFN];
- X struct stat statbuf;
- X off_t loc; /* location of name in the file */
- X int zl_hdr = 1; /* need header for zero-length files list */
- X filedesc *curptr; /* pointer to current storage loc */
- X
- X /* check for filename given, and open it */
- X if (argc != 2) {
- X fprintf(stderr, "Needs name of file with filenames\n");
- X exit(1);
- X }
- X namefd = fopen(argv[1], "r");
- X if (namefd == NULL) {
- X perror("Can't open names file");
- X exit(1);
- X }
- X
- X /* start the list of name info's */
- X filelist = (filedesc *) malloc(50 * sizeof(filedesc));
- X if (filelist == NULL) {
- X perror("Can't start files vector");
- X exit(1);
- X }
- X /* finish the pointers */
- X max_files = 50;
- X debug(("First vector allocated @ %08lx, size %d bytes\n",
- X (long) filelist, 50*sizeof(filedesc)
- X ));
- X fprintf(stderr, "build list...");
- X
- X /* this is the build loop */
- X while (loc = ftell(namefd), fgets(curfile, MAXFN, namefd) != NULL) {
- X /* check for room in the buffer */
- X if (n_files == max_files) {
- X /* allocate more space */
- X max_files += 50;
- X filelist =
- X (filedesc *) realloc(filelist, (max_files)*sizeof(filedesc));
- X if (filelist == NULL) {
- X perror("Out of memory!");
- X exit(1);
- X }
- X debug(("Got more memory!\n"));
- X }
- X curfile[strlen(curfile)-1] = EOS;
- X
- X /* add the data for this one */
- X if (stat(curfile, &statbuf)) {
- X fprintf(stderr, "%s - ", curfile);
- X perror("ignored");
- X continue;
- X }
- X
- X /* check for zero length files */
- X if ( statbuf.st_size == 0) {
- X if (zl_hdr) {
- X zl_hdr = 0;
- X printf("Zero length files:\n\n");
- X }
- X printf("%s\n", curfile);
- X continue;
- X }
- X
- X curptr = filelist + n_files++;
- X curptr->nameloc = loc;
- X curptr->length = statbuf.st_size;
- X curptr->device = statbuf.st_dev;
- X curptr->inode = statbuf.st_ino;
- X curptr->flags = 0;
- X debug(("Name[%d] %s, size %ld, inode %d\n", n_files, curfile,
- X (long) statbuf.st_size, statbuf.st_ino
- X ));
- X }
- X
- X /* sort the list by size, device, and inode */
- X fprintf(stderr, "sort...");
- X SORT;
- X
- X /* make the first scan for equal lengths */
- X fprintf(stderr, "scan1...");
- X scan1();
- X
- X /* make the second scan for dup CRC also */
- X fprintf(stderr, "scan2...");
- X scan2();
- X
- X fprintf(stderr, "done\n");
- X
- X#ifdef DEBUG
- X for (loc = 0; loc < n_files; ++loc) {
- X curptr = filelist + loc;
- X printf("%8ld %08lx %6d %6d %02x\n",
- X curptr->length, curptr->crc32,
- X curptr->device, curptr->inode,
- X curptr->flags
- X );
- X }
- X#endif
- X
- X /* now scan and output dups */
- X scan3();
- X
- X exit(0);
- X}
- X
- X/* comp1 - compare two values */
- Xint
- Xcomp1(p1, p2)
- Xchar *p1, *p2;
- X{
- X register filedesc *p1a = (filedesc *)p1, *p2a = (filedesc *)p2;
- X register int retval;
- X
- X (retval = p1a->length - p2a->length) ||
- X (retval = p1a->crc32 - p2a->crc32) ||
- X (retval = p1a->device - p2a->device) ||
- X (retval = p1a->inode - p2a->inode);
- X
- X return retval;
- X}
- X
- X/* scan1 - get a CRC32 for files of equal length */
- X
- Xvoid
- Xscan1() {
- X FILE *fp;
- X int ix, needsort = 0;
- X
- X for (ix = 1; ix < n_files; ++ix) {
- X if (filelist[ix-1].length == filelist[ix].length) {
- X /* get a CRC for each */
- X if (! GetFlag(ix-1, FL_CRC)) {
- X filelist[ix-1].crc32 = get_crc(ix-1);
- X SetFlag(ix-1, FL_CRC);
- X }
- X if (! GetFlag(ix, FL_CRC)) {
- X filelist[ix].crc32 = get_crc(ix);
- X SetFlag(ix, FL_CRC);
- X }
- X needsort = 1;
- X }
- X }
- X
- X if (needsort) SORT;
- X}
- X
- X/* scan2 - full compare if CRC is equal */
- X
- Xvoid
- Xscan2() {
- X int ix, ix2, lastix;
- X int inmatch; /* 1st filename has been printed */
- X int need_hdr = 1; /* Need a hdr for the hard link list */
- X int lnkmatch; /* flag for matching links */
- X register filedesc *p1, *p2;
- X filedesc wkdesc;
- X
- X /* mark links and output before dup check */
- X for (ix = 0; ix < n_files; ix = ix2) {
- X p1 = filelist + ix;
- X for (ix2 = ix+1, p2 = p1+1, inmatch = 0;
- X ix2 < n_files
- X && p1->device == p2->device
- X && p1->inode == p2->inode;
- X ++ix2, ++p2
- X ) {
- X SetFlag(ix2, FL_LNK);
- X if (need_hdr) {
- X need_hdr = 0;
- X printf("\n\nHard link summary:\n\n");
- X }
- X
- X if (!inmatch) {
- X inmatch = 1;
- X printf("\nFILE: %s\n", getfn(ix));
- X }
- X printf("LINK: %s\n", getfn(ix2));
- X }
- X }
- X debug(("\nStart dupscan"));
- X
- X /* now really scan for duplicates */
- X for (ix = 0; ix < n_files; ix = lastix) {
- X p1 = filelist + ix;
- X for (lastix = ix2 = ix+1, p2 = p1+1, lnkmatch = 1;
- X ix2 < n_files
- X && p1->length == p2->length
- X && p1->crc32 == p2->crc32;
- X ++ix2, ++p2
- X ) {
- X if ((GetFlag(ix2, FL_LNK) && lnkmatch)
- X || fullcmp(ix, ix2) == 0
- X ) {
- X SetFlag(ix2, FL_DUP);
- X /* move if needed */
- X if (lastix != ix2) {
- X int n1, n2;
- X
- X debug(("\n swap %d and %d", lastix, ix2));
- X wkdesc = filelist[ix2];
- X for (n1 = ix2; n1 > lastix; --n1) {
- X filelist[n1] = filelist[n1-1];
- X }
- X filelist[lastix++] = wkdesc;
- X }
- X lnkmatch = 1;
- X }
- X else {
- X /* other links don't match */
- X lnkmatch = 0;
- X }
- X }
- X }
- X}
- X
- X/* scan3 - output dups */
- X
- Xvoid
- Xscan3()
- X{
- X register filedesc *p1, *p2;
- X int ix, ix2, inmatch, need_hdr = 1;
- X
- X /* now repeat for duplicates, links or not */
- X for (ix = 0; ix < n_files; ++ix) {
- X if (GetFlag(ix, FL_DUP)) {
- X /* header on the very first */
- X if (need_hdr) {
- X need_hdr = 0;
- X printf("\n\nList of files with duplicate contents %s\n",
- X "(includes hard links)"
- X );
- X }
- X
- X /* put out a header if you haven't */
- X if (!inmatch) printf("\nFILE: %s\n", getfn(ix-1));
- X inmatch = 1;
- X printf("DUP: %s\n", getfn(ix));
- X }
- X else inmatch = 0;
- X }
- X}
- X
- X/* get_crc - get a CRC32 for a file */
- X
- Xunsigned long
- Xget_crc(ix)
- Xint ix;
- X{
- X FILE *fp;
- X register unsigned long val1 = 0x90909090, val2 = 0xeaeaeaea;
- X register int carry;
- X char ch;
- X char fname[MAXFN];
- X
- X /* open the file */
- X fseek(namefd, filelist[ix].nameloc, 0);
- X fgets(fname, MAXFN, namefd);
- X fname[strlen(fname)-1] = EOS;
- X debug(("\nCRC start - %s ", fname));
- X if ((fp = fopen(fname, "r")) == NULL) {
- X fprintf(stderr, "Can't read file %s\n", fname);
- X exit(1);
- X }
- X
- X /* build the CRC values */
- X while ((ch = fgetc(fp)) != EOF) {
- X carry = (val1 & 0x8000000) != 0;
- X val1 = (val1 << 1) ^ ch + carry;
- X val2 += ch << (ch & 003);
- X }
- X fclose(fp);
- X debug(("v1: %08lx v2: %08lx ", val1, val2));
- X
- X return ((val1 & 0xffff) << 12) ^ (val2 && 0xffffff);
- X}
- X
- X/* getfn - get filename from index */
- X
- Xchar *
- Xgetfn(ix)
- Xoff_t ix;
- X{
- X static char fnbuf[MAXFN];
- X
- X fseek(namefd, filelist[ix].nameloc, 0);
- X fgets(fnbuf, MAXFN, namefd);
- X fnbuf[strlen(fnbuf)-1] = EOS;
- X
- X return fnbuf;
- X}
- X
- X/* fullcmp - compare two files, bit for bit */
- X
- Xint
- Xfullcmp(v1, v2)
- Xint v1, v2;
- X{
- X FILE *fp1, *fp2;
- X char filename[MAXFN];
- X register int ch;
- X
- X /* open the files */
- X strcpy(filename, getfn(v1));
- X fp1 = fopen(filename, "r");
- X if (fp1 == NULL) {
- X fprintf(stderr, "%s: ", filename);
- X perror("can't access for read");
- X exit(1);
- X }
- X debug(("\nFull compare %s\n and", filename));
- X
- X strcpy(filename, getfn(v2));
- X fp2 = fopen(filename, "r");
- X if (fp2 == NULL) {
- X fprintf(stderr, "%s: ", filename);
- X perror("can't access for read");
- X exit(1);
- X }
- X debug(("%s", filename));
- X
- X /* now do the compare */
- X while ((ch = getc(fp1)) != EOF) {
- X if (ch - getc(fp2)) break;
- X }
- X
- X /* close files and return value */
- X fclose(fp1);
- X fclose(fp2);
- X debug(("\n return %d", !(ch == EOF)));
- X return (!(ch == EOF));
- X}
- SHAR_EOF
- chmod 0644 finddup.c || echo "restore of finddup.c fails"
- echo "x - extracting finddup.1C (Text)"
- sed 's/^X//' << 'SHAR_EOF' > finddup.1C &&
- X.TH FINDDUP 1 LOCAL
- X.SH NAME
- Xfinddup - find duplicate files in a list
- X.SH SYNOPSIS
- Xfinddup filename
- X.SH DESCRIPTION
- X.ds fd \fBfinddup\fP
- X\*(fd reads a list of filenames from the named file and scans them,
- Xbuilding a list of duplicate files and hard links. These are then
- Xwritten to stdout for the user's information. This can be used to reduce
- Xdisk usage, etc.
- X.SS How it works
- X\*(fd stats each name and saves the file length, device, and inode. It
- Xthen sorts the list and builds a CRC for each file which has the same
- Xlength as another file. For files which have the same length and CRC, a
- Xbyte by byte comparison is done to be sure that they are duplicates.
- X.sp
- XThe CRC step for N files of size S bytes requires reading n*S total
- Xbytes, while the byte by byte check must be done for every file against
- Xevery other, and read S*N*(N-1) bytes. Thus the CRC is a large timesaver
- Xin most cases.
- X.SH EXAMPLES
- X $ find /u -type f -print > file.list.tmp
- X $ finddup file.list.tmp
- X.SH FILES
- XOnly the file with the filenames.
- X.SH SEE ALSO
- Xfind(1).
- X.SH DIAGNOSTICS
- XIf files are named in the specification file but not present they will
- Xbe ignored. If an existing file can not be read the program will
- Xterminate rather than generate an incomplete list of duplicates.
- X.SH LIMITATIONS
- XAn option to generate a partial list could be added when a file can't be
- Xaccessed. An option to list only duplites which are not hard links could
- Xbe added.
- X.SH AUTHOR
- XBill Davidsen, davidsen@crdos1.crd.ge.com
- X.SH Copyright
- XCopyright (c) 1991 by Bill Davidsen, all rights reserved. This
- Xprogram may be freely used and distributed in its original
- Xform. Modified versions must be distributed with the original
- Xunmodified source code, and redistribution of the original code
- Xor any derivative program may not be restricted.
- SHAR_EOF
- chmod 0666 finddup.1C || echo "restore of finddup.1C fails"
- echo "x - extracting makefile (Text)"
- sed 's/^X//' << 'SHAR_EOF' > makefile &&
- X# makefile for finddup
- X
- X# ================> custom here
- XCC = /bin/cc
- XCFLAGS = -O
- XLFLAGS =
- XLIBS =
- X# your favorite shar prog
- XSHAR = shar
- XSHARLST = finddup.c finddup.1C makefile
- X# ================> custom ends
- X
- Xfinddup: finddup.o
- X $(CC) -o finddup $(LFLAGS) finddup.o -s
- X
- Xdebug: rfinddup
- X
- X# note that this does NOT leave an object file
- Xrfinddup: finddup.c
- X $(CC) -o rfinddup -g -DDEBUG finddup.c
- X
- Xshar: finddup.shar
- X
- Xfinddup.shar: $(SHARLST)
- X $(SHAR) $(SHARLST) > finddup.shar
- X
- Xclean:
- X [ -f p.finddup.c -o ! -f s.finddup.c ] && exit 1
- X rm -f finddup.[co] finddup rfinddup finddup.shar
- SHAR_EOF
- chmod 0644 makefile || echo "restore of makefile fails"
- exit 0
-
- exit 0 # Just in case...
- --
- Kent Landfield INTERNET: kent@sparky.IMD.Sterling.COM
- Sterling Software, IMD UUCP: uunet!sparky!kent
- Phone: (402) 291-8300 FAX: (402) 291-4362
- Please send comp.sources.misc-related mail to kent@uunet.uu.net.
-