home *** CD-ROM | disk | FTP | other *** search
Wrap
#!/usr/bin/perl # -*- Mode: Perl -*- # dirsplit --- # Author : Eduard Bloch ( blade@debian.org ) # Last Modified On : Sun, 06 Feb 2005 14:59:51 +0100 # Status : Working, but use with caution! # License: GPLv2 my $version="0.3.1"; require v5.8.1; use strict; use List::Util 'shuffle'; use Getopt::Long qw(:config no_ignore_case bundling); use File::Basename; use Cwd 'abs_path'; my $ret=0; my $max="4488M"; my $prefix="vol_"; my $acc=500; my $emode=1; my $bsize=2048; my $ofac =50; my $opt_help; my $opt_longhelp; my $opt_sim; my $opt_dir; my $opt_flat; my $opt_cor; my $opt_move; my $opt_ver; my $opt_sln; my $opt_ln; my $opt_filter; my $get_ver; my $msg=" dirsplit [options] [advanced options] (directory|content-list-file) -H|--longhelp Show the long help message with more advanced options -n|--no-act Only print the commands, no action (implies -v) -s|--size NUMBER - Size of the medium (default: $max) -e|--expmode NUMBER - directory exploration mode (recommended, see long help) -m|--move Move files to target dirs (default: create mkisofs catalogs) -f|--flatdir Flat dir mode, don't recreate directory structure -p|--prefix STRING - first part of catalog/directory name (default: vol_) -c|--correct Remove directory space summaries, eg. from du output -h|--help Show this option summary -v|--verbose More verbosity The complete help can be displayed with the --longhelp (-H) option. The default mode is creating file catalogs useable with: mkisofs -D -r --joliet-long -graft-points -path-list CATALOG Example: dirsplit -m -s 700M -e4 random_data_to_backup/ "; my $msglong=" dirsplit [options] [advanced options] < directory | content-list-file > -n|--no-act Only print the commands, no action (implies -v) -s|--size NUMBER - Size of the medium (default: $max) -m|--move Move files to target dirs (default: create mkisofs catalogs) -l|--symlink similar to -m but just creates symlinks in the target dirs -L|--hardlink like -l but creates hardlinks -p|--prefix STRING - first part of catalog/directory name (default: vol_) -f|--filter EXPR - Filter expression, see examples below and perlre manpage --flat Flat dir mode, don't recreate subdirectory structure (not recommended) -e|--expmode NUMBER, special exploration modes, used with directory argument 0: dumb file search with \"du -a\", file sizes are rounded up by du, every file treated as object. Slightly till very inaccurate, dependending on filesystem types and mkisofs options. 1: (default) native exploration of the specified directory, but file sizes are rounded up to 2048 blocks plus estimated overhead for filenames (see -o option) 2: like 1, but when single files _and_ directory found in the same directory somewhere, the files are treated as one object 3: like 2, but don't coalesc when the size of the virtual object becomes too large for a medium size (currently $max) 4: like 2, but the max. size of the virtual object built on files is limited to $max (create another after max. size) -b|--blksize NUMBER, block size of the target filesystem (currently $bsize). Works in exploration mode. -o|--overhead NUMBER, overhead caused by directory entries (as factor for the filename length, default: 50, empiricaly found for Joliet+RR with not-so-deep directory structure). Works in exploration mode. -a|--accuracy NUMBER (1=faster, large number=better efficiency, default: 500) -c|--correct Fix input data when it comes from du (KiB expansion, dupes/summaries removal) -h|--help Show this option summary -v|--verbose More verbosity The content list may be read from a file or from standard input (use -) and contain lines with file sizes and file/directory names. File sizes are expected to be in KiB, append modifier letters to recalculate them, b for bytes, m for megabytes (10^6) or mebibytes (2^10). The default mode is creating file catalogs useable with mkisofs -D -r --joliet-long -graft-points -path-list CATALOG Examples: dirsplit -m -s 120M -e4 largedirwithdata/ -p /zipmedia/backup_ #move stuff into splitted backup dirs dirsplit -s 700M -e2 music/ # make mkisofs catalogs to burn all music to 700M CDRs, keep single files in each dir together dirsplit -s 700M -e2 -f '/other\\/Soundtracks/' music/ # like above, only take files from other/Soundtracks dirsplit -s 700M -e2 -f '!/Thumbs.db|Desktop.ini|\\.m3u\$/i' # like above, ignore some junk files and playlists, both letter cases (old method:) du -s mp3/Collections/Rock/* mp3/Singles/Pop/* | dirsplit -s 701M - You should compare the required size of the created catalogs, as in_ for x in *list ; do mkisofs -quiet -D -r --joliet-long -graft-points -path-list \$x -print-size; done with the media data (cdrecord -v -toc ...). dirsplit calculates very sharp but without knowing the result in advance, so unexpected deep directory structures may create additional overhead. Make sure you have some reserve capacity when specifying the max media size. "; my %options = ( "h|help" => \$opt_help, "d|dirhier" => \$opt_dir, "flat" => \$opt_flat, "f|filter=s" => \$opt_filter, "e|expmode=i" => \$emode, "o|overhead=i" => \$ofac, "b|blksize=i" => \$bsize, "n|no-act" => \$opt_sim, "m|move" => \$opt_move, "l|symlink" => \$opt_sln, "L|hardlink" => \$opt_ln, "v|verbose" => \$opt_ver, "s|size=s" => \$max, "p|prefix=s" => \$prefix, "c|correct" => \$opt_cor, "a|accuracy=i" => \$acc, "H|longhelp" => \$opt_longhelp, "version" => \$get_ver ); die $msg unless ( GetOptions(%options)); if($opt_help) { print $msg; exit 0; } if($opt_longhelp) { print $msglong; exit 0; } if($get_ver) { print $version; exit 0; } # ignore the old dirhier setting since it is default now and disable the flag when opt_flat is specified $opt_dir = !$opt_flat; $opt_ver = 1 if $opt_sim; $opt_move=1 if ($opt_sln || $opt_ln); sub fixnr { # args: # Number # optional: default multiplier my $fac; my $nr; if($_[0]=~/(\d+)(\D)/) { $nr=$1; $fac=$2; } elsif(defined($_[1])) { $nr=$_[0]; $fac=$_[1]; } else { return $_[0]; } return $nr*1000000 if($fac eq "m"); return $nr*1048576 if($fac eq "M"); return $nr*1000 if($fac eq "k"); return $nr*1024 if($fac eq "K"); return $nr if($fac eq "b"); die "$fac is not a valid multiplier!"; } sub mkdirhier { return 1 if($_[0] eq "."); return 1 if(-d $_[0] && -w $_[0]); return 0 if !mkdirhier(dirname($_[0])); return mkdir $_[0]; } my $l; my @in; my %names; my %coalesced; # this will contain arrays with coalesced files # name to size my %ntos; my @indata; my $inputdir; $max=fixnr($max); # about 400kB for iso headers $max-=420000; # parse du -s output if(-f $ARGV[0] || (-f readlink($ARGV[0])) || $ARGV[0] eq "-") { die "Exploration mode argument is useless with pregenerated data, aborting...\n" if($emode); open($l, "<".$ARGV[0]); @indata=<$l>; &parseduinput; } elsif(-d $ARGV[0] || (-d readlink($ARGV[0]))) { if($emode) { #die "not implemented yet"; $opt_cor=0; $inputdir=Cwd::abs_path($ARGV[0]); &explore($inputdir); } else { $opt_cor=1; @indata=`du -l -a $ARGV[0]/`; &parseduinput; } } else { die "Directory or contents listing needed!\n"; } # recursive function # parameter: directory # mode 1: descend as far as possible and index all non-directories # mode 2++: # put all files of a dir into coaleseced-object, then descend into each dir sub explore { (my $dir) = @_; my @stuff; my @dirs; my @files; # print "D: $dir\n"; opendir(DIR, $dir) || die "Could not open $dir\n"; while (my $f = readdir(DIR)) { next if ($f eq "." || $f eq ".."); #print "\$f=$opt_filter;\n"; $f="$dir/$f" if($dir ne "."); if(-d $f && !-l $f) { push(@dirs, $f); } else { if ($opt_filter) { # print "D:exp: \$f=$opt_filter;, Wert: ".eval("\$f=~$opt_filter;")."\n"; if(eval("\$f=~$opt_filter;")) { # print "D: added $f\n"; push(@files, $f); } } else { # print "D: added $f\n"; push(@files, $f); } } } closedir(DIR); #print "D: $dir fertig",@dirs; if($#dirs < 0 && $#files < 0) { # this one is empty, register for cosmetics reason return if ($opt_filter && !eval("\$dir=~$opt_filter;")); $ntos{$dir}=getsize($dir); return; } explore($_) for(@dirs); if($emode==1) { $ntos{$_}=getsize($_) for(@files); } else { my $filesum=0; for(@files) { my $tmp=getsize($_); if($tmp>$max) { # already too large, stop right here die "Too large file ($_) for the given max size $max, aborting...\n"; } $filesum += $tmp; }; if($filesum>$max) { # too large coal. object... if($emode==3) { # don't coalesc in this mode, leave them alone $ntos{$_}=getsize($_) for(@files); return; } if($emode==4) { # print "D: mode 4!\n"; # a bit complicated, split file set into coal.objects my $partsum=0; my @sorted=sort(@files); my @tmpvol; for(my $i=0;$i<=$#sorted;$i++) { # print "D: i: $i, partsum: $partsum, file: $sorted[$i]\n"; my $tmp=getsize($sorted[$i]); $partsum+=$tmp; if($partsum>$max) { # undo the last step then build the coal.object $partsum-=$tmp; $i--; # print "D: coal: ".join(",", @tmpvol)."\n"; my $iname = ("### Coalesced file object, placeholder for the directory $dir up to file $sorted[$i] ###"." "x256); @{$coalesced{$iname}} = @tmpvol; $ntos{$iname}=$partsum; # tmps reseten undef @tmpvol; undef $partsum; } else { push(@tmpvol, $sorted[$i]); } } return; } } # be an invalid filename if($filesum) { my $iname = ("### Coalesced file object, placeholder for stuff in the directory $dir ###"." "x256); @{$coalesced{$iname}} = @files; $ntos{$iname}=$filesum; } } } sub getsize { (my $file) = @_; my $size = ((stat($file))[7]); my $rest = ($size % $bsize); $size = ($size + $bsize - $rest) if ($rest); return 1+int(200 + $ofac*length(basename($file)) + $size); } sub parseduinput { for(@indata) { chomp; if(/^(\w+)\s+(.+)/ && $2 ne "./") { #print "D: ntos $2 ist ".fixnr($1, "K")."\n"; $ntos{$2}=fixnr($1, "K"); } } } # sort and kill dupes/summaries if($opt_cor) { my @intmp=sort(keys %ntos); # die join("\n", @intmp, ""); my @newin; for(my $i=0;$i<=$#intmp;$i++) { $_=$intmp[$i]; chomp; # weed out directory summaries # print "vgl. ". "$_/ ne substr(".$intmp[$i+1].",0,length($_)+1)) \n"; /.*(.)$/; if ($1 ne "/" && "$_/" ne substr($intmp[$i+1],0,length($_)+1)) { # feed the final data holders push(@in, $ntos{$_}); # linked list behind the hash entry my $realname=$_; $realname=~s!^\./!!; push(@{$names{$ntos{$_}}}, $realname); } } } else { # copy around for(keys %ntos) { # print "wtf, $_, $ntos{$_}\n"; push(@in, $ntos{$_}); # linked list behind the hash entry push(@{$names{$ntos{$_}}}, $_); } } for(@in) { die "Too large object(s) ($_) for the given max size: ".join(", ", @{$names{$_}})."\n" if($_>$max); } $a=0; for(@in) {$a+=$_}; $acc=1 if ($a <= $max); # just generate a list, more trials are pointless print "\nSumm: $a\n" if($opt_ver); die "Nothing to do!\n" if($a<4096); # looks like just an empty dir my $i; my @out; # Parms: bin size (int), input array (arr reference), output array (arr reference) # Returns: wasted space (int) sub bp_bestfit { my $max=$_[0]; my @in = @{$_[1]}; my $target = $_[2]; my @out; my @bel; my @tmp; push(@tmp,$in[0]); push(@out, \@tmp); $bel[0] = $in[0]; shift @in; for(@in) { my $bestplace=$#out+1; my $bestwert=$max; for($i=0;$i<=$#out;$i++) { my $rest; $rest=$max-$bel[$i]-$_; if($rest>0 && $rest < $bestwert) { $bestplace=$i; $bestwert=$rest; }; } if($bestplace>$#out) { my @bin; $bel[$bestplace]=$_; push(@bin, $_); push(@out,\@bin); } else{ $bel[$bestplace]+=$_; push( @{$out[$bestplace]} , $_); } } my $ret=0; # count all rests but the last one for($i=0;$i<$#out;$i++) { $ret+=($max-$bel[$i]); } @{$target} = @out; return $ret; } # Parms: bin size (int), input array (arr reference), output array (arr reference) # Returns: wasted space (int) sub bp_firstfit { my $max=$_[0]; my @in = @{$_[1]}; my $target = $_[2]; my @out; my @bel; piece: foreach my $obj (@in) { # first fit, use the first bin with enough free space for($i=0;$i<=$#out;$i++) { my $newsize=($bel[$i]+$obj); # print $bel[$i]."\n"; if( $newsize <= $max ) { # print "F: bin$i: $bel[$i]+$obj=$newsize\n"; #fits here $bel[$i]=$newsize; push( @{$out[$i]} , $obj); next piece; # break } } # neues Bin my @bin; $bel[$i]=$obj; # print "N: bin$i: $bel[$i]=$obj\n"; push(@bin, $obj); push(@out,\@bin); } my $ret=0; # sum up all rests except of the one from the last bin for($i=0;$i<$#out;$i++) { # print "hm, bel $i ist :".$bel[$i]." und res:".($max-$bel[$i])."\n"; $ret+=($max-$bel[$i]); } @{$target} = @out; # print "wtf, ".join(",", @{$out[0]})."\n"; return $ret; } my @erg; my $globwaste=$max*($#in+1); my $starttime=time; for(1..$acc) { syswrite(STDOUT,"."); my @tmp; #my $waste = bp_bestfit($max, \@in, \@tmp); my $waste = bp_firstfit($max, \@in, \@tmp); #print "D: waste - $waste\n"; @in=shuffle(@in); if($waste < $globwaste) { $globwaste=$waste; @erg=@tmp; } if($starttime && time > $starttime+10) { syswrite(STDOUT,"\nSpent already over 10s (for $_ iterations)\nHint: reduce accuracy to make it faster!\n"); undef $starttime; } } print "\nCalculated, using ".($#erg+1)." volumes.\n"; print "Wasted: $globwaste Byte (estimated, check mkisofs -print-size ...)\n"; # and the real work $i=1; my $inDirLen=length($inputdir); for(@erg) { my $o; open($o, ">$prefix$i.list") if(! ($opt_move | $opt_sim)); for(@{$_}) { my @stuff; my $object=shift(@{$names{$_}}); if(-e $object) { @stuff=($object); } elsif(defined $coalesced{$object}) { @stuff=@{$coalesced{$object}}; } else { print "Warning, $object not found, skipping...\n"; $ret=1; } my $dirPrefix=dirname($prefix); my $prefixBase=basename($prefix); my $dirPrefixAbs=Cwd::abs_path($dirPrefix); for my $file (@stuff) { my $relFile=substr($file,$inDirLen+1); my $base=basename($relFile); if($opt_move) { my $targetsubdir = $dirPrefixAbs."/$prefixBase$i"; $targetsubdir .= "/".dirname($relFile) if($opt_dir); print "$file -> $targetsubdir/$base\n" if($opt_ver); if(!$opt_sim) { mkdirhier $targetsubdir || die "Problems creating $targetsubdir\n"; # last check die "Could not create $targetsubdir?\n" if(!(-d $targetsubdir && -w $targetsubdir)); if($opt_sln) { symlink($file, "$targetsubdir/$base"); } elsif($opt_ln) { if(-d $file && !-l $file) { mkdir "$targetsubdir/$base"; } else { link($file, "$targetsubdir/$base"); } } else { rename($file, "$targetsubdir/$base"); } } } else { # escape = in mkisofs catalogs, they are used as separator my $isoname = ($opt_dir?$relFile : $base); $isoname=~s/=/\\=/g; my $sourcefile=$file; $sourcefile=~s/=/\\=/g; print "$i: /$isoname=$sourcefile\n" if $opt_ver; print $o "/$isoname=$sourcefile\n" if(!$opt_sim); } } } $i++; close($o) if($o); } exit $ret;