#!/usr/bin/perl -w # # splitBigDir: move a bunch of files to subdirs of somewhere, sorting by first n # characters of the filenames. # # 2009-09-09: Written by Steven J. DeRose. # 2009-09-17 sjd: Add Normalize. # 2011-01-19 sjd: Tweak. Add -lpad, -prefix, -test. # # To do: # Ability to move only files of certain extensions? # use strict; use Getopt::Long; use Unicode::Normalize; my $version = "2011-01-19"; my $nChars = 3; my $ignoreCase = 1; my $ignoreNonWord = 0; my $lpad = 0; my $out = ""; my $padNames = 0; my $prefix = "dir"; #my $rpad = ""; my $quiet = 0; my $test = 0; my $tickInterval = 1000; my $unorm = 0; my $verbose = 0; ############################################################################### # Getopt::Long::Configure ("ignore_case"); my $result = GetOptions( "h|help" => sub { system "perldoc splitBigDir"; exit; }, "ignoreCase!" => \$ignoreCase, "ignoreNonWord!" => \$ignoreNonWord, "lpad=n" => \$lpad, "n|nChars=n" => \$nChars, "out=s" => \$out, "q!" => \$quiet, "padNames=n" => \$padNames, # "rpad=s" => \$rpad, "test!" => \$test, "tick=n" => \$tickInterval, "unorm!" => \$unorm, "v+" => \$verbose, "version" => sub { die "Version of $version, by Steven J. DeRose.\n"; } ); ($result) || die "Bad options.\n"; $out =~ s/\/$//; ############################################################################### # Set implied options, validate option values... # ($nChars>0) || die "-nChars value must be positive.\n"; if ($lpad!=$padNames && $lpad!=0 && $padNames!=0) { die "-lpad conflicts with -padNames.\n"; } ############################################################################### # my $fileCount = 0; my $renameFail = 0; while (my $originalDir = shift) { ($originalDir && -d $originalDir) || die "Can't find source directory '$originalDir'.\n"; $originalDir =~ s/\/$//; ($verbose) && warn "\nStarting directory '$originalDir'.\n"; if (!$out) { $out = $originalDir; } else { my $cmd = "mkdir -p $out"; if ($test) { warn "Would run: $cmd\n"; } else { system $cmd; } } opendir(ORIG, $originalDir) || die "Couldn't open source dir $originalDir.\n"; # binmode ORIG, ":utf8"; while (my $fname = readdir(ORIG)) { if ($fname eq "." || $fname eq "..") { next; } if (-d "$originalDir/$fname") { ($quiet) || warn "Directory at '$originalDir/$fname'. Not moved.\n"; next; } $fileCount++; handleOneFile($originalDir, $out, $fname); if ($fileCount % $tickInterval == 0) { warn "Processed $fileCount files.\n"; } } close ORIG; } # while args left ($quiet) || warn "Done, $fileCount files " . ($test ? "would have been ":"") . "moved to '$out/' ($renameFail failed).\n"; exit; ############################################################################### # sub handleOneFile { my ($srcdir, $tgtdir, $fname) = @_; $fname =~ s/'/\\'/g; if (!-e "$srcdir/$fname") { warn "Can't find original file at '$srcdir/$fname'.\n"; } my $padded = $fname; my $needed = $lpad - length($fname); if ($needed>0) { $padded = ("0" x $needed) . $fname; #warn "Padding needed ($needed): '$fname' -> '$padded'.\n"; } my $subdir = $prefix . substr($padded.("_" x $nChars),0,$nChars); if ($unorm) { $subdir = NFD($subdir); $subdir =~ s/\pM//g; } if ($ignoreNonWord) { $subdir =~ s/\W//g; } if ($ignoreCase) { $subdir = lc($subdir); } #warn "subdir chosen: '$subdir'\n"; if (! -d "$tgtdir/$subdir") { my $cmd = "mkdir -p '$tgtdir/$subdir'"; if ($test) { warn "Would run: $cmd\n"; } else { system $cmd; } } # Perl readdir, rename, etc. can't deal w/ Unicode well. #system "mv '$odir/$fname' '$odir/$prefix/'"; my $tgtname = ($padNames) ? $padded:$fname; if ($test) { warn "Would call rename($srcdir/$fname, $tgtdir/$subdir/$tgtname).\n"; } elsif (!rename("$srcdir/$fname", "$tgtdir/$subdir/$tgtname")) { warn "Rename failed '$srcdir/$fname' -> '$tgtdir/$subdir/$tgtname'.\n"; $renameFail++; } } # handleOneFile ############################################################################### # =pod =head1 Usage splitBigDir [options] [dirs] Move the files in I<'dirs'> to several other directories, based on the first few characters(s) of the files' names. Use the I<-nChars> option to specify how many leading characters to use (default 3). The consequent subdirectories are created as needed. If you don't specify a target directory with I<-o>, the subdirectories will be within the source directory(s); in that case, the script is smart enough not to mess with subdirectories that are already there. This is useful when a directory is (or would become) just way too big. Most current file systems get slow with more than about 1,000 files in a single directory. =head1 Options =over =item * B<-ignoreCase> Fold to lower case for names of target dirs (default -- required on Mac OS X!). =item * B<-ignoreNonword> Drop initial non-alpha characters for target directory names. =item * B<-lpad> I Left-pad filenames with '0', out to I characters when calculating what subdirectory to put them in. This is most useful for numeric filenames, so they all become the same width, and then the first I<-nChars> characters are taken from the full-width names. The files themselves are not actually renamed by this option; it only affects what subdirectory files are sorted into. To actually extend the filenames to match specify the I<-padnames> I option. B: This length must include any "." and extension! =item * B<-nChars> I Number of initial letters to split by. Names that are shorter than I characters (even after I<-lpad> is applied), will be right-padded with '_' as needed. =item * B<-padnames> I See I<-lpad>. This option does the same thing, but I actually renamed the files themselves, to be left-padded with "0" to the given length. =item * B<-o> I Path under which to find/create the new directories. Default is the source directory itself. =item * B<-prefix> I Put the string I at the start of all the sub-directory names (following by the leading I<-nChars> characters from filenames. Default 'dir'. =item * B<-q> Suppress most messages. =item * B<-test> Just show what I be done, but don't actually move anything or create any directories. =item * B<-unorm> Strip accents from the characters used for directory names (otherwise the rename/move fails). =item * B<-v> Add more messages, and check integrity frequently. =item * B<-version> Show version/license info and exit. =back =head1 Known bugs and limitations Might be better to collapse digits, punctuation, etc. Seems to choke on Unicode chars in filenames. =head1 Related commands padNames =head1 Ownership This work by Steven J. DeRose is licensed under a Creative Commons Attribution-Share Alike 3.0 Unported License. For further information on this license, see http://creativecommons.org/licenses/by-sa/3.0/. The author's present email is sderose at acm.org. For the most recent version, see http://www.derose.net/steve/utilities/. =cut