#!/usr/bin/perl -w # # runWordRanks: runCategories + wordRanks. # # Keep calling the main/master machine to get categories, get hook words # for them with internal code, save, and go round again. # # sjd, 2009-09. # 2009-10-05 sjd: Integrate and move union word list to a hash instead of # spinning the 50MB file once per category.... Refactor and clean up. # use strict; use Getopt::Long; use Unicode::Normalize; my $version = "2010-09-12"; # Default values for various options (dcl so we can print in -help) # my $dft_master = "domU-12-31-39-00-50-74.compute-1.internal"; my $dft_pagedir = "/mnt/wikiData/wikiSplit"; my $dft_unionWordFreqsFile = "_union_ranklist.txt"; my $dft_catListFile = "pageCats.list"; my $dft_threshold = 0.5; my $dft_wikiFrequencyLimit = 50000; # (See showUsage() at end of this file for explanations of these options) # my @theCatsArray = (); # Requested category(s) my $catListFile = $dft_catListFile;# List of pages for each category my $justcount = 0; # Just count pages per category my $filter = 0; # Discard bogus 'words'? my $force = 1; # Survive missing page files my $ignoreCase = 0; my U = "U"; my $master = $dft_master; # Domain name of master server my $nPrefix = 2; # Split pagedir into smaller subdirs my $olineends = "U"; my $pagedir = $dft_pagedir; # Where to find per-page word-freq files my $pageext = "vocab"; # Extension for vocab output files my $quiet = 0; my $rank = 1; # Sort output by rank, not alpha my $standalone = 0; # Don't use category server my $tickInterval = 100000; # Progress reports how often my $threshold = $dft_threshold; # Don't issue hook words < this my $unicode = 1; # I/o in utf8 my $unionWordFreqsFile = $dft_unionWordFreqsFile; my $verbose = 0; my $wikiFrequencyLimit = $dft_wikiFrequencyLimit; ############################################################################### # Getopt::Long::Configure ("ignore_case"); my $result = GetOptions( "cat|category=s" => \@theCatsArray, "catListFile=s" => \$catListFile, "filter!" => \$filter, "force!" => \$force, "h|help" => sub { system "perldoc runWordRanks"; exit; }, "ignoreCase!" => \$ignoreCase, "justcount!" => \$justcount, "lineends=s" => \U, "master=s" => \$master, "n|nPrefix=n" => \$nPrefix, "olineends=s" => \$olineends, "pagedir=s" => \$pagedir, "quiet!" => \$quiet, "rank!" => \$rank, "standalone!" => \$standalone, "threshold=f" => \$threshold, "tick=n" => \$tickInterval, "unicode!" => sub { $iencoding = "utf8"; }, "uwf|unionWordFreqsFile=s"=> \$unionWordFreqsFile, "v+" => \$verbose, "version" => sub { die "Version of $version, by Steven J. DeRose.\n"; }, "wikiFrequencyLimit=n" => \$wikiFrequencyLimit, ); ############################################################################### # If we're running without a master server (that is, if we're getting one or # a few specific category names from options, instead of doing a whole series), # then move the options into a hash, or scream if there aren't any. # my %theCats = (); if ($standalone) { for my $c (@theCatsArray) { $theCats{$c}++; die "-cat no supported when using category server.\n" } if ($verbose) { for my $c (sort keys %theCats) { warn "Category: '$c'\n"; } } (scalar keys %theCats > 0) || die "No -category(s) specified.\n"; (-f $catListFile) || die "Can't find page-category list '$catListFile'\n"; open(PAGES, "<$catListFile") || die "Couldn't open $catListFile\n"; binmode PAGES, ":utf8"; ($verbose) && warn "PAGES file '$catListFile' opened successfully.\n"; } # standalone my $requestURI = "http://$master/cgi-bin/nextCategory.pl"; if ($pagedir) { (-d $pagedir) || die "Can't find per-page word-freq directory '$pagedir'\n"; } if ($nPrefix > 0) { # Make sure we have right pagedir subdirs my $aSubdir = "$pagedir/" . ("a" x $nPrefix); (-d $aSubdir) || die "Can't find per-page word-freq subdirectory '$aSubdir'\n"; } my %catVocab = (); # Table of summed word-frequencies my %catVocabPages = (); # Table of num. of cat pages each word was in my %relevance = (); # Table of word with tf-idf values ############################################################################### # Load the entire Wikipedia union word-frequency table (just once, please!) # my %unionWordFreqsHash = (); loadUnionWordFreqsHash(); ############################################################################### # MAIN: ask the master server for categories, and run stats for each. # my $runCount = 0; # How many categories have we done? my $pnum = 0; # Record counter for current category my %pagesDone = (); # Avoid duplicates (if multiple cats) my $pageOpenFailed = 0; # Num. pages we couldn't find my $pagesInCat = 0; # Pages found to be in current category. while(1) { my $catName = getCategoryFromServer(); (defined $catName) || last; $theCats{$catName}++; my $outfile = makeFilename2($catName); ($verbose) && warn "Running category \#$runCount: '$catName'\n"; # Do the big calculation # runOneCategory($catName, "$outfile.hooks"); # Save the result back to the master # my $scpCmd = "scp '$outfile.hooks'" . " 'root\@$master:/hookwords/'"; ($verbose) && warn "Running: $scpCmd\n"; system "$scpCmd" || warn "Couldn't scp '$outfile.hooks'\n"; system "rm '$outfile.hooks'"; $runCount++; warn "\n"; } # Forever close PAGES; warn "Done, handled $runCount categories.\n"; exit; ############################################################################### # Scan the PAGES file (consisting of category:filename), and read all the files # that it indicates are in the desired category (there's a later version of # this file available, with each line having a category name followed by all # pages in that category, tab-separated. Extracted by ___). # # Basically same code as 'wordRanks', except doesn't spin the union word-freqs # file for every category -- just use pre-loaded hash. # sub runOneCategory { my $catName = $_[0]; my $outfile = $_[1]; my $catStartTime = time(); %catVocab = (); # Table of summed word-frequencies %catVocabPages = (); # Table of num. of cat pages each word was in keys(%catVocab) = 1<<12; keys(%catVocabPages) = 1<<12; addUpAllPagesInCategory(); # Results stored in %catVocab if ($verbose) { warn "Starting the tf-idf calculation.\n"; } my $pass = my $fail = 0; for my $v (keys %catVocab) { if ($filter && !filter($v)) { next; } my $divisor = 1.0 + log($unionWordFreqsHash{$v}); my $x = $catVocab{$v} / $divisor; if ($x >= $threshold) { $relevance{$v} = $x; $pass++; } else { $fail++; } } # for category vocab report($outfile,$catStartTime,$pass,$fail); } # runOneCategory ############################################################################### # Scan the file that lists every category\tpage pairs, to find all pages that # are in the desired category(s). For each such page, read it and add the count # for each word to the total word-frequency table for the category. We also # count how many *pages* each word is found in, since we might also want that # for various statistical calculations. # sub addUpAllPagesInCategory { seek PAGES, 0, 0; # rewind while (my $l = ) { $pnum++; ($pnum % $tickInterval == 0) && warn sprintf("Processed %10d pagerefs, found %6d pages, %6d words.\n", $pnum, $pagesInCat, (scalar keys %catVocab)); chomp $l; $l =~ m/^(.*)\t(.*)\s*$/; if (!defined $2) { next; } my $curPage = $1; my $curCat = $2; my $curCatFilename = makeFilename2($curCat); if (!defined $theCats{$curCat}) { next; } if (defined $pagesDone{$curPage}) { next; } $pagesDone{$curPage} = 1; $pagesInCat++; ($verbose) && warn "Page matches: '$curPage'\n"; ($justcount) && next; # Turn page title into a filename, the same as categoryVocabulary did. # my $curPageFilename = makeFilename($curPage); my $prefix = lc(substr($curPage,0,$nPrefix)); my $pagepath = "$pagedir/$prefix/$curPage.$pageext"; if (!open(PAGE,"<$pagepath")) { ($quiet) || warn "Couldn't open page $pagesInCat: '$pagepath'\n"; ($force) || die "Terminating.\n"; $pageOpenFailed++; next; } binmode PAGE, ":utf8"; ($verbose) && warn "Opened page vocab file $pagesInCat: '$pagepath'\n"; my $pnum = 0; my $typeCount = 0; while (my $pline = ) { $pnum++; chomp $pline; ($pline =~ m/^\s*1) && warn "Line $pnum parses to '$n' '$w'\n"; if ($ignoreCase) { $w = lc($w); } if ($n !~ m/^\d+$/) { warn "Bad num: '$pline'\n"; } $catVocab{$w} += $n; $catVocabPages{$w}++; $typeCount++; } else { ($verbose) && warn " Line $pnum parse fails: '$pline'\n"; } } # PAGE EOF close PAGE; } # PAGES EOF ($pagesInCat>0) || die "No pages found in category(s) " . join(", ", keys %theCats) . ".\n"; (scalar keys %catVocab > 0) || die "No words were loaded. All files missing?\n"; warn sprintf( "%d of %d page-category pairs (%8.5f%%) match category(s); %d not found.\n", $pagesInCat, $pnum, $pagesInCat*100.0/$pnum, $pageOpenFailed); ($justcount) && exit; warn " for " . (scalar keys %catVocab) . " word-types.\n"; } # addUpAllPagesInCategory ############################################################################### # Report the main result: tf-idf table. # sub report { my $tgtfile = $_[0]; # my $catStartTime = $_[1]; # my $elapsed = time() - $catStartTime; print $tgtfile "# tf-idf calculation for category(s) " . join(", ",%theCats) . "\n"; print $tgtfile "# $pagesInCat pages attempted, $pageOpenFailed failed to open.\n"; print $tgtfile "# words from category pages: " . (scalar keys %catVocab) . ", from Wiki total: " . (scalar keys %unionWordFreqsHash) . ", above threshold:" . (scalar keys %relevance) . ".\n"; print $tgtfile "# tf-idf cat occ wiki occ word\n"; for my $v (sort byrank keys %relevance) { if ($unionWordFreqsHash{$v} > $wikiFrequencyLimit) { next; } print $tgtfile sprintf(" %8.5f: %8d: %8d: %s\n", $relevance{$v}, $catVocab{$v}, $unionWordFreqsHash{$v}, $v); } } ############################################################################### # Allow for sorting the final tf-idf report alphabetically or by rank. # sub byrank { # to sort %vocab if ($rank) { return ($relevance{$b} <=> $relevance{$a}); } if ($a lt $b) { return(-1); } if ($a gt $b) { return(1); } return(0); } ############################################################################### # Discard words that we obviously don't want.... # sub filter { if ($_[0] =~ m/[^\w]/) { return(0); } if (length($_[0])<4) { return(0); } return(1); } ############################################################################### # Load the union word-freqs table into a mongo hash for speed. Expect # about 3.8M words (but we don't store the ~54% of them that only occur once, # because we can just assume '1' if we don't find an entry). # sub loadUnionWordFreqsHash { warn "Loading union word-frequency table...\n"; open(UWF, "<$unionWordFreqsFile") || die "No union word-freqs list found at '$unionWordFreqsFile'.\n"; binmode UWF, ":utf8"; keys(%unionWordFreqsHash) = 1<<20; my $lnum = 0; while (my $l = ) { $lnum++; ($lnum % $tickInterval == 0) && warn sprintf( "Scanned %8d union word-freqs lines (sorted?), loaded %8d.\n", $lnum, scalar keys %unionWordFreqsHash); chomp $l; $l =~ m/\s*(\d+):\s*(.*)$/; if (defined $1 && $1 <= 1) { next; } # Don't store hapax legomena. $unionWordFreqsHash{$2} = $1; } close UWF; warn "Union word frequency hash has been loaded.\n\n"; } # loadUnionWordFreqsHash ############################################################################### # Get the next category name from the master server. # sub getCategoryFromServer { warn "Requesting category from '$requestURI'\n"; my $catName = `curl --silent $requestURI`; chomp $catName; if ($catName eq "#EOF") { warn "Hit EOF.\n"; return(); } if ($catName eq "") { warn "Empty category name from master.\n"; return(""); } return($catName); } ############################################################################### ############################################################################### # makeFilename() cleans up Wikipedia category names to use as filenames. # # This is because (at least in the first cut), script 'categoryVocabulary' # writes the word-frequency table for each page to a separate file. So we get # rid of nearly all punctuation, case distinctions, etc.; but we need to keep # all that info so that pages with similar titles don't collide. Newer plan is # to put all the word-freq tables into *one* file (save zillions of fopen()s), # and make another file with an index from (raw) page-titles to the offsets of # their portions of the big file. # ############################################################################### # Safe way to make filename -- preserves all info, but in case-insens ASCII. # (xxx) for anything but [a-zA-Z0-9] Put "_" before A-Z. # Note: Can't do a whole path, will toast [/.\\:] # (copied from categoryVocabulary -- keep in sync!!!) # NOTE: Not compatible with earlier makeFilename()! # sub makeFilename2 { my $rc = ""; $_[0] =~ s/\|.*$//; # Remove 'sortby' field if present for (my $i=0; $i= ord("a") && $o <= ord("z") || $o >= ord("0") && $o <= ord("9")) { $rc .= $c; } elsif ($o >= ord("A") && $o <= ord("Z")) { $rc .= "_" . lc($c); } if ($o >= ord("a") && $o <= ord("z")) { $rc .= sprintf("(%x)", $o); } } return($rc); } # makeFilename2 # Version to match filenames originally generated by categoryVocabulary. # Bad idea because fixing name-conflicts can't be done compatibly at generate # and search time (generator finds conflict and renames, but how would we # know which number suffix to use when searching for any of the titles later? # # Only here for backwards-(broken)-compatibility. Need to re-run the whole # Wikipedia scan to fix this for real (and should store the unmodified page # title within each page file to prevent later similar problems. # sub makeFilename { my $nameConflicts = 0; my $f = $_[0]; my $ext = $_[1]; $f =~ s/[^\w\d\.]/_/g; $f =~ s/__+/_/g; $f =~ s/^\./_/; $f = lc($f); my $prefix = ""; if ($nPrefix > 0) { $prefix = substr($f."_______",0,$nPrefix) . "/"; if (!-e $prefix) { system "mkdir -p '$prefix'" || warn "Couldn't make dir '$prefix'\n"; } } my $suf = 1; my $newf = $f; while (-f "$newf.$ext") { # Count until no filename conflict $suf++; $newf = sprintf("%s%04d",$f,$suf); } if ($newf ne $f) { $nameConflicts++; } return("$newf.$ext"); } # makeFilename ############################################################################### # sub showUsage { warn " =head1 Usage wordRanks [options] file 0: Load the union Wikipedia word-frequencies list into a (big) hash. (generated by sumVocabForCategory -category '*' or ____) 1: Fetch a category from the master cloud server and: a: Scan a file that lists all the WikiPedia pages for a given category. (generated by categoryVocabulary -collectPagesByCategory). Retrieve and sum the word-frequency lists for each page found, b: Estimate how characteristic each word is for the category. c: Print out the words that come out above a certain threshold (and that aren't discardable for some other obvious reason). =head1 Options (prefix 'no' to negate where applicable) =over =item * B<-filter> Discard various words a prior (by length, charset, etc) =item * B<-force> Ok if some page-vocab files not found (default). =item * B<-ignorecase> Union data for words differing only in case. =item * B<-justcount> Just count how many pages for the category. =item * B<-lineends t> Assume Unix, Dos, or Mac line-breaks for input. =item * B<-master domain> Domain for the master machine to supply category names (default = $dft_master). Note: clouds used different URIs for internal and external access. =item * B<-nPrefix n> How many letters from start of page-title, to use to choose subdirectory of -pagedir (to keep directories from getting slow due to too many files). =item * B<-olineends t> Write Unix, Dos, or Mac line-breaks for output. =item * B<-pagedir path> Specify path to the dir of per-page word-freq files. If -nprefix is n, this must have subdirs for each n-char prefix of page titles, to keep dirs smaller (would be better to hash the name for dispersion). =item * B<-q> Suppress most messages. =item * B<-rank> Sort output by tf-idf rank (default). =item * B<-standalone> Get categories from a file, not a server. =item * B<-tick n> Report progress every n records (0 to turn off). =item * B<-threshold n> Only issue words with relevence > n (dft $dft_threshold) =item * B<-unicode> Assume input is utf-8 (default). =item * B<-v> Add more messages (repeatable). =item * B<-version> Show version/license info and exit. =item * B<-wikiFrequencyLimit n> Discard words that occur > this many times in total Wikipedia (default = $dft_wikiFrequencyLimit). =back =head1 Options unused (used for standalone/non-server wordRanks): =over =item * B<-category c> A category from WikiPedia to look for (repeatable). =item * B<-catListFile path> Where to find the pagetitle\tcategory file when not using server (default = $dft_catListFile). =back =head1 Known bugs and limitations The all-WikiPedia union vocabulary file should list num pages the word shows up in, not just the total number of occurrences, so we can use that in tf-idf calculation. Master script could load up whole page-list and pass back list of pages instead of just category name, to save spinning that whole file. Should rebuild with all word-freq tables in one file and use seek(). =head1 Related commands categoryVocabulary, splitBitDir, sumVocabForCategory, wordRanks, nextCategory, runCategories. =head1 Ownership This work by Steven J. DeRose is licensed under a Creative Commons Attribution-Share Alike 3.0 Unported License. For further information on this license, see http://creativecommons.org/licenses/by-sa/3.0/. The author's present email is sderose at acm.org. For the most recent version, see http://www.derose.net/steve/utilities/. =cut "; }