#!/usr/bin/perl -w
#
# wordRanks: A Wikipedia tf-idf calculator.
#
# 2009-09-09: Written by Steven J. DeRose.
# 2009-09-14 sjd: Finishing. -jsutcount.
# 2009-09-16 sjd: Normalize. count words right.
# 2009-09-22 sjd: Support multiple categories. Deal with "|" in category name.
#
# To do:
#     Add some alternative calculations.
#     Count total tokens for the collected pages.
#     Regexes for categories?
#     Run tick off of pages opened, not just scanned, since that's the
#         rate-determining factor?
#
use strict;
use Getopt::Long;
use Unicode::Normalize;

my $version = "2010-09-12";

# Defaults for the hard-coded input filenames and tuning knobs.
my $dft_catListFile        = "pageCats.list";
my $dft_wRankList          = "_union_ranklist.txt";
my $dft_threshold          = 0.5;
my $dft_wikiFrequencyLimit = 80000;

# Option variables (see POD in showUsage() for meanings).
my @theCatsArray  = ();
my $catListFile   = $dft_catListFile;
my $catVocabWrite = 0;
my $justcount     = 0;
my $filter        = 0;
my $force         = 1;
my $ignoreCase    = 0;
my $iencoding     = "";   # FIX: was assigned in the -unicode callback but never
                          # declared, which fails under 'use strict'.
my $ilineends     = "U";  # FIX: source was garbled to 'my U = "U";' -- restored
                          # as the target of the -lineends option (TODO confirm
                          # original name against a clean copy).
my $nPrefix       = 2;
my $olineends     = "U";
my $pagedir       = "";
my $pageext       = "vocab";
my $quiet         = 0;
my $rank          = 1;
my $tickInterval  = 100000;
my $threshold     = $dft_threshold;
my $unicode       = 1;
my $verbose       = 0;
my $wikiFrequencyLimit = $dft_wikiFrequencyLimit;
my $wRankList     = $dft_wRankList;


###############################################################################
#
# Getopt::Long::Configure ("ignore_case");
my $result = GetOptions(
    "cat|category=s"       => \@theCatsArray,
    "catListFile=s"        => \$catListFile,
    "catVocabWrite!"       => \$catVocabWrite,
    "filter!"              => \$filter,
    "force!"               => \$force,
    "h|help"               => sub { system "perldoc wordRanks"; exit; },
    "i|ignoreCase!"        => \$ignoreCase,
    "justcount!"           => \$justcount,
    "lineends=s"           => \$ilineends,
    "n|nPrefix=n"          => \$nPrefix,
    "olineends=s"          => \$olineends,
    "pagedir=s"            => \$pagedir,
    "quiet!"               => \$quiet,
    "rank!"                => \$rank,
    "threshold=f"          => \$threshold,
    "tick=n"               => \$tickInterval,
    "unicode!"             => sub { $iencoding = "utf8"; },
    "v+"                   => \$verbose,
    "version"              => sub {
        die "Version of $version, by Steven J. DeRose.\n"; },
    "wikiFrequencyLimit=n" => \$wikiFrequencyLimit,
    );

($result) || die "Bad options.\n";

# Copy the category(s) to hash instead of array.
my %theCats = ();
for my $c (@theCatsArray) { $theCats{$c}++; }
for my $c (sort keys %theCats) { warn "Category: '$c'\n"; }
(scalar keys %theCats > 0) || die "No -category(s) specified.\n";


###############################################################################
# Verify and open the two required input files.
#
(-f $catListFile) || die "Can't find page-category list '$catListFile'\n";
(-f $wRankList)   || die "Can't find rank-list '$wRankList'\n";
if ($pagedir) {
    (-d $pagedir) || die "Can't find page directory '$pagedir'\n";
}

open(PAGES, "<$catListFile") || die "Couldn't open $catListFile\n";
binmode PAGES, ":utf8";
($verbose) && warn "PAGES file '$catListFile' opened successfully.\n";

open(TOTAL, "<$wRankList") || die "Couldn't open $wRankList\n";
binmode TOTAL, ":utf8";
($verbose) && warn "TOTAL file '$wRankList' opened successfully.\n";

# Add up all the word-frequencies for pages in the category.
#
my $catVocabSize = 0;
my %catVocab = ();
keys(%catVocab) = 1<<12;        # Pre-allocate for speed
my $pagesInCat = 0;
my %pagesDone = ();
my $pnum = 0;
my $pageOpenFailed = 0;

# Scan the PAGES file (consisting of category:filename), and read all the files
# that it indicates are in the desired category.
#
while (my $l = <PAGES>) {       # FIX: readline handle <PAGES> was eaten by an
                                # HTML-stripping pass in the mangled source.
    $pnum++;
    # FIX: guard the modulus -- POD documents '-tick 0' as "turn off", but
    # '% 0' would die with "Illegal modulus zero".
    ($tickInterval && $pnum % $tickInterval == 0) &&
        warn sprintf("Processed %10d pagerefs, found %6d pages, %6d words.\n",
                     $pnum, $pagesInCat, (scalar keys %catVocab));
    chomp $l;
    $l =~ m/^(.*)\t(.*)\s*$/;
    if (!defined $2) { next; }
    my $curPage = $1;
    my $curCat  = $2;
    $curCat =~ s/\|.*$//;       # Remove 'sortby' field.
    #($verbose) && ($curCat =~ "Greek myth") &&
    #    warn "Cat '$curCat' vs. '$theCat'.\n";
    if (!defined $theCats{$curCat})   { next; }
    if (defined $pagesDone{$curPage}) { next; }
    $pagesDone{$curPage}++;
    $pagesInCat++;
    ($verbose) && warn "Page matches: '$curPage'\n";
    ($justcount) && next;

    # Turn page title into a filename, the same as categoryVocabulary did.
    #
    $curPage =~ s/ /_/g;
    my $prefix = lc(substr($curPage,0,$nPrefix));
    my $pagepath = "$pagedir/$prefix/$curPage.$pageext";
    if (!open(PAGE,"<$pagepath")) {
        ($quiet) || warn "Couldn't open page $pagesInCat: '$pagepath'\n";
        ($force) || die "Terminating.\n";
        $pageOpenFailed++;
    }
    else {
        ($verbose) && warn "Opened page vocab file $pagesInCat: '$pagepath'\n";
        binmode PAGE, ":utf8";
        my $pnum = 0;           # Shadows outer $pnum: per-page line counter.
        my $typeCount = 0;
        while (my $pline = <PAGE>) {    # FIX: restored eaten <PAGE> handle.
            $pnum++;
            chomp $pline;
            # NOTE(review): this parse was garbled in the mangled source;
            # the pattern is reconstructed from the format this program itself
            # writes for catVocab files ('%10d: %s') -- confirm against a
            # clean copy.
            if ($pline =~ m/^\s*(\d+):\s+(.*)$/) {
                my $n = $1;
                my $w = $2;
                ($verbose > 1) && warn "Line $pnum parses to '$n' '$w'\n";
                if ($ignoreCase) { $w = lc($w); }
                if ($n !~ m/^\d+$/) { warn "Bad num: '$pline'\n"; }
                $catVocab{$w} += $n;
                $typeCount++;
            }
            else {
                ($verbose) && warn "  Line $pnum parse fails: '$pline'\n";
            }
        } # PAGE EOF
        close PAGE;
    }
} # PAGES EOF

($pagesInCat>0) || die "No pages found in category(s) " .
    join(", ", keys %theCats) . ".\n";
(scalar keys %catVocab > 0) ||
    die "No words were loaded. All files missing?\n";
$catVocabSize = scalar keys %catVocab;

warn sprintf(
    "%d of %d page-category pairs (%8.5f%%) match category(s); %d not found.\n",
    $pagesInCat, $pnum, $pagesInCat*100.0/$pnum, $pageOpenFailed);
($justcount) && exit;
warn "    for $catVocabSize word-types.\n";

if ($catVocabWrite) {
    # FIX: was 'open ... || die', where || binds to the filename string and
    # the die could never fire; low-precedence 'or' restores the check.
    open CV, ">$theCatsArray[0].catVocab.list" or die "Couldn't open CV\n";
    binmode CV, ":utf8";
    for my $v (sort keys %catVocab) {
        print CV sprintf("    %10d: %s\n", $catVocab{$v}, $v);
    }
    close CV;
}

# Load the corresponding word frequencies from the overall ranklist.
#
my %wCatVocab = ();
keys(%wCatVocab) = 1<<14;       # Pre-allocate for speed
my $lnum = 0;
($quiet) || warn "Starting to scan union vocab file...\n";
while (my $l = <TOTAL>) {       # FIX: restored eaten <TOTAL> handle.
    $lnum++;
    ($tickInterval && $lnum % $tickInterval == 0) && warn sprintf(
        "Processed %8d union vocab lines, words matched: %5d.\n",
        $lnum, scalar keys %wCatVocab);
    chomp $l;
    $l =~ m/^\s*(\d+):\s+(.*)$/;
    if (!defined $2) { next; }  # Could verify s, too.
    my $n = $1 - 0;
    my $w = $2;
    if ($ignoreCase) { $w = lc($w); }
    if (defined $catVocab{$w}) { $wCatVocab{$w} += $n; }
}
if (scalar keys %wCatVocab != $catVocabSize) {
    ($verbose) && warn "Size out of sync.\n";
}

# Do the tf-idf calculation.
# Why is log() traditionally in there?
#
my %relevance = ();
my $pass = my $fail = 0;
if ($verbose) {
    warn "Starting the tf-idf calculation ($pagesInCat pages, " .
        (scalar keys %wCatVocab) . " words.\n";
}
for my $v (keys %wCatVocab) {
    if ($filter && !filter($v)) { next }
    # %wCatVocab values are >= 1, so log() >= 0 and the divisor >= 1.0;
    # every key of %wCatVocab is also in %catVocab by construction above.
    my $divisor = 1.0 + log($wCatVocab{$v});
    my $x = $catVocab{$v} / $divisor;
    if ($x >= $threshold) {
        $relevance{$v} = $x;
        $pass++;
    }
    else {
        $fail++;
    }
}

# Report the main result: tf-idf table
#
# FIX: was join(", ", %theCats), which flattens keys AND counts into the
# header (e.g. "Myths, 1"); only the category names belong here.
print "# tf-idf calculation for category(s) " .
    join(", ", sort keys %theCats) . "\n";
print "# $pagesInCat pages attempted, $pageOpenFailed failed to open.\n";
print "# " .
    "words from category pages: " . (scalar keys %catVocab) .
    ", from Wiki total: " . (scalar keys %wCatVocab) .
    ", above threshold:" . (scalar keys %relevance) . ".\n";
print "#   tf-idf    cat occ   wiki occ   word\n";
for my $v (sort byrank keys %relevance) {
    if ($wCatVocab{$v} > $wikiFrequencyLimit) { next; }
    print sprintf("    %8.5f: %8d: %8d: %s\n",
                  $relevance{$v}, $catVocab{$v}, $wCatVocab{$v}, $v);
}

($quiet) || warn "Done.\n";

exit;


###############################################################################
# Comparator for sorting %relevance: by descending tf-idf score when -rank
# is on (the default), else plain string order.
#
sub byrank { # to sort %vocab
    if ($rank) {
        return ($relevance{$b} <=> $relevance{$a});
    }
    if ($a lt $b) { return(-1); }
    if ($a gt $b) { return(1); }
    return(0);
}

# Discard words that we don't want: anything containing a non-word
# character, or shorter than 4 characters.
#
sub filter {
    if ($_[0] =~ m/[^\w]/) { return(0); }
    if (length($_[0])<4)   { return(0); }
    return(1);
}


###############################################################################
# Print the POD usage text. (Not called directly; -h shells out to perldoc.)
# NOTE(review): the I<...> POD formatting codes below were eaten by the same
# HTML-stripping pass that mangled this file; the metavariable names are
# reconstructed -- confirm against a clean copy.
#
sub showUsage {
    warn "

=head1 Usage

wordRanks [options] file

1: Scan a file that lists all the WikiPedia pages for a given category.
(generated by categoryVocabulary -collectPagesByCategory).
Retrieve and sum the word-frequency lists for each page found,

2: Retreive the similar union list for all of WikiPedia
(generated by sumVocabForCategory -category '*')

3: Run a tf-idf calculation to estimate how characteristic each
word is for the category.

4: Print out the words that are above a certain threshold.

=head1 Options

(prefix 'no' to negate where applicable)

=over

=item * B<-category c>

A category from WikiPedia to look for (repeatable).

=item * B<-catListFile> I<file>

Where to find the pagetitle\tcategory file (default = $dft_catListFile).
'|...' is removed from category names.

=item * B<-catVocab>

Write out the summed vocab from the pages of the chosen -category.

=item * B<-filter>

Discard various words a priori (by length, charset, etc)

=item * B<-force>

Ok if some page-vocab files not found (default).

=item * B<-ignorecase>

Union data for words differing only in case.

=item * B<-justcount>

Just count how many pages for the category.

=item * B<-lineends> I<t>

Assume Unix, Dos, or Mac line-breaks for input.

=item * B<-nPrefix> I<n>

How many letters from start of page-title, to use to choose
subdirectory of I<-pagedir> (to keep directories from getting
slow due to too many files).

=item * B<-olineends> I<t>

Write Unix, Dos, or Mac line-breaks for output.

=item * B<-pagedir> I<path>

Specify path to the dir of per-page word-freq files.

=item * B<-q>

Suppress most messages.

=item * B<-rank>

Sort output by tf-idf rank (default).

=item * B<-tick> I<n>

Report progress every I<n> records (0 to turn off).

=item * B<-threshold> I<n>

Only issue words with relevence > I<n>.

=item * B<-unicode>

Assume input is utf-8 (default).

=item * B<-v>

Add more messages (repeatable).

=item * B<-version>

Show version/license info and exit.

=item * B<-wikiFrequencyLimit> I<n>

Discard words that occur > this many times in all of Wikipedia.

=back

=head1 Known bugs and limitations

The all-WikiPedia union vocabulary file should list # of pages the
word shows up in, not just the total number of occurrences, so we can
use that as the tf-idf divisor.

Filenames for the required files are hard-coded for the moment:
    List of page:category pairs: $dft_catListFile
    Total W word frequencies: $dft_wRankList

=head1 Related commands

categoryVocabulary, splitBitDir, sumVocabForCategory.

=head1 Ownership

By Steven J. DeRose, OpenAmplify.com.

=cut
";
}