#!/usr/bin/perl -w
#
# runWordRanks: runCategories + wordRanks.
#
# Keep calling the main/master machine to get categories, get hook words
# for them with internal code, save, and go round again.
#
# sjd, 2009-09.
# 2009-10-05 sjd: Integrate and move union word list to a hash instead of
#    spinning the 50MB file once per category.... Refactor and clean up.
#

use strict;
use Getopt::Long;
use Unicode::Normalize;

my $version = "2010-09-12";

# Default values for various options (dcl so we can print in -help)
#
my $dft_master                 = "domU-12-31-39-00-50-74.compute-1.internal";
my $dft_pagedir                = "/mnt/wikiData/wikiSplit";
my $dft_unionWordFreqsFile     = "_union_ranklist.txt";
my $dft_catListFile            = "pageCats.list";
my $dft_threshold              = 0.5;
my $dft_wikiFrequencyLimit     = 50000;


# (See showUsage() at end of this file for explanations of these options)
#
my @theCatsArray  = ();              # Requested category(s)
my $catListFile   = $dft_catListFile;# List of pages for each category
my $justcount     = 0;               # Just count pages per category
my $filter        = 0;               # Discard bogus 'words'?
my $force         = 1;               # Survive missing page files
my $ignoreCase    = 0;
my U      = "U";
my $master        = $dft_master;     # Domain name of master server
my $nPrefix       = 2;               # Split pagedir into smaller subdirs
my $olineends   = "U";
my $pagedir       = $dft_pagedir;    # Where to find per-page word-freq files
my $pageext       = "vocab";         # Extension for vocab output files
my $quiet         = 0;      
my $rank          = 1;               # Sort output by rank, not alpha
my $standalone    = 0;               # Don't use category server
my $tickInterval  = 100000;          # Progress reports how often
my $threshold     = $dft_threshold;  # Don't issue hook words < this
my $unicode       = 1;               # I/o in utf8
my $unionWordFreqsFile = $dft_unionWordFreqsFile;
my $verbose       = 0;
my $wikiFrequencyLimit = $dft_wikiFrequencyLimit;


###############################################################################
#
Getopt::Long::Configure ("ignore_case");
my $result = GetOptions(
    "cat|category=s"          => \@theCatsArray,
    "catListFile=s"           => \$catListFile,
    "filter!"                 => \$filter,
    "force!"                  => \$force,
    "h|help"                  => sub { system "perldoc runWordRanks"; exit; },
    "ignoreCase!"             => \$ignoreCase,
    "justcount!"              => \$justcount,
    "lineends=s"              => \U,
    "master=s"                => \$master,
    "n|nPrefix=n"             => \$nPrefix,
    "olineends=s"           => \$olineends,
    "pagedir=s"               => \$pagedir,
    "quiet!"                  => \$quiet,
    "rank!"                   => \$rank,
    "standalone!"             => \$standalone,
    "threshold=f"             => \$threshold,
    "tick=n"                  => \$tickInterval,
    "unicode!"                => sub { $iencoding = "utf8"; },    
    "uwf|unionWordFreqsFile=s"=> \$unionWordFreqsFile,
    "v+"                      => \$verbose,
    "version"                 => sub {
        die "Version of $version, by Steven J. DeRose.\n";
    },
    "wikiFrequencyLimit=n"    => \$wikiFrequencyLimit,
    );


###############################################################################
# If we're running without a master server (that is, if we're getting one or
# a few specific category names from options, instead of doing a whole series),
# then move the options into a hash, or scream if there aren't any.
#
my %theCats = ();
if ($standalone) {
  for my $c (@theCatsArray) {
    $theCats{$c}++;
	die "-cat no supported when using category server.\n"
  }
  if ($verbose) {
	for my $c (sort keys %theCats) {
	  warn "Category: '$c'\n";
	}
  }
  (scalar keys %theCats > 0) || die "No -category(s) specified.\n";
  (-f $catListFile) ||
    die "Can't find page-category list '$catListFile'\n";
  open(PAGES, "<$catListFile") || die "Couldn't open $catListFile\n";
  binmode PAGES, ":utf8";
  ($verbose) && warn "PAGES file '$catListFile' opened successfully.\n";
} # standalone


my $requestURI = "http://$master/cgi-bin/nextCategory.pl";

if ($pagedir) {
    (-d $pagedir) ||
	  die "Can't find per-page word-freq directory '$pagedir'\n"; 
}
if ($nPrefix > 0) { # Make sure we have right pagedir subdirs
    my $aSubdir = "$pagedir/" . ("a" x $nPrefix);
    (-d $aSubdir) ||
	  die "Can't find per-page word-freq subdirectory '$aSubdir'\n"; 
}

my %catVocab = ();             # Table of summed word-frequencies
my %catVocabPages = ();        # Table of num. of cat pages each word was in
my %relevance = ();            # Table of word with tf-idf values


###############################################################################
# Load the entire Wikipedia union word-frequency table (just once, please!)
#
my %unionWordFreqsHash = ();
loadUnionWordFreqsHash();


###############################################################################
# MAIN: ask the master server for categories, and run stats for each.
#
my $runCount   = 0;        # How many categories have we done?
my $pnum = 0;              # Record counter for current category
my %pagesDone = ();        # Avoid duplicates (if multiple cats)
my $pageOpenFailed = 0;    # Num. pages we couldn't find
my $pagesInCat = 0;        # Pages found to be in current category.

while(1) {
    my $catName = getCategoryFromServer();
	(defined $catName) || last;
	$theCats{$catName}++;

    my $outfile = makeFilename2($catName);
    ($verbose) && warn "Running category \#$runCount: '$catName'\n";

	# Do the big calculation
	#
	runOneCategory($catName, "$outfile.hooks");

	# Save the result back to the master
	#
    my $scpCmd = "scp '$outfile.hooks'"
	. " 'root\@$master:/hookwords/'";
    ($verbose) && warn "Running: $scpCmd\n";
    system "$scpCmd" ||
	warn "Couldn't scp '$outfile.hooks'\n";
    system "rm '$outfile.hooks'";
    $runCount++;
    warn "\n";
} # Forever

close PAGES;
warn "Done, handled $runCount categories.\n";
exit;


###############################################################################
# Scan the PAGES file (consisting of category:filename), and read all the files
# that it indicates are in the desired category (there's a later version of
# this file available, with each line having a category name followed by all
# pages in that category, tab-separated. Extracted by ___).
#
# Basically same code as 'wordRanks', except doesn't spin the union word-freqs
# file for every category -- just use pre-loaded hash.
#
sub runOneCategory {
  my $catName = $_[0];
  my $outfile = $_[1];
  my $catStartTime = time();

  %catVocab = ();            # Table of summed word-frequencies
  %catVocabPages = ();       # Table of num. of cat pages each word was in
  keys(%catVocab) = 1<<12;  
  keys(%catVocabPages) = 1<<12;  

  addUpAllPagesInCategory(); # Results stored in %catVocab

  if ($verbose) {
	warn "Starting the tf-idf calculation.\n";
  }
  my $pass = my $fail = 0;
  for my $v (keys %catVocab) {
	if ($filter && !filter($v)) { next; }
	my $divisor = 1.0 + log($unionWordFreqsHash{$v});
	my $x = $catVocab{$v} / $divisor;
	if ($x >= $threshold) { $relevance{$v} = $x; $pass++; }
	else { $fail++; }
  } # for category vocab

  report($outfile,$catStartTime,$pass,$fail);
} # runOneCategory


###############################################################################
# Scan the file that lists every category\tpage pairs, to find all pages that
# are in the desired category(s). For each such page, read it and add the count
# for each word to the total word-frequency table for the category. We also
# count how many *pages* each word is found in, since we might also want that
# for various statistical calculations.
#
sub addUpAllPagesInCategory {
  seek PAGES, 0, 0;          # rewind
  while (my $l = <PAGES>) {
    $pnum++;
    ($pnum % $tickInterval == 0) && warn 
	  sprintf("Processed %10d pagerefs, found %6d pages, %6d words.\n",
			  $pnum, $pagesInCat, (scalar keys %catVocab));
    chomp $l;
    $l =~ m/^(.*)\t(.*)\s*$/;
    if (!defined $2) { next; }
    my $curPage = $1;
    my $curCat = $2;
	my $curCatFilename = makeFilename2($curCat);
    if (!defined $theCats{$curCat})   { next; }
    if (defined $pagesDone{$curPage}) { next; }
    $pagesDone{$curPage} = 1;
    $pagesInCat++;
    ($verbose) && warn "Page matches: '$curPage'\n";
    ($justcount) && next;

    # Turn page title into a filename, the same as categoryVocabulary did.
    #
    my $curPageFilename = makeFilename($curPage);
    my $prefix = lc(substr($curPage,0,$nPrefix));
    my $pagepath = "$pagedir/$prefix/$curPage.$pageext";
    if (!open(PAGE,"<$pagepath")) {
	    ($quiet) || warn "Couldn't open page $pagesInCat: '$pagepath'\n";
	    ($force) || die "Terminating.\n";
	    $pageOpenFailed++;
		next;
    }
	binmode PAGE, ":utf8";
	($verbose) &&
	    warn "Opened page vocab file $pagesInCat: '$pagepath'\n";

	my $pnum = 0; my $typeCount = 0;
	while (my $pline = <PAGE>) {
	    $pnum++;
	    chomp $pline;
	    ($pline =~ m/^\s*</ || $pline =~ m/^\s*\[\[/ || $pline eq "")
		&& next;
	    $pline =~ m/^\s*(\d+):\s+(.*)$/;
	    if ($2 && $1) {
		my $n = $1; my $w = $2;
		($verbose>1) && warn "Line $pnum parses to '$n' '$w'\n";
		if ($ignoreCase) { $w = lc($w); }
		if ($n !~ m/^\d+$/) { warn "Bad num: '$pline'\n"; }
		$catVocab{$w} += $n;
		$catVocabPages{$w}++;
		$typeCount++;
	    }
	    else {
		($verbose) && warn "    Line $pnum parse fails: '$pline'\n";
	    }
	} # PAGE EOF
	close PAGE;
  } # PAGES EOF

  ($pagesInCat>0) ||
    die "No pages found in category(s) " . join(", ", keys %theCats) . ".\n";
  (scalar keys %catVocab > 0) || 
    die "No words were loaded. All files missing?\n";

  warn sprintf(
   "%d of %d page-category pairs (%8.5f%%) match category(s); %d not found.\n",
   $pagesInCat, $pnum, $pagesInCat*100.0/$pnum, $pageOpenFailed);
  ($justcount) && exit;
  warn "    for " . (scalar keys %catVocab) . " word-types.\n";
} # addUpAllPagesInCategory


###############################################################################
# Report the main result: tf-idf table.
#
sub report {
  my $tgtfile = $_[0];
  # my $catStartTime = $_[1];
  # my $elapsed = time() - $catStartTime;

  print $tgtfile
    "# tf-idf calculation for category(s) " . join(", ",%theCats) . "\n";
  print $tgtfile
    "# $pagesInCat pages attempted, $pageOpenFailed failed to open.\n";
  print  $tgtfile
    "# words from category pages: " . (scalar keys %catVocab) .
	", from Wiki total: " . (scalar keys %unionWordFreqsHash) .
	", above threshold:" . (scalar keys %relevance) . ".\n";
  print $tgtfile "# tf-idf    cat occ   wiki occ     word\n";
  for my $v (sort byrank keys %relevance) {
    if ($unionWordFreqsHash{$v} > $wikiFrequencyLimit) { next; }
    print $tgtfile sprintf(" %8.5f: %8d: %8d: %s\n",
	  $relevance{$v}, $catVocab{$v}, $unionWordFreqsHash{$v}, $v);
  }
}


###############################################################################
# Allow for sorting the final tf-idf report alphabetically or by rank.
#
sub byrank { # to sort %vocab
  if ($rank) {
	return ($relevance{$b} <=> $relevance{$a});
  }
  if ($a lt $b) { return(-1); }
  if ($a gt $b) { return(1); }
  return(0);
}


###############################################################################
# Discard words that we obviously don't want....
#
sub filter {
  if ($_[0] =~ m/[^\w]/) { return(0); }
  if (length($_[0])<4)   { return(0); }
  return(1);
}


###############################################################################
# Load the union word-freqs table into a mongo hash for speed. Expect
# about 3.8M words (but we don't store the ~54% of them that only occur once,
# because we can just assume '1' if we don't find an entry).
#
sub loadUnionWordFreqsHash {
  warn "Loading union word-frequency table...\n";
  open(UWF, "<$unionWordFreqsFile") ||
    die "No union word-freqs list found at '$unionWordFreqsFile'.\n";
  binmode UWF, ":utf8";
  keys(%unionWordFreqsHash) = 1<<20;
  my $lnum = 0;
  while (my $l = <UWF>) {
    $lnum++;
    ($lnum % $tickInterval == 0) && warn sprintf(
	  "Scanned %8d union word-freqs lines (sorted?), loaded %8d.\n",
	  $lnum, scalar keys %unionWordFreqsHash);
    chomp $l;
    $l =~ m/\s*(\d+):\s*(.*)$/;
	if (defined $1 && $1 <= 1) { next; } # Don't store hapax legomena.
	$unionWordFreqsHash{$2} = $1;
  }
  close UWF;
  warn "Union word frequency hash has been loaded.\n\n";
} # loadUnionWordFreqsHash


###############################################################################
# Get the next category name from the master server.
#
sub getCategoryFromServer {
    warn "Requesting category from '$requestURI'\n";
    my $catName = `curl --silent $requestURI`;
    chomp $catName;
    if ($catName eq "#EOF") {
	    warn "Hit EOF.\n";
        return();
    }
    if ($catName eq "") {
	  warn "Empty category name from master.\n";
	  return("");
    }
	return($catName);
}


###############################################################################
###############################################################################
# makeFilename() cleans up Wikipedia category names to use as filenames.
# 
# This is because (at least in the first cut), script 'categoryVocabulary'
# writes the word-frequency table for each page to a separate file. So we get
# rid of nearly all punctuation, case distinctions, etc.; but we need to keep
# all that info so that pages with similar titles don't collide. Newer plan is
# to put all the word-freq tables into *one* file (save zillions of fopen()s),
# and make another file with an index from (raw) page-titles to the offsets of
# their portions of the big file.
#
###############################################################################
# Safe way to make filename -- preserves all info, but in case-insens ASCII.
# (xxx) for anything but [a-zA-Z0-9] Put "_" before A-Z.
# Note: Can't do a whole path, will toast [/.\\:]
# (copied from categoryVocabulary -- keep in sync!!!)
# NOTE: Not compatible with earlier makeFilename()!
#
sub makeFilename2 {
    my $rc = "";
    $_[0] =~ s/\|.*$//; # Remove 'sortby' field if present
    for (my $i=0; $i<length($_[0]); $i++) {
        my $c = substr($_[0],$i,1);
        my $o = ord($c);
        if ($o >= ord("a") && $o <= ord("z") ||
            $o >= ord("0") && $o <= ord("9")) {
           $rc .= $c;
        }
        elsif ($o >= ord("A") && $o <= ord("Z")) {
           $rc .= "_" . lc($c);
        }
        if ($o >= ord("a") && $o <= ord("z")) {
           $rc .= sprintf("(%x)", $o);
        }
    }
    return($rc);
} # makeFilename2


# Version to match filenames originally generated by categoryVocabulary.
# Bad idea because fixing name-conflicts can't be done compatibly at generate
# and search time (generator finds conflict and renames, but how would we
# know which number suffix to use when searching for any of the titles later?
#
# Only here for backwards-(broken)-compatibility. Need to re-run the whole
# Wikipedia scan to fix this for real (and should store the unmodified page
# title within each page file to prevent later similar problems.
#
sub makeFilename {
    my $nameConflicts = 0;
    my $f = $_[0]; my $ext = $_[1];
    $f =~ s/[^\w\d\.]/_/g;
    $f =~ s/__+/_/g;
    $f =~ s/^\./_/;
    $f = lc($f);

    my $prefix = "";
    if ($nPrefix > 0) {
	  $prefix = substr($f."_______",0,$nPrefix) . "/";
	  if (!-e $prefix) {
	    system "mkdir -p '$prefix'" || warn "Couldn't make dir '$prefix'\n";
	  }
    }

    my $suf = 1;
    my $newf = $f;
    while (-f "$newf.$ext") { # Count until no filename conflict
        $suf++;
        $newf = sprintf("%s%04d",$f,$suf);
    }
    if ($newf ne $f) { $nameConflicts++; }
    return("$newf.$ext");
} # makeFilename


###############################################################################
#
sub showUsage {
    warn "

=head1 Usage

wordRanks [options] file

0:  Load the union Wikipedia word-frequencies list into a (big) hash.
(generated by sumVocabForCategory -category '*' or ____)

1:  Fetch a category from the master cloud server and:

a:  Scan a file that lists all the WikiPedia pages for a given category.
(generated by categoryVocabulary -collectPagesByCategory).
Retrieve and sum the word-frequency lists for each page found,

b:  Estimate how characteristic each word is for the category.
c:  Print out the words that come out above a certain threshold (and
that aren't discardable for some other obvious reason).


=head1 Options

(prefix 'no' to negate where applicable)

=over

=item * B<-filter> Discard various words a prior (by length, charset, etc)

=item * B<-force>            Ok if some page-vocab files not found (default).

=item * B<-ignorecase>       Union data for words differing only in case.

=item * B<-justcount>        Just count how many pages for the category.

=item * B<-lineends t>       Assume Unix, Dos, or Mac line-breaks for input.

=item * B<-master domain> Domain for the master machine to supply category names
(default = $dft_master). Note: clouds used different
URIs for internal and external access.

=item * B<-nPrefix n> How many letters from start of page-title, to use to
choose subdirectory of -pagedir (to keep directories
from getting slow due to too many files).

=item * B<-olineends t>    Write Unix, Dos, or Mac line-breaks for output.

=item * B<-pagedir path> Specify path to the dir of per-page word-freq files.
If -nprefix is n, this must have subdirs for each
n-char prefix of page titles, to keep dirs smaller
(would be better to hash the name for dispersion).

=item * B<-q>                Suppress most messages.

=item * B<-rank>             Sort output by tf-idf rank (default).

=item * B<-standalone>       Get categories from a file, not a server.

=item * B<-tick n>           Report progress every n records (0 to turn off).

=item * B<-threshold n> Only issue words with relevence > n (dft $dft_threshold)

=item * B<-unicode>          Assume input is utf-8 (default).

=item * B<-v>                Add more messages (repeatable).

=item * B<-version>          Show version/license info and exit.

=item * B<-wikiFrequencyLimit n>   Discard words that occur > this many times in
total Wikipedia (default = $dft_wikiFrequencyLimit).

=back


=head1 Options unused

(used for standalone/non-server wordRanks):

=over

=item * B<-category c>       A category from WikiPedia to look for (repeatable).

=item * B<-catListFile path> Where to find the pagetitle\tcategory file 
when not using server (default = $dft_catListFile).

=back


=head1 Known bugs and limitations

The all-WikiPedia union vocabulary file should list num pages the word
shows up in, not just the total number of occurrences, so we can use
that in tf-idf calculation.

Master script could load up whole page-list and pass back list of pages
instead of just category name, to save spinning that whole file.

Should rebuild with all word-freq tables in one file and use seek().


=head1 Related commands

categoryVocabulary, splitBitDir, sumVocabForCategory, wordRanks,
nextCategory, runCategories.


=head1 Ownership

This work by Steven J. DeRose is licensed under a Creative Commons 
Attribution-Share Alike 3.0 Unported License. For further information on
this license, see http://creativecommons.org/licenses/by-sa/3.0/.

The author's present email is sderose at acm.org.

For the most recent version, see http://www.derose.net/steve/utilities/.

=cut
";
}