#!/usr/bin/perl -w
#
# xml2tab: Convert XHTML table(s) to a tab-delimited (or similar) file.
#
# 2009-06-05: Written by Steven J. DeRose..
#     Based on tab2xml.
# 2009-11-05 sjd: Add -unicode. Avoid uninitialized vars. Fix quoting.
#     Handle names in thead or @class. 
# 2010-11-17 sjd: Cleanup.
#
# To do:
#    Fix spaces in header/property names.
#    It's not escaping all the quotes.
#    Test -multifact support for p (etc) tagging.
#    Provide a way to specify oquote/odelim/oescape by ord().
#    Write Manchester -> XHTML table converter (inverse of this).
#    Provide a way to copy attributes, esp. hrefs.
#    Option to align fact values (pad names).
#

use strict;
use Getopt::Long;
use csvFormat;

my $version = '2011-02-10';

# Built-in defaults for the output-format options.
my $dft_odelim  = "\t";
my $dft_oquote  = '"';
my $dft_oescape = '\\';
my $dft_catalog = '';   # "$ENV{XML_CATALOG}";


###############################################################################
# Options:
#
my $catalog    = $dft_catalog;
my $class      = '';
my $manchester = 0;             # Emit Manchester OWL syntax instead of CSV
my $multifact  = 0;             # Allow more than one fact per cell
my $norm       = 1;             # Normalize whitespace in cell text?
my $nullValue  = '';            # Cells matching this regex are ignored

# Output-format settings.
my $ocomments  = 1;             # Copy -comments lines to output?
my $odelim     = $dft_odelim;   # Field separator to use
my $oescape    = $dft_oescape;  # Escape char for quotes?
my $oheader    = 0;             # Write first record with field names?
my $oquote     = $dft_oquote;   # Quoting character to use
my $oqwhere    = 'delim';       # Which fields to quote? none|all|num|text|delim
my $olineends  = 'u';           # Line-end type to use

# Messaging and header-source settings.
my $quiet      = 0;
my $useClasses = 0;             # Take output field names from @class
my $verbose    = 0;


###############################################################################
# Process options
#
# Option names are matched without regard to case.
Getopt::Long::Configure ("ignore_case");
my $result = GetOptions(
    "class=s"           => \$class,
    "h|help|?"          => sub { system "perldoc xml2tab"; exit; },
    "manchester!"       => \$manchester,
    "multifact!"        => \$multifact,
    "norm!"             => \$norm,
    "nullValue=s"       => \$nullValue,
    "ocomments"         => \$ocomments,
    "odelim=s"          => \$odelim,
    "oescape=s"         => \$oescape,
    "oheader"           => \$oheader,
    "olineends=s"       => sub {
        # Getopt::Long passes (option name, option value) to callbacks.
        # Only the first letter of the value matters; an empty value
        # falls back to "U".
        my ($optName, $optValue) = @_;
        $olineends = uc(substr($optValue . "U", 0, 1));
    },
    # Takes the quote character as its value (it was formerly a bare flag,
    # which set the quote character to 1 and left the value in @ARGV).
    "oquote=s"          => \$oquote,
    "oqwhere=s"         => \$oqwhere,
    "q|quiet!"          => \$quiet,
    "useClasses!"       => \$useClasses,
    "v|verbose+"        => \$verbose,
    "version"           => sub {
        die "Version of $version, by Steven J. DeRose.\n";
    }
    );

($result) || die "Bad options.\n";

($verbose && $nullValue) &&
    warn "nullValue is '$nullValue'.\n";


###############################################################################
# Validate and default options
#

# The delimiter must be a non-empty string; test its length rather than its
# truth so that a delimiter of "0" is accepted.
(defined $odelim && $odelim ne "") || die "Bad value for -odelim option.\n";

# Map the one-letter line-end code to the actual character sequence.
$olineends = lc($olineends);
if    ($olineends eq "u") { $olineends = chr(10); }            # Unix: LF
elsif ($olineends eq "m") { $olineends = chr(13); }            # Mac:  CR
elsif ($olineends eq "d") { $olineends = chr(13) . chr(10); }  # DOS:  CR LF
else { die "Unknown output line-end type '$olineends'.\n"; }

# -oqwhere must be one of the known quoting policies (case-insensitive).
$oqwhere = lc($oqwhere);
(grep { $oqwhere eq $_ } qw(none num text all delim))
    || die "Unknown value for '-oqwhere': '$oqwhere'.\n";


###############################################################################
#
# The single non-option argument is the XML file to read.
my $file = shift;
(defined $file && $file ne "") || die "No input XML file specified.\n";
(-f $file) || die "Can't find input XML file '$file'.\n";

($quiet) || warn "Starting parse of file '$file'.\n";

binmode STDOUT, ":utf8";

# Parser state shared by the handler subs below.
my $curline = 0;        # Line number last reported by the parser
my @tagStack =();       # Names of the currently open elements
my $inTable = 0;        # Number of <table> elements currently open
my $inHeaderRow = 0;    # Is the current row supplying header names?
my $fldNum = 0;         # Index of the current cell within its row (from 1)

my $tableCount = 0;     # Number of <table> start tags seen
my @fields = ();        # Cell text of the current row, indexed by $fldNum
my @fieldClasses = ();
my @headers = ();       # Header name per column, indexed by $fldNum

parseDocument($file);

($quiet) || warn "Done, $tableCount tables processed.\n";

exit;


###############################################################################
#
# parseDocument: Build an XML::Parser, attach this script's handlers, and
# parse the named file.
#   $path: path of the XML file to parse.
# Returns whatever parsefile() returns.
#
sub parseDocument {
    use XML::Parser;
    my ($path) = @_;

    my $parser = XML::Parser->new(ErrorContext => 2);

    # Catalog support is used only when the file-level $catalog names an
    # existing file. XML::Catalog is loaded on demand, so the script still
    # runs where that module is not installed.
    if ($catalog && -f $catalog) {
        if (eval { require XML::Catalog; 1 }) {
            my $catalogObj = XML::Catalog->new($catalog);
            $parser->setHandlers(
                ExternEnt  => $catalogObj->get_handler($parser));
        }
        else {
            warn "Can't load XML::Catalog; ignoring catalog '$catalog'.\n";
        }
    }

    $parser->setHandlers(
        Start      => \&startHandler,
        End        => \&endHandler,
        Init       => \&initHandler,
        Final      => \&finalHandler,
        Char       => \&charHandler,
        Comment    => \&commentHandler,
        DoctypeFin => \&doctypeFinHandler,
        Default    => \&defaultHandler);
    return $parser->parsefile($path);
} # sub parseDocument


###############################################################################
# initHandler: Registered as the parser's Init handler; just records the
# parser's current line number in $curline.
#
sub initHandler {
    my ($expat) = @_;
    $curline = $expat->current_line;
}

# finalHandler: Registered as the parser's Final handler; just records the
# parser's current line number in $curline.
sub finalHandler {
    my ($expat) = @_;
    $curline = $expat->current_line;
}

# startHandler: Called for each start tag.
#   $parser: the parser object (used for the current line number).
#   $name:   the element type name.
#   %attrs:  the element's attributes, passed as name/value pairs.
# Pushes $name onto @tagStack and updates the table/row/cell state.
#
sub startHandler {
    my ($parser, $name, %attrs) = @_;
    $curline = $parser->current_line;
    push @tagStack, $name;

    if ($name eq "table") { # Reset everything....
        # $inTable still holds the number of tables open *before* this one,
        # so any non-zero value means this table is nested.
        ($inTable > 0) && warn "WARNING: Nested table at line $curline.\n";
        $tableCount++;
        $inTable++;
        $inHeaderRow = 0;
        @fields = ();
        @headers = ();
        @fieldClasses = ();
    }
    elsif ($name eq "tr") {
        if (nestedIn("thead")) { $inHeaderRow = 1; }
        @fields = ();
        $fldNum = 0;
    }
    elsif ($name eq "th" ||
           ($name eq "td" && nestedIn("thead"))) {
        $fldNum++;
        # Only the first header row encountered supplies header names.
        if (scalar @headers == 0) {
            ($verbose>1) && warn "Found a <th>\n";
            $inHeaderRow = 1;
        }
    }
    elsif ($name eq "td") {
        $fldNum++;
        # Take the column name from @class when asked to, or when no header
        # is known yet -- but never replace a header with an absent class.
        if (defined $attrs{"class"}
            && ($useClasses || !$headers[$fldNum])) {
            $headers[$fldNum] = $attrs{"class"};
        }
    }
    elsif ($name eq "p") {
        # set up for -multifact: start each paragraph on its own line
        if ($multifact) { $fields[$fldNum] .= "\n"; }
    }
} # startHandler

# endHandler: Called for each end tag.
#   $parser: the parser object (used for the current line number).
#   $name:   the element type name.
# Closes tables, writes the Manchester preamble at the end of a thead,
# writes out each finished row, and pops @tagStack.
#
sub endHandler {
    my ($parser, $name) = @_;
    $curline = $parser->current_line;

    if ($name eq "table") {
        $inTable--;
    }
    elsif ($name eq "thead") {
        if ($manchester) {
            print "Ontology: <http://example.com/ontologies/$class>\n";
            print "\nClass: $class\n";
            # $f aliases the array element, so the whitespace removal is
            # applied to the stored header names as well.
            for my $f (@headers) {
                # Unused slots (e.g. index 0) are undefined; skip them.
                (defined $f) || next;
                $f =~ s/\s//g;
                ($f) && print "DataProperty: $f\n";
            }
        }
    }
    elsif ($name eq "tr") {
        writeRecord();
        @fields = ();
    }
    elsif ($name eq "p") {
        # -multifact: end each paragraph with a line break
        if ($multifact) {
            $fields[$fldNum] .= "\n";
        }
    }
    pop @tagStack;
}

{
    # True when the previous text chunk ended in, or consisted only of,
    # whitespace. The parser may deliver one run of text in several calls,
    # so a trimmed word break has to be remembered and restored if more
    # text follows in the same cell. Starts out undefined (false).
    my $pendingSpace;

    # charHandler: Called for each chunk of character data.
    #   $parser: the parser object (used for the current line number).
    #   $fldBuf: the text chunk.
    # Appends the (optionally whitespace-normalized, escaped) text to the
    # current field, and to the current header while in a header row.
    # Note that this only escapes quote/delimiter characters; it does not
    # wrap the field in quote characters.
    #
    sub charHandler {
        my ($parser, $fldBuf) = @_;
        $curline = $parser->current_line;
        if (!defined $fields[$fldNum]) { $fields[$fldNum] = ""; }

        if ($fldBuf =~ /^\s*$/) {
            $pendingSpace = 1;
            return;
        }
        ($inTable) || return;

        if ($norm) {
            my $needBreak = ($pendingSpace || $fldBuf =~ /^\s/) ? 1 : 0;
            $pendingSpace = ($fldBuf =~ /\s\z/) ? 1 : 0;
            $fldBuf =~ s/\s+/ /g;
            $fldBuf =~ s/^ //;
            $fldBuf =~ s/ $//;
            # Restore a single space between this chunk and earlier text of
            # the same cell, but never at the start of a cell or of a line.
            if ($needBreak && $fields[$fldNum] =~ /\S\z/) {
                $fldBuf = " " . $fldBuf;
            }
        }

        # \Q...\E keeps delimiter/quote characters such as "|" or "." from
        # being treated as regex operators. The length tests avoid an empty
        # pattern, which Perl would take to mean "repeat the last pattern".
        my $hasDelim = (length($odelim) && $fldBuf =~ m/\Q$odelim\E/);
        my $hasQuote = (length($oquote) && $fldBuf =~ m/\Q$oquote\E/);
        if ($oqwhere eq "all" ||
            ($oqwhere eq "num"   && $fldBuf =~ m/^\s*\d*\s*$/) ||
            ($oqwhere eq "text"  && $fldBuf =~ m/[^\s\d]/) ||
            ($oqwhere eq "delim" && ($hasDelim || $hasQuote))) {
            if (length($oquote)) {
                $fldBuf =~ s/\Q$oquote\E/$oescape$oquote/g;
            }
        }
        # The delimiter is escaped regardless of the quoting policy.
        if (length($odelim)) {
            $fldBuf =~ s/\Q$odelim\E/$oescape$odelim/g;
        }

        if ($inHeaderRow) {
            $headers[$fldNum] .= $fldBuf;
        }

        $fields[$fldNum] .= $fldBuf;
    }
}

# commentHandler: Called for each comment; the comment text itself is
# ignored and only the parser's current line number is recorded.
sub commentHandler {
    my ($expat) = @_;
    $curline = $expat->current_line;
}

# doctypeFinHandler: Registered as the DoctypeFin handler; only records
# the parser's current line number.
sub doctypeFinHandler {
    my ($expat) = @_;
    $curline = $expat->current_line;
} # doctypeFinHandler

# defaultHandler: Registered as the parser's Default handler. Records the
# current line number and, at verbosity above 1, reports the call.
sub defaultHandler {
    my ($expat) = @_;
    $curline = $expat->current_line;
    ($verbose > 1) and warn "Default Handler called at line $curline.\n";
} # defaultHandler


# Report whether an element with the given name is currently open, i.e.
# appears anywhere on @tagStack. Returns 1 if so, otherwise 0.
#
sub nestedIn {
    my ($wanted) = @_;
    my $hits = grep { $_ eq $wanted } @tagStack;
    return($hits ? 1 : 0);
}


###############################################################################
#
# writeRecord: Write out the row collected in @fields (slot 0 is unused;
# cells start at index 1).
#   With -manchester: the header row is skipped; otherwise cell 1 names an
#   Individual and every later non-null cell becomes a "Facts:" line
#   labelled with its column header. With -multifact each non-blank line
#   of a cell is a separate fact.
#   Otherwise: the cells are joined with $odelim and ended with $olineends.
#
sub writeRecord {
    # The header row ends here in every mode, so that text of later rows is
    # no longer appended to @headers.
    my $wasHeaderRow = $inHeaderRow;
    $inHeaderRow = 0;

    if ($manchester) {
        ($wasHeaderRow) && return;
        (scalar @fields > 1) || return;     # the row had no cells at all

        my $id = cleanID(defined $fields[1] ? $fields[1] : "");
        print "\nIndividual: $id$olineends";
        if ($class) { print "    Types: $class\n"; }
        for my $i (2 .. $#fields) {
            # A cell with no text at all is undefined; treat it as empty.
            my $cell = defined $fields[$i] ? $fields[$i] : "";
            my @values = ($cell);
            if ($multifact) {
                @values = grep { $_ =~ /\S/ } split(/[\r\n]+/, $cell);
            }
            if ($nullValue ne "") {
                @values = grep { $_ !~ m/$nullValue/ } @values;
            }
            (@values) || next;
            if (!$headers[$i]) {
                $headers[$i] = "???";
                warn "Missing header for column $i\n";
            }
            my @facts = map { "$headers[$i] \"$_\"" } @values;
            print "    Facts: " . join(", ", @facts) . $olineends;
        }
    } # manchester

    else {
        shift @fields;      # discard the unused slot 0
        if (scalar @fields > 0) {
            (defined $fields[0]) || warn "\n????\n";
            my $orec = join($odelim, map { defined $_ ? $_ : "" } @fields)
                . $olineends;
            print $orec;
        }
    }
} # writeRecord


###############################################################################
# Turn a string into a legitimate individual name: all whitespace is
# removed, and every remaining non-word character becomes an underscore.
#   $_[0]: the raw string; an undefined value is treated as empty.
# Returns the cleaned string (possibly empty).
#
sub cleanID {
    my ($buf) = @_;
    (defined $buf) || return("");
    $buf =~ s/\s//g;
    $buf =~ s/\W/_/g;       # \w already covers digits and underscore
    return($buf);
}


###############################################################################
#

=pod

=head1 Usage

xml2tab [options] file

Parses some XHTML to find the table(s), and converts it/them to csv,
or to some other formats such as Manchester OWL.

No colspans, rowspans, or nested tables.


=head1 Options

=over

=item * B<-class> I<s>      With I<-manchester>, 
assign this as 'Types' to each Individual.

=item * B<-id> I<i>         Process only tables with specified 
I<-id> (repeatable).(not yet)

=item * B<-norm>         Normalize whitespace.

=item * B<-manchester>   Output OWL Manchester ontology syntax instead. 
First column must be the OWL entity's identifier; each column should
be a certain fact about that Individual.
Fact names can come from THs; a THEAD containing TH or TD,
or (I<-useClasses>) from the 'class' attribute of the fact TD.

=item * B<-multifact>    With I<-manchester>, treat lines within a table
cell as multiple instances of that column's fact.

=item * B<-nonxml>       Generate other than XML output, as defined by [tbd].

=item * B<-null> I<regex>   If a column matches this, ignore (discard) it.

=item * B<-odelim> I<c>     Use this as field delimiter (default: tab)

=item * B<-oescape> I<c>    Use this character when escaping things.

=item * B<-oheader>         Write a header record with field names
(by default, taken from \@class).

=item * B<-olineends>    Use Unix/Dos/Mac line ends for output.

=item * B<-oquote>       Use this character as the quote 
(distinct open/close quotes are not yet supported).

=item * B<-oqwhere>      Which output fields to quote: none/all/num/text/delim
(default=delim: just quote fields that include the delim or quote char).

=item * B<-q>            Suppress most messages.

=item * B<-useClasses>   Label fact per 'class' attribute,
instead of TH contents.

=item * B<-v>            Add more detailed messages (to stderr).

=item * B<-version>      Display version info and exit.

=back


=head1 Related commands

=over

=item * tab2xml: does basically the opposite of this, except that it
can't read Manchester OWL.

=item * align: split lines into fields, and pad to line up.

=item * cut, paste.

=back


=head1 Ownership

This work by Steven J. DeRose is licensed under a Creative Commons 
Attribution-Share Alike 3.0 Unported License. For further information on
this license, see http://creativecommons.org/licenses/by-sa/3.0/.

The author's present email is sderose at acm.org.

For the most recent version, see http://www.derose.net/steve/utilities/.

=cut