#!/usr/bin/perl -w
#
# xml2tab: Convert the table(s) in an XHTML file to tab-delimited (or
#     similar) records, or to Manchester OWL syntax.
#
# 2009-06-05: Written by Steven J. DeRose.
#     Based on tab2xml.
# 2009-11-05 sjd: Add -unicode. Avoid uninitialized vars. Fix quoting.
#     Handle names in thead or @class.
# 2010-11-17 sjd: Cleanup.
#
# To do:
#     Fix spaces in header/property names.
#     It's not escaping all the quotes.
#     Test -multifact support for p (etc) tagging.
#     Provide a way to specify oquote/odelim/oescape by ord().
#     Write Manchester -> XHTML table converter (inverse of this).
#     Provide a way to copy attributes, esp. hrefs.
#     Option to align fact values (pad names).
#
use strict;
use warnings;
use Getopt::Long;
use XML::Parser;
use csvFormat;

my $version = "2011-02-10";

my $dft_odelim  = "\t";
my $dft_oquote  = "\"";
my $dft_oescape = "\\";
my $dft_catalog = "";    # "$ENV{XML_CATALOG}";


###############################################################################
# Options:
#
my $catalog    = $dft_catalog;
my $class      = "";
my $manchester = 0;             # Convert to Manchester OWL syntax
my $multifact  = 0;             # Have >1 fact per cell
my $norm       = 1;             # Normalize whitespace?
my $nullValue  = "";            # Ignore cells that match this regex
my $ocomments  = 1;             # Copy -comments lines to output?
my $odelim     = $dft_odelim;   # Field separator to use
my $oescape    = $dft_oescape;  # Escape char for quotes?
my $oheader    = 0;             # Write first record with field names?
my $oquote     = $dft_oquote;   # Quoting character to use
my $oqwhere    = "delim";       # Which fields to quote? none|all|num|text|delim
my $olineends  = "u";           # Line-end type to use
my $quiet      = 0;
my $useClasses = 0;             # Get output field names from @class
my $verbose    = 0;


###############################################################################
# Process options
#
Getopt::Long::Configure("ignore_case");
my $result = GetOptions(
    "class=s"      => \$class,
    "h|help|?"     => sub { system("perldoc", $0); exit; },
    "manchester!"  => \$manchester,
    "multifact!"   => \$multifact,
    "norm!"        => \$norm,
    "nullValue=s"  => \$nullValue,
    "ocomments!"   => \$ocomments,
    "odelim=s"     => \$odelim,
    "oescape=s"    => \$oescape,
    "oheader!"     => \$oheader,
    # Getopt::Long passes (option name, option value) to code-ref handlers;
    # only the first letter of the value matters (u, d, or m).
    "olineends=s"  => sub {
        my ($optName, $optValue) = @_;
        $olineends = substr($optValue . "u", 0, 1);
    },
    "oquote=s"     => \$oquote,
    "oqwhere=s"    => \$oqwhere,
    "q|quiet!"     => \$quiet,
    "useClasses!"  => \$useClasses,
    "v|verbose+"   => \$verbose,
    "version"      => sub {
        die "Version of $version, by Steven J. DeRose.\n";
    }
);

($result) || die "Bad options.\n";

($verbose && $nullValue) && warn "nullValue is '$nullValue'.\n";


###############################################################################
# Validate and default options
#
(defined $odelim && length $odelim)
    || die "Bad value for -odelim option.\n";

# Map the one-letter line-end code to the actual character sequence.
# DOS line ends are CR followed by LF.
my %lineEndFor = (
    u => chr(10),
    m => chr(13),
    d => chr(13) . chr(10),
);
$olineends = lc($olineends);
(exists $lineEndFor{$olineends})
    || die "Unknown output line-end type '$olineends'.\n";
$olineends = $lineEndFor{$olineends};

$oqwhere = lc($oqwhere);
($oqwhere =~ m/\A(?:none|all|num|text|delim)\z/)
    || die "Unknown value for '-oqwhere': '$oqwhere'.\n";

# The delimiter and quote are literal strings, not patterns, so they must be
# quoted before being matched (a delimiter such as "|" or "." would otherwise
# be treated as a regex metacharacter).
my $odelimRe = qr/\Q$odelim\E/;
my $oquoteRe = qr/\Q$oquote\E/;


###############################################################################
# Main
#
my $file = shift;
my $fileLabel = defined $file ? $file : "";
($fileLabel ne "" && -f $fileLabel)
    || die "Can't find input XML file '$fileLabel'.\n";

($quiet) || warn "Starting parse of file '$file'.\n";

binmode STDOUT, ":encoding(UTF-8)";

# Parser state shared by the handlers below.
my $curline      = 0;    # Line number of the most recent parser event
my @tagStack     = ();   # Names of the currently open elements
my $inTable      = 0;    # Depth of open <table> elements
my $inHeaderRow  = 0;    # True while collecting header-cell text
my $fldNum       = 0;    # 1-based index of the current cell within its row
my $tableCount   = 0;    # Number of <table> elements seen
my @fields       = ();   # Cell text of the current row (slot 0 is unused)
my @fieldClasses = ();
my @headers      = ();   # Field names, indexed like @fields

parseDocument($file);

($quiet) || warn "Done, $tableCount tables processed.\n";

exit;


###############################################################################
# parseDocument: Set up an XML::Parser with this script's handlers and parse
# the file whose path is passed as the only argument.
#
sub parseDocument {
    my ($path) = @_;

    my $parser = XML::Parser->new(ErrorContext => 2);

    # Optional catalog support. XML::Catalog is loaded only when a catalog
    # file was actually configured, so it is not needed otherwise.
    if ($catalog && -f $catalog) {
        require XML::Catalog;
        my $resolver = XML::Catalog->new($catalog);
        $parser->setHandlers(ExternEnt => $resolver->get_handler($parser));
    }

    $parser->setHandlers(
        Start      => \&startHandler,
        End        => \&endHandler,
        Init       => \&initHandler,
        Final      => \&finalHandler,
        Char       => \&charHandler,
        Comment    => \&commentHandler,
        DoctypeFin => \&doctypeFinHandler,
        Default    => \&defaultHandler
    );
    return $parser->parsefile($path);
} # parseDocument


###############################################################################
# initHandler: Called at start of parsing.
#
sub initHandler {
    $curline = $_[0]->current_line;
}

# finalHandler: Called at end of parsing.
#
sub finalHandler {
    $curline = $_[0]->current_line;
}

# startHandler: Called for each start tag with the parser, the element name,
# and the attributes as a flat name/value list. Tracks table/row/cell state.
#
sub startHandler {
    # NOTE(review): the attribute-collection loop was truncated in the source
    # as received; XML::Parser passes attributes as name/value pairs after
    # the element name, so they are gathered into a hash here.
    my ($parser, $name, %attrs) = @_;
    $curline = $parser->current_line;
    push @tagStack, $name;

    if ($name eq "table") {
        # NOTE(review): the comparison was partly lost in the source as
        # received. $inTable is incremented below, so any value >= 1 here
        # means a table is already open.
        ($inTable >= 1) && warn "WARNING: Nested table at line $curline.\n";
        $tableCount++;
        $inTable++;
        $inHeaderRow  = 0;
        @fields       = ();
        @headers      = ();
        @fieldClasses = ();
    }
    elsif ($name eq "tr") {
        if (nestedIn("thead")) { $inHeaderRow = 1; }
        @fields = ();
        $fldNum = 0;
    }
    elsif ($name eq "th" || ($name eq "td" && nestedIn("thead"))) {
        $fldNum++;
        # The first header cell seen for a table marks its row as the header.
        if (scalar @headers == 0) {
            ($verbose > 1) && warn "Found a <$name> header cell.\n";
            $inHeaderRow = 1;
        }
    }
    elsif ($name eq "td") {
        $fldNum++;
        # Fall back to (or, with -useClasses, prefer) @class as field name.
        if ($useClasses || !$headers[$fldNum]) {
            $headers[$fldNum] = $attrs{"class"};
        }
    }
    elsif ($name eq "p") {    # set up for -multifact
        if ($multifact) { $fields[$fldNum] .= "\n"; }
    }
} # startHandler

# endHandler: Called for each end tag with the parser and the element name.
# Writes a record at the end of each row, and the Manchester preamble at the
# end of thead.
#
sub endHandler {
    my ($parser, $name) = @_;
    $curline = $parser->current_line;

    if ($name eq "table") {
        $inTable--;
    }
    elsif ($name eq "thead") {
        if ($manchester) {
            print "Ontology: \n";
            print "\nClass: $class\n";
            # $f aliases the @headers element, so whitespace is stripped from
            # the stored header names too (they are reused as property names).
            for my $f (@headers) {
                next unless (defined $f);    # slot 0 / unnamed columns
                $f =~ s/\s//g;
                ($f) && print "DataProperty: $f\n";
            }
        }
    }
    elsif ($name eq "tr") {
        writeRecord();
        @fields = ();
    }
    elsif ($name eq "p") {
        if ($multifact) { $fields[$fldNum] .= "\n"; }
    }
    # Nothing to do at the end of th or td.

    pop @tagStack;
} # endHandler

# charHandler: Called with the parser and a chunk of character data.
# Appends the (normalized, escaped) text to the current cell, and to the
# current header name while in a header row.
#
# Note: normalization and escaping are applied per chunk, and a parser may
# deliver one text node in several chunks, so whitespace at a chunk boundary
# can be lost.
#
sub charHandler {
    my ($parser, $text) = @_;
    $curline = $parser->current_line;

    if (!defined $fields[$fldNum]) { $fields[$fldNum] = ""; }
    return if ($text =~ /^\s*$/);
    return unless ($inTable);

    if ($norm) {
        $text =~ s/\s+/ /g;
        $text =~ s/^ //;
        $text =~ s/ $//;
    }

    my $hasDelim = ($text =~ $odelimRe) ? 1 : 0;
    my $hasQuote = (length($oquote) && $text =~ $oquoteRe) ? 1 : 0;
    my $wantsQuoting =
        ($oqwhere eq "all")
        || ($oqwhere eq "num"   && $text =~ m/^\s*\d*\s*$/)
        || ($oqwhere eq "text"  && $text =~ m/[^\s\d]/)
        || ($oqwhere eq "delim" && ($hasDelim || $hasQuote));

    # Quote characters are escaped only for fields selected by -oqwhere;
    # embedded delimiters are always escaped.
    if ($wantsQuoting && length($oquote)) {
        $text =~ s/$oquoteRe/$oescape$oquote/g;
    }
    $text =~ s/$odelimRe/$oescape$odelim/g;

    if ($inHeaderRow) { $headers[$fldNum] .= $text; }
    $fields[$fldNum] .= $text;
} # charHandler

sub commentHandler {
    $curline = $_[0]->current_line;
}

sub doctypeFinHandler {
    $curline = $_[0]->current_line;
} # doctypeFinHandler

sub defaultHandler {
    $curline = $_[0]->current_line;
    ($verbose > 1) && warn "Default Handler called at line $curline.\n";
} # defaultHandler

# nestedIn: Return 1 if the given element name is on the open tag stack,
# otherwise 0.
#
sub nestedIn {
    my ($wanted) = @_;
    return((grep { $_ eq $wanted } @tagStack) ? 1 : 0);
}


###############################################################################
# writeRecord: Emit the row collected in @fields, either as a Manchester
# "Individual" or as one delimited record. Cells live in slots 1..n of
# @fields; slot 0 is never a real cell.
#
sub writeRecord {
    if ($manchester) {
        # The header row only supplies names; it is not an Individual.
        if ($inHeaderRow) {
            $inHeaderRow = 0;
            return;
        }
        my $id = cleanID(defined $fields[1] ? $fields[1] : "");
        print "\nIndividual: $id$olineends";
        if ($class) { print " Types: $class\n"; }

        # NOTE(review): the body of this loop was lost in the source as
        # received and has been reconstructed from the option descriptions
        # (-nullValue, -multifact). Confirm the exact "Facts:" output format
        # against a known-good copy of the script.
        for my $i (2 .. $#fields) {
            my $value = $fields[$i];
            next unless (defined $value && $value ne "");
            next if ($nullValue ne "" && $value =~ m/$nullValue/);
            my $prop = defined $headers[$i] ? $headers[$i] : "";
            $prop =~ s/\s//g;
            if ($prop eq "") {
                ($verbose) && warn "No property name for column $i"
                    . " at line $curline.\n";
                next;
            }
            # With -multifact, each non-blank line of a cell is its own fact.
            my @facts = $multifact
                ? (grep { /\S/ } split(/\n/, $value))
                : ($value);
            for my $fact (@facts) {
                print " Facts: $prop \"$fact\"$olineends";
            }
        }
    }
    else {
        # Stop appending data-cell text to the header names once the first
        # row (header or not) has been written.
        $inHeaderRow = 0;
        if (scalar(@fields) > 0) {
            # Skip the unused slot 0 so records do not start with an empty
            # column, and map never-filled cells to "".
            my @cells = map { defined $_ ? $_ : "" } @fields[1 .. $#fields];
            print join($odelim, @cells) . $olineends;
        }
    }
} # writeRecord


###############################################################################
# cleanID: Turn a string into a legitimate individual name by dropping all
# whitespace and replacing every remaining non-word character with "_".
#
sub cleanID {
    my ($buf) = @_;
    $buf = "" unless (defined $buf);
    $buf =~ s/\s//g;
    $buf =~ s/\W/_/g;
    return($buf);
}


###############################################################################
#

=pod

=head1 Usage

xml2tab [options] file

Parses some XHTML to find the table(s), and converts it/them to csv,
or to some other formats such as Manchester OWL.

No colspans, rowspans, or nested tables.

=head1 Options

=over

=item * B<-class> I<name>

With I<-manchester>, assign this as 'Types' to each Individual.

=item * B<-id> I<id>

Process only tables with specified I<-id> (repeatable). (not yet)

=item * B<-norm>

Normalize whitespace.

=item * B<-manchester>

Output OWL Manchester ontology syntax instead.
First column must be the OWL entity's identifier; each column should be
a certain fact about that Individual. Fact names can come from THs;
a THEAD containing TH or TD, or (I<-useClasses>) from the 'class' attribute
of the fact TD.

=item * B<-multifact>

With I<-manchester>, treat lines within a table cell as multiple
instances of that column's fact.

=item * B<-nonxml>

Generate other than XML output, as defined by [tbd].

=item * B<-null> I<regex>

If a column matches this, ignore (discard) it.

=item * B<-odelim> I<string>

Use this as field delimiter (default: tab).

=item * B<-oescape> I<char>

Use this character when escaping things.

=item * B<-oheader>

Write a header record with field names (by default, take them from @class).

=item * B<-olineends> I<type>

Use Unix (u), DOS (d), or Mac (m) line ends for output.

=item * B<-oquote> I<char>

Use this character as the quote
(distinct open/close quotes are not yet supported).

=item * B<-oqwhere> I<which>

Which output fields to quote: none/all/num/text/delim
(default=delim: just quote fields that include the delim or quote char).

=item * B<-q>

Suppress most messages.

=item * B<-useClasses>

Label fact per 'class' attribute, instead of TH contents.

=item * B<-v>

Add more detailed messages (to stderr).

=item * B<-version>

Display version info and exit.

=back


=head1 Related commands

=over

=item * tab2xml: does basically the opposite of this, except that it
can't read Manchester OWL.

=item * align: split lines into fields, and pad to line up.

=item * cut, paste.

=back


=head1 Ownership

This work by Steven J. DeRose is licensed under a Creative Commons
Attribution-Share Alike 3.0 Unported License. For further information on
this license, see http://creativecommons.org/licenses/by-sa/3.0/.

The author's present email is sderose at acm.org.

For the most recent version, see http://www.derose.net/steve/utilities/.

=cut