#!/usr/bin/perl -w
#
# saxtrace: List SAX events straight from parser.
#
# 2007-07-26: Written by Steven J. DeRose.
#     Built from normalizeXML.
#
# To do:
#
use strict;

my $version = "2010-09-12";

# Option defaults
# Read the environment defensively: interpolating an unset XML_CATALOG
# would raise an "uninitialized value" warning under -w.
my $default_catalog = defined $ENV{XML_CATALOG} ? $ENV{XML_CATALOG} : "";
my $ent             = 0;   # turn whitespace to entities?
my $catalog         = $default_catalog;
my $dropPentities   = 0;   # Exclude parameter entity declarations?
my $dropDoc         = 0;
my $fqgi            = 0;
my $includeDTD      = 0;   # Write out the whole DTD?
my $lineends        = 0;
my $lnum            = 0;   # Show source line numbers?
my $nows            = 0;   # Ignore whitespace nodes?
my $outLineends     = "U";
my $outputExtension = "";  # If non-nil, save to file(s), not stdout.
my $parseParamEnt   = 1;   # Handle external parameters entities?
my $startat         = 1;   # Don't show events before this line.
my $quiet           = 0;
my $verbose         = 0;

# Advance @ARGV to the value that must follow option $opt and return it.
# The value is left in $ARGV[0]; the shift at the bottom of the option loop
# consumes it. Dies if the command line ends before a value is supplied.
my $optionValue = sub {
    my ($opt) = @_;
    shift @ARGV;
    (@ARGV && defined $ARGV[0])
        || die "saxtrace: Option '$opt' requires a value.\n";
    return $ARGV[0];
};

# Parse leading options; the first argument that is not an option ends the
# loop, and everything remaining in @ARGV is treated as an input file.
while (@ARGV) {
    my $opt = $ARGV[0];
    # Accept GNU-style "--opt" as a synonym for "-opt".
    if (index($opt, "--") == 0) {
        $opt = substr($opt, 1);
    }

    if ($opt eq "-q") {
        $quiet = 1;
    }
    elsif ($opt eq "-catalog") {
        # NOTE(review): the usage text describes a "Use XML catalog 'name'"
        # option, but the original loop never parsed one, so $catalog could
        # not be overridden. The spelling "-catalog" is an assumption.
        $catalog = $optionValue->($opt);
    }
    elsif ($opt eq "-dtd") {
        $includeDTD = 1;
    }
    elsif ($opt eq "-ent") {
        $ent = 1;
    }
    elsif ($opt eq "-fqgi") {
        $fqgi = 1;
    }
    elsif ($opt eq "-linends" || $opt eq "-lineends") {
        $lineends = 1;
    }
    elsif ($opt eq "-lnum") {
        $lnum = 1;
    }
    elsif ($opt eq "-noparam") {
        $parseParamEnt = 0;
    }
    elsif ($opt eq "-nopentities") {
        $dropPentities = 1;
        $includeDTD    = 1;
    }
    elsif ($opt eq "-nows") {
        $nows = 1;
    }
    elsif ($opt eq "-out") {
        $outputExtension = $optionValue->($opt);
    }
    elsif ($opt eq "-outlinends" || $opt eq "-outlineends") {
        my $value = $optionValue->($opt);
        # Only the first letter matters: M, D or U (case-insensitive).
        $outLineends = uc(substr($value . "U", 0, 1));
        (index("MDU", $outLineends) >= 0)
            || die "Unknown output line-end type '$value'.\n";
    }
    elsif ($opt eq "-startat") {
        my $value = $optionValue->($opt);
        # Line 1 is the default and therefore a legal value; the original
        # test ($startat > 1) rejected it.
        ($value =~ /^\d+$/ && $value >= 1)
            || die "Bad -startat value $value.\n";
        $startat = $value + 0;
    }
    elsif ($opt eq "-v") {
        warn "Running in verbose mode.\n";
        $verbose = 1;
    }
    elsif ($opt eq "-version") {
        warn "Version of $version, by Steven J. DeRose.\n";
        exit;
    }
    elsif (substr($opt, 0, 1) eq "-") {
        ($opt eq '-h' or $opt eq '-help')
            || warn "saxtrace: Unknown option '$opt'.\n";
        showUsage();
        exit;
    }
    else {
        last;
    }
    shift @ARGV;
} # options

# Map the requested line-end style onto the actual character sequence.
my $newline = chr(10);
if ($outLineends eq "M") {
    $newline = chr(13);
}
elsif ($outLineends eq "D") {
    $newline = chr(13) . chr(10);
}

($quiet) || warn "saxtrace: Using catalog '$catalog'.\n";


###############################################################################
my @tagStack    = ();
my $pastDoctype = 0;    # To suppress comments from DTD
my $curline     = 0;
# A glob reference, because a bare STDOUT on the right-hand side of an
# assignment is rejected under "use strict".
my $fh          = \*STDOUT;

foreach my $file (@ARGV) {
    (-f $file) || die "Can't find input file '$file'.\n";

    if ($outputExtension) {
        my $outfile = $file;
        # Swap the final extension, or append one when the file name has
        # none. The character class excludes path separators so that a dot
        # in a directory name is never mistaken for the extension.
        ($outfile =~ s/\.[^.\/\\]+$/\.$outputExtension/)
            || ($outfile .= ".$outputExtension");
        # Opening the input itself for writing would truncate it before it
        # could be parsed.
        ($outfile ne $file)
            || die "Output file would overwrite input file '$file'.\n";
        # Open a fresh lexical handle: re-opening through $fh while it
        # still refers to STDOUT would redirect STDOUT itself.
        open(my $out, '>', $outfile)
            || die "Can't open output file '$outfile': $!\n";
        $fh = $out;
    }

    # The event handlers print to the currently selected handle, so select
    # the target for the duration of the parse; otherwise -out would create
    # empty files while the trace still went to stdout.
    my $previous = select($fh);
    parseDocument($file, $fh);
    select($previous);

    if ($outputExtension) {
        close($fh) || die "Can't close output file: $!\n";
        $fh = \*STDOUT;
    }
}

($quiet) || warn "Done.\n";

exit;


###############################################################################
#
# XML::Catalog original is at:
#     http://search.cpan.org/~ebohlman/XML-Catalog-0.02/Catalog.pm
#
###############################################################################
# parseDocument: Parse one XML file, reporting every event through the
# handler subs defined in this file.
#
# Arguments:
#     $_[0]  path of the XML file to parse.
#     $_[1]  output handle passed by the caller; it is not read here,
#            because the handlers print to the currently selected handle.
# Returns whatever XML::Parser's parsefile() returns.
#
# (The original empty prototype "()" is dropped: the sub is always called
# with arguments.)
sub parseDocument {
    use XML::Parser;
    use XML::Catalog;

    my ($file) = @_;

    # Distinct name so the catalog object does not shadow the file-level
    # $catalog string it is built from.
    my $catalogObj = XML::Catalog->new($catalog);

    my %parserOptions = (ErrorContext => 2);
    if ($parseParamEnt) {
        $parserOptions{ParseParamEnt} = 1;
    }
    my $parser = XML::Parser->new(%parserOptions);

    # NOTE(review): cdataStart and cdataEnd are defined elsewhere in this
    # file but are not registered here (nor were they originally), so CDATA
    # section boundaries are not reported as separate events.
    $parser->setHandlers(
        ExternEnt  => $catalogObj->get_handler($parser),
        Start      => \&startHandler,
        End        => \&endHandler,
        Init       => \&initHandler,
        Final      => \&finalHandler,
        Char       => \&charHandler,
        Proc       => \&procHandler,
        Comment    => \&commentHandler,
        Doctype    => \&doctypeHandler,
        DoctypeFin => \&doctypeFinHandler,
        Default    => \&defaultHandler,
    );

    # Declaration-level events are reported only when the DTD was asked for.
    if ($includeDTD) {
        $parser->setHandlers(
            Element => \&elementDclHandler,
            Attlist => \&attlistDclHandler,
            Entity  => \&entityDclHandler,
            XMLDecl => \&xmlDclHandler,
        );
    }

    return $parser->parsefile($file);
} # sub parseDocument
###############################################################################
#
# Regarding the XML::Parser module:
#
# API doc is at http://search.cpan.org/~msergeant/XML-Parser/Parser.pm
# There are a few entity-related events not caught here....
#
# Every handler receives the parser object as its first argument, records
# the parser's current line in the file-level $curline, and then reports
# the event through markerLine / infoLine / setupLine.

# initHandler: Called at start of parsing
sub initHandler {
    my ($parser) = @_;
    $curline = $parser->current_line;
    markerLine("INIT");
}

# finalHandler: Called at end of parsing
sub finalHandler {
    my ($parser) = @_;
    $curline = $parser->current_line;
    markerLine("FINAL");
}

# startHandler: Start tag. Pushes the element type onto @tagStack, then
# reports the name (the whole stack joined by "/" under -fqgi) followed by
# one line per attribute.
sub startHandler {
    my ($parser, $name, @attrs) = @_;
    $curline = $parser->current_line;
    push @tagStack, $name;
    markerLine("START");
    if ($fqgi) {
        $name = join("/", @tagStack);
    }
    infoLine("NAME", $name);
    # Attributes arrive as a flat name, value, name, value, ... list.
    while (@attrs) {
        my $attrName  = shift @attrs;
        my $attrValue = shift @attrs;
        infoLine($attrName, $attrValue);
    }
}

# endHandler: End tag. Reports the name, then pops @tagStack.
sub endHandler {
    my ($parser, $name) = @_;
    $curline = $parser->current_line;
    markerLine("END");
    if ($fqgi) {
        $name = join("/", @tagStack);
    }
    infoLine("NAME", $name);
    pop @tagStack;
}

# charHandler: Character data. Whitespace-only text is reported as SPACE
# (or dropped entirely under -nows); anything else as CHAR.
sub charHandler {
    my ($parser, $text) = @_;
    $curline = $parser->current_line;
    if ($text =~ /^\s*$/) {
        if ($nows) { return; }
        markerLine("SPACE");
    }
    else {
        markerLine("CHAR");
    }
    setupLine($text, "|", "|");
}

# procHandler: Processing instruction.
sub procHandler {
    my ($parser, $name, $value) = @_;
    $curline = $parser->current_line;
    markerLine("PROC");
    infoLine("NAME", $name);
    infoLine("VALUE", $value);
}

# commentHandler: Comment. Reported only once the DOCTYPE has been closed,
# unless the DTD was requested.
sub commentHandler {
    my ($parser, @pieces) = @_;
    $curline = $parser->current_line;
    if ($includeDTD || $pastDoctype) {
        markerLine("COMMENT");
        for my $piece (@pieces) {
            setupLine($piece, "|", "|");
        }
    }
}

# doctypeHandler: Start of the DOCTYPE declaration.
sub doctypeHandler {
    my ($parser, $name, $public, $system, $internal) = @_;
    $curline = $parser->current_line;
    markerLine("DOCTYPE");
    infoLine("NAME", $name);
    infoLine("PUBLIC", $public);
    infoLine("SYSTEM", $system);
    infoLine("INTERNAL", $internal);
} # doctypeHandler

# doctypeFinHandler: End of the DOCTYPE declaration. Sets $pastDoctype,
# which commentHandler and defaultHandler test before reporting.
sub doctypeFinHandler {
    my ($parser, @pieces) = @_;
    $curline = $parser->current_line;
    $pastDoctype = 1;
    markerLine("DOCTYPEFIN");
    for my $piece (@pieces) {
        setupLine($piece);
    }
} # doctypeFinHandler

# cdataStart / cdataEnd: Report CDATA section boundaries.
# NOTE(review): no setHandlers call in parseDocument registers these two
# subs, so as the file stands they are never invoked.
sub cdataStart {
    my ($parser, @pieces) = @_;
    $curline = $parser->current_line;
    markerLine("CDATASTART");
    for my $piece (@pieces) {
        setupLine($piece);
    }
}

sub cdataEnd {
    my ($parser, @pieces) = @_;
    $curline = $parser->current_line;
    markerLine("CDATAEND");
    for my $piece (@pieces) {
        setupLine($piece);
    }
}


###############################################################################
# DTD stuff
# (parseDocument registers these handlers only when $includeDTD is set.)

# entityDclHandler: Entity declaration.
sub entityDclHandler {
    my ($parser, $name, $value, $sysid, $pubid, $ndata, $isParam) = @_;
    $curline = $parser->current_line;
    markerLine("ENTITY");
    infoLine("NAME", $name);
    infoLine("VALUE", $value);
    infoLine("PUBLIC", $pubid);
    infoLine("SYSTEM", $sysid);
    infoLine("NDATA", $ndata);
    infoLine("ISPARAM", $isParam);
} # entityDclHandler

# elementDclHandler: Element declaration.
sub elementDclHandler {
    my ($parser, $name, $model) = @_;
    $curline = $parser->current_line;
    markerLine("ELEMENT");
    infoLine("NAME", $name);
    infoLine("MODEL", $model);
} # sub elementDclHandler

# attlistDclHandler: Attribute declaration.
sub attlistDclHandler {
    my ($parser, $name, $aname, $avalue, $adft, $afix) = @_;
    $curline = $parser->current_line;
    markerLine("ATTLIST");
    infoLine("ELEMENT", $name);
    infoLine("NAME", $aname);
    infoLine("VALUE", $avalue);
    infoLine("DEFAULT", $adft);
    infoLine("FIXED", $afix);
} # attlistDclHandler

# xmlDclHandler: XML declaration.
sub xmlDclHandler {
    my ($parser, $xmlVersion, $iencoding, $standalone) = @_;
    $curline = $parser->current_line;
    markerLine("XMLDCL");
    infoLine("VERSION", $xmlVersion);
    infoLine("ENCODING", $iencoding);
    infoLine("STANDALONE", $standalone);
} # xmlDclHandler

# defaultHandler: Anything without a more specific handler. Suppressed
# inside the DOCTYPE unless the DTD was requested.
sub defaultHandler {
    return unless ($includeDTD || $pastDoctype);
    my ($parser, @pieces) = @_;
    $curline = $parser->current_line;
    markerLine("DEFAULT");
    for my $piece (@pieces) {
        setupLine($piece);
    }
}


###############################################################################
# Output helpers. All three print to the currently selected output handle
# and print nothing while $curline is before $startat.
#
# The original definitions carried an empty prototype "()" although every
# call passes arguments; the prototypes are dropped.

# markerLine: Print "*EVENT*" on a line of its own. Under -lnum the flag is
# padded to 8 columns and followed by a tab and "[line]".
sub markerLine {
    return unless ($curline >= $startat);
    my ($event) = @_;
    my $flag = "*$event*";
    if ($lnum) {
        # "%-8s" pads with spaces on the right and never truncates.
        print sprintf("%-8s", $flag), "\t[$curline]";
    }
    else {
        print $flag;
    }
    print $newline;
}

# infoLine: Print a tab-indented "name=value" line.
sub infoLine {
    return unless ($curline >= $startat);
    my ($name, $value) = @_;
    # Many values are legitimately absent (PUBLIC, SYSTEM, NDATA, ...).
    # Undef prints as an empty string either way; mapping it explicitly
    # avoids an "uninitialized value" warning per field under -w.
    $name  = "" unless defined $name;
    $value = "" unless defined $value;
    print "\t$name=$value$newline";
}

# setupLine: Print one piece of event text. Callers in this file pass
# either (text) or (text, "|", "|").
#
# NOTE(review): the original body is missing from SOURCE from the loop
# condition onward ("for (my $i=0; $i" is the last surviving text).
# Everything after that point is a reconstruction based on the call sites
# and on the -ent usage text ("Turn whitespace chars other than space into
# entities"). The entity format (decimal "&#N;"), the leading tab, and the
# reading of the 2nd/3rd arguments as opening/closing delimiters are
# assumptions -- confirm against the original file.
sub setupLine {
    return unless ($curline >= $startat);
    my ($text, $lead, $trail) = @_;
    $text  = "" unless defined $text;
    $lead  = "" unless defined $lead;
    $trail = "" unless defined $trail;
    if ($ent) {
        my $buf = "";
        for (my $i = 0; $i < length($text); $i++) {
            my $c = substr($text, $i, 1);
            if ($c ne " " && $c =~ /\s/) {
                $buf .= "&#" . ord($c) . ";";
            }
            else {
                $buf .= $c;
            }
        }
        $text = $buf;
    }
    print "\t$lead$text$trail$newline";
}


###############################################################################
# showUsage: Write the help text to stderr.
#
# NOTE(review): the head of this sub is missing from SOURCE; the surviving
# text begins at " Use XML catalog 'name'". The "sub"/"warn" framing, the
# text above "Options:", and the "-catalog name" item heading are
# reconstructed -- confirm against the original file. The -outlineends
# entry is an addition: the option parser accepts that option but the
# surviving text did not describe it.
sub showUsage {
    warn "
Usage: saxtrace [options] file(s)

List the SAX events that the parser reports for each XML input file.

Options:

=item * B<-catalog name>

Use XML catalog 'name' (default = $default_catalog).

=item * B<-dtd>

Include the DTD (with parameter entities expanded!)

=item * B<-ent>

Turn whitespace chars other than space into entities.

=item * B<-fqgi>

Show entire FQGI, not just current element type.

=item * B<-lnum>

Show line number for each event.

=item * B<-nows>

Drop whitespace nodes.

=item * B<-noparam>

Don't parse parameter entities or DTD.

=item * B<-out ext>

Results to [filename].ext (instead of stdout)

=item * B<-outlineends t>

Output line-ends: U (LF, the default), M (CR), or D (CR LF).

=item * B<-q>

Suppress most messages.

=item * B<-startat n>

Don't print anything until line n.

=item * B<-v>

Add more messages.

=item * B<-version>

Display version info and exit

Notes:

If the doctype does not point directly to the DTD, you'll either need a
catalog, or the -noparam, to make the parser happy.
";
}