#!/usr/bin/perl -w # # Written by Steven J. DeRose.. # # A basic XML parser, started 2:13 2008-08-14. Working in 3 hours 50 minutes, except # for recursive and external entities, and entities inside attribute values. # 2008-08-18 sjd: Add real entity manager for external entities (~1hr more). # 2008-09-04 sjd: Catch/report more WF errors. Split out sub doAttlist. # Parse inside of XML dcl. Fix bugs in dumpEntityStack. Catch recursive ents # Ignore non-first dcls of same entity name. Parse NDATA values. # Finish -dirs option. Keep internal ents on open ent stack. ~90min. # # To do: # Expand entities within attr values. # No refs to ext. entities within attrs. # Escape '\' in text-node output, because of '\n'? # Read fixed-size buffers, not lines. # Check that PEs only occur between dcls. use strict; use Getopt::Long; my $version = "2010-09-12"; my @dirs = (); # Places to look for external entity files my $dtd = 1; # Show DTD events? my $quiet = 0; my $verbose = 0; my $ws = 0; # Keep whitespace-only nodes? ############################################################################### Getopt::Long::Configure ("ignore_case"); my $result = GetOptions( "dir=s" => \@dirs, "dtd!" => \$dtd, "h|help" => sub { system "perldoc xmlparse"; exit; }, "q!" => \$quiet, "v+" => \$verbose, "version" => sub { die "Version of $version by Steven J. DeRose.\n"; }, "ws!" => \$ws ); if (scalar @dirs == 0) { push @dirs, "./"; } ($result) || die "Bad options.\n"; ############################################################################### # List of defined constructs (markup declarations will add to this) my %elements = (); # Defined elements (later) my %attlists = (); # Defined attlists, by element (later) my %ents = ( "lt" => "<", # Defined internal entities "gt" => ">", "amp" => "&", # "lsqb" => "[", # "rsqb" => "]", "apos" => "'", "quo" => "\"" ); my %systemEnts = (); # General external entities my %pents = (); # Defined parameter entities (later) my %systemPents = (); # Parameter external entities # Parse state my @nameSpaceStack = (); # Active namespace lists, in sync with tagStack my @tagStack = (); # open element types my @tagStartLocs = (); # entity/loc where each was opened my $seenXmlDcl = 0; # Have we seen it yet? my $standalone = 0; my $iencoding = ""; my $xmlVersion = ""; my $inDoctype = 0; # In middle of parsing Doctype declaration? my $seenDoctype = 0; # Past start of Doctype declaration? my $pastDoctype = 0; # Past end of Doctype declaration? my $textBuf = 0; # Accumulated text node contents (-f $ARGV[0]) || die "Cannot find main document file at '$ARGV[0]'.\n"; dumpEvent("INIT",""); defineEntity("-document-", $ARGV[0], "SYSTEM"); openExternalEntity("-document-", $ARGV[0]); (defined fillBuf()) || err("Unable to load beginning of main document."); skipWhiteSpace(); my $c = ""; while ($c = curChar()) { vwarn2("Main loop, c is '$c'"); if ($c eq '<') { doTag(); } elsif ($c eq '&') { doEnt(); } elsif ($inDoctype && $c eq '%') { doParameterEntityRef(); } elsif ($inDoctype && $c eq ']' && nextChar() eq '>') { nextChar(); dumpEvent("DOCTYPEFIN",""); $inDoctype = 0; $pastDoctype = 1; } # Is ']]>' outside a marked section, a WF error? I forget.... else { doChar(); } } if (scalar(@tagStack)>0) { for (my $i=scalar(@tagStack); $i>=0; $i--) { print "Open element '$tagStack[$i]', started at $tagStartLocs[$i].\n"; } err("Unclosed element(s) at EOF."); } dumpEvent("FINAL",""); ($quiet) || vwarn("Done."); exit; ############################################################################### sub doTag { (curChar() eq "<") || err("doTag called while not at '<', but " . curChar()); nextChar(); vwarn2("doTag: char after '<' is '" . curChar() . "'"); if (curChar() eq '!') { doDcl(); } elsif (curChar() eq '?') { doPI(); } elsif (curChar() eq '/') { doEnd(); } else { doStart(); } } sub doDcl { (curChar() eq "!") || err("doDcl called while not at '!', but " . curChar()); nextChar(); # Discard the bang vwarn2("In doDcl, buf is: " . restOfBuf()); if (curChar() eq "-") { (nextChar() eq "-") || err("Comment(?) starts with '")); } elsif (curChar() eq "[" && restOfBuf() =~ /^\[CDATA\[/) { # Marked section for (my $i=0; $i")); } else { my $dclType = readName(); my $paramEntity = 0; vwarn2("Dcl type '$dclType', buf: " . restOfBuf()); skipWhiteSpace(); if (curChar() eq "%") { ($dclType eq "ENTITY") || err("Invalid '%'"); $paramEntity = 1; nextChar(); skipWhiteSpace(); } my $name = readName(); vwarn2("Declared item name '$name'"); skipWhiteSpace(); if ($dclType eq "ELEMENT") { dumpEvent("ELEMENT", $name); } elsif ($dclType eq "ENTITY") { my $idType = ""; skipWhiteSpace(); if (curChar() !~ /['"]/) { $idType= readName(); ($idType eq "PUBLIC" || $idType eq "SYSTEM") || err("Keyword not PUBLIC or SYSTEM for entity '$name'."); skipWhiteSpace(); } if ($idType eq "PUBLIC") { my $publicId = readAttrValue(); skipWhiteSpace(); } my $replacementText = readAttrValue("allowLT"); my $notationName = ""; if ($idType ne "") { skipWhiteSpace(); if (curChar() eq "N") { my $ndata = readName(); ($ndata ne "" && $ndata ne "NDATA") && err("Non-NDATA keyword '$ndata' in ENTITY dcl" . " for '$name'."); skipWhiteSpace(); $notationName = readName(); } } my $info = "$name=$replacementText"; if ($notationName) { $info .= "{$notationName}"; } if ($paramEntity ne 0) { dumpEvent("ENTITY_PAR", $info); defineParameterEntity($name, $replacementText); } else { defineEntity($name, $replacementText, $idType); if ($idType ne "") { dumpEvent("ENTITY_EXT", $info); } else { dumpEvent("ENTITY_INT", $info); } } } elsif ($dclType eq "ATTLIST") { dumpEvent("ATTLIST", $name); } elsif ($dclType eq "NOTATION") { dumpEvent("NOTATION", $name); } elsif ($dclType eq "DOCTYPE") { if ($seenDoctype) { err("Duplicate DOCTYPE, first was at line $seenDoctype"); } $seenDoctype = 1; $inDoctype = 1; dumpEvent("DOCTYPE", $name); readToAndDiscard("["); return; # Don't scan for '>', must parse subset.... } else { err("Uknown markup declaration type: '$dclType'."); } readToAndDiscard(">"); # fails on embedded quoted gt } } # doDcl sub doParameterEntityRef { (curChar() eq "%") || err("doParameterEntityRef called when not at '%'."); nextChar(); my $name = readName(); (curChar() eq ";") || err("Parameter entity reference not ended with ';', but '" . curChar() . "'."); nextChar(); # Nuke the semicolon err("Parameter entities references are not yet supported. Skipping '$name'"); } sub doPI { (curChar() eq "?") || err("doPI called while not at '?', but " . curChar()); nextChar(); my $target = readName(); if ($target ne "xml") { vwarn2("In doPI, target is '$target'"); dumpEvent("PITARGET", $target); dumpEvent("PIVALUE", readToAndDiscard("?>")); } else { vwarn2("In doPI for XML dcl."); ($seenXmlDcl) && err("Duplicate XML dcl, previous was at line $seenXmlDcl"); my %xmlDclAttrs = doAttlist("XMLDCL"); for my $xa (keys %xmlDclAttrs) { if ($xa eq "version") { $xmlVersion = $xmlDclAttrs{$xa}; ($xmlVersion eq "1.0") || err("XML version must be '1.0'."); } elsif ($xa eq "standalone") { $standalone = $xmlDclAttrs{$xa}; if ($standalone eq "") { $standalone = "no"; } ($standalone =~ m/^(yes|no)$/) || err("Value for 'standalone' must be 'yes' or 'no'."); } elsif ($xa eq "encoding") { $iencoding = $xmlDclAttrs{"encoding"} } else { err("Unknown pseudo-attribute '$xa' in XML dcl."); } } readToAndDiscard("?>"); dumpEvent("XMLDCL"); $seenXmlDcl = 1; } } # doPI sub doEnd { (curChar() eq "/") || err("doTag called while not at '/', but " . curChar()); nextChar(); my $name = readName(); skipWhiteSpace(); if (curChar() ne ">") { err("Can't find '>' in end-tag for '$name'."); } nextChar(); ($name eq $tagStack[-1]) || err("Close-tag '$name' does not match expected '$tagStack[-1]'"); pop @tagStack; pop @tagStartLocs; dumpEvent("END", $name); } # doEnd sub doStart { my $empty = 0; my $name = readName(); push @tagStack, $name; push @tagStartLocs, currentEntityLoc(); dumpEvent("START", $name); doAttlist(); if (curChar() eq "/") { $empty = 1; nextChar(); } if (curChar() eq ">") { nextChar(); } if ($empty) { pop @tagStack; dumpEvent("END", $name); # Empty element tag, so issue end-tag event too. } } # doStart # Upgrade this sub so it can be used for the XML declaration as well sub doAttlist { my $forXmlDcl = ($_[0] && $_[0] eq "XMLDCL"); my $done = 0; my %attlist = (); while (!$done) { skipWhiteSpace(); if (curChar() !~ m/\w/) { return; } vwarn2("Trying for aname, buf: " . restOfBuf()); my $aname = readName(); vwarn2(" Got aname '$aname'."); skipWhiteSpace(); (curChar() eq "=") || err("Missing '=' after attr name '$aname'."); nextChar(); skipWhiteSpace(); my $avalue = readAttrValue(); if (defined $attlist{$aname}) { err("Duplicate attribute '$aname'."); } $attlist{$aname} = $avalue; ($forXmlDcl) || dumpEvent("ATTR", "$aname=$avalue"); } # while return(\%attlist); } # doAttlist sub doEnt { # Deal with an entity or numeric character reference. my $numeric = my $hex = 0; if (curChar() eq "#") { # numeric $numeric = 1; nextChar(); if (curChar() =~ /[xX]/) { $hex = 1; nextChar(); } } my $gotSemi = 0; my $name = ""; while (my $c = nextChar()) { if ($c eq ";") { $gotSemi = 1; last; } $name .= $c; } $gotSemi || die "Unterminated entity/char reference '$name'"; if ($numeric) { my $n = 0; if ($hex) { ($name =~ /[0-9a-f]+/i) || err("Invalid char in hex char ref '$name'"); $n = hex($name); } else { ($name =~ /[0-9]+/i) || err("Invalid char in dec char ref '$name'"); $n = $name - 0; } if ($n <= 0 || $n > 65535) { err("Char ref out of numeric range: $n"); } doChar(chr($n)); } else { # named my $e; if ($e = $ents{$name}) { openInternalEntity($name,$e); } elsif ($e = $systemEnts{$name}) { openExternalEntity($name,$e); } else { # only a WF error if standalone='yes' err("Undefined entity name '$name'.\n"); } } } # doEnt sub doChar { my $c = curChar(); if ($verbose > 1) { if (ord($c) < 33) { dumpEvent("CHAR", "#" . ord($c)); } else { dumpEvent("CHAR", "'$c'"); } } else { $textBuf .= $c; } nextChar(); } sub dumpTextBuf { if ($textBuf eq "") { return; } if (!$dtd && !$pastDoctype) { $textBuf=""; return; } if ($textBuf =~ /^\s*$/ && ($inDoctype || !$ws)) { $textBuf=""; return; } $textBuf =~ s/\n/\\n/g; print "TEXT $textBuf\n"; $textBuf = ""; } sub dumpEvent { my ($label, $content) = @_; if (!$content) { $content = ""; } if (!$dtd && !$pastDoctype) { return; } if ($textBuf ne "") { dumpTextBuf(); } print sprintf("%-10s %s\n", $label, $content); } ############################################################################### ################## ENTITY / INPUT BUFFER MGMT sub defineEntity { vwarn("Defining entity $_[0], $_[1], $_[2].\n"); my $ename = $_[0]; ($ename =~ /^(lt|gt|amp|quo|apos)$/) && err("Cannot redefine pre-defined entity '$ename'."); if (defined $ents{$ename}) { warn "Warning: More than one dcl for entity '$ename'.\n"; return; } if (defined $_[2] && $_[2] ne "") { # SYSTEM $systemEnts{$ename} = $_[1]; } else { $ents{$ename} = $_[1]; } } sub defineParameterEntity { my $ename = $_[0]; if (defined $ents{$ename}) { warn "Warning: More than one dcl for parameter entity '$ename'.\n"; return; } if (defined $_[2] && $_[2] ne "") { # SYSTEM $systemPents{$ename} = $_[1]; } else { $pents{$ename} = $_[1]; } } BEGIN { # Stuff needed per open entity: my @oeHandle = (); # A stack of open entity file handles my @oeName = (); # Name of the entity my @oeFilename = (); # Corresponding file name my @oeLinenum = (); # Current line-number in the entity my @oeTagDepth = (); # How deeply nested are we at start of ent? my @l = (); # Current record of input file my @cursor = (); # Current loc in the input record sub curChar { # Return the character the cursor is on, without consuming it if (!scalar(@cursor)) { # warn "Nothing on open entity stack!\n"; return(""); } if ($cursor[-1] >= length($l[-1])) { fillBuf() || return(undef); } return(substr($l[-1],$cursor[-1],1)); } sub consumeChar { # Return the character the cursor is on, AND consume it my $c = curChar(); nextChar(); return($c); } sub nextChar { # Move to and return, the next character. $cursor[-1]++; return(curChar()); } sub pushBackChar { # not used (--$cursor[-1] >= 0) || err("Cannot push back another character, buf is: '" . restOfBuf() . "'"); } sub readName { # Read an XML NAME vwarn2("readName entered."); my $buf = ""; if (curChar() !~ /\w/) { err("Invalid name-start character '" . curChar() . "'"); } while (curChar() =~ /[-.:_\w\d]/) { $buf .= consumeChar(); } return($buf); } sub skipWhiteSpace { my $buf = ""; while (curChar() =~ /\s/) { $buf .= consumeChar(); } vwarn2("After skipWhiteSpace, curChar is '" . curChar() . "'"); return($buf); } # This needs to add handling for entity refs inside! sub readAttrValue { # Quoted either way # If we're actually parsing an entity value literal, "<" is ok. my $allowLT = (defined $_[0] && $_[0] eq "allowLT"); vwarn2("In readAttrValue, buf: " . restOfBuf()); my $origline = $oeLinenum[-1]; my $buf = ""; skipWhiteSpace(); if (curChar() !~ /["']/) { err("Invalid attribute-value start character '" . curChar() . "'"); } my $qtype = curChar(); while (nextChar() ne $qtype) { my $c = curChar(); ($c eq "<" && !$allowLT) && err("Less-than sign found in attribute value."); if ($c eq "&") { warn("Warning: Entity ref in attribute, not yet supported.\n"); dumpEntityStack(); } $buf .= $c; } nextChar() || err("EOF in middle of attribute value started on line $origline"); return($buf); } sub readToAndDiscard { # End-string cannot be broken across line boundary vwarn2("In readToAndDiscard for '$_[0]'"); my $origline = $oeLinenum[-1]; my $buf = ""; my $hit = ""; $l[-1] = substr($l[-1],$cursor[-1]); $cursor[-1] = 0; while (($hit = index($l[-1],$_[0])) < 0) { $buf .= $l[-1]; fillBuf(); } $buf .= substr($l[-1],0,$hit); $cursor[-1] = $hit + length($_[0]); curChar() || err("EOF in middle of scan for '$_[0]', started on line $origline"); return($buf); } sub restOfBuf { return(substr($l[-1],$cursor[-1])); } sub fillBuf { vwarn2("In fillBuf, entity stack depth " . scalar(@oeFilename)); while (scalar @oeFilename > 0) { local *CURFILE = $oeHandle[-1]; my $rec = ""; if (defined($rec = )) { vwarn2("Read line: $rec"); $l[-1] = $rec; $oeLinenum[-1]++; $cursor[-1] = 0; return $l[-1]; } closeEntity(); } return(undef); # EOF } # NOTE: Non-stack way throws off offsets. sub openInternalEntity { vwarn("Opening internal entity '$_[0]'.\n"); for (my $i=0; $i0) && warn " Referenced at line $oeLinenum[-1] of entity $oeName[-1].\n"; return; } local *FILE; open(FILE, "<$sysid") || err("Failed to open file '$sysid' for entity '$ename'."); push @oeHandle,*FILE; push @oeName, $ename; push @oeFilename, $sysid; push @oeTagDepth, scalar(@tagStack); push @l, ""; push @cursor, -1; push @oeLinenum, 0; } sub closeEntity { vwarn("Closing entity '$oeName[-1]'.\n"); close $oeHandle[-1]; pop @oeName; pop @oeHandle; pop @oeFilename; ($oeTagDepth[-1] != scalar(@tagStack)-1) && # ??? err("Entity '$oeName[-1]' is not a balanced sub-tree. " . "It started at depth $oeTagDepth[-1], but ends " . "at depth " . scalar(@tagStack) . "."); pop @oeTagDepth; pop @l; pop @cursor; pop @oeLinenum; } sub currentEntityLoc { return($oeName[-1] . " + " . $oeLinenum[-1]); } sub dumpEntityStack { for (my $i=scalar @oeFilename - 1; $i>=0; $i--) { print sprintf(" %2d: Entity %-12s line %6d, file '%s'\n", $i, $oeName[$i], $oeLinenum[$i], $oeFilename[$i]); } } } # END ############################################################################### sub vwarn { # Basic tracing, invoke with -v ($verbose) || return; print "Warning: $_[0]\n"; } sub vwarn2 { # Tons of detail, invoke with -v -v ($verbose > 1) || return; print "Warning: $_[0]\n"; } sub err { warn "FATAL ERROR: $_[0]\n"; dumpEntityStack(); exit; } ############################################################################### # sub showUsage { warn " =head1 Usage xmlparser.pl [options] xml-file Parses an XML file. Written in less than 4 hours after a talk whose speaker said that writing a 'basic XML parser' is hard. =head1 Options =over =item * B<-dir path> Append a directory to be seached for external entities (repeatable). If none are specified, uses current dir. =item * B<-dtd> Include DTD events (default, use -nodtd to suppress). =item * B<-q> Suppress most messages. =item * B<-v> Add more messages (repeatable). =item * B<-version> Show version info and exit. =item * B<-ws> Keep whitespace-only text nodes (normally dropped). =back =head1 Notes Each SAX-like event is reported on a single line, with 10 columns for the event type, then a space, then the event information. Attributes are given as separate events immediately following their start tag's event. Empty element tags are reported as a start and end (as in SAX). Because each event is on a single line, newline characters in event information (say, TEXT events), are displayed as '\\n'. =head1 Known bugs and limitations Doesn't expand entity references inside attribute values (but reports them) Doesn't report full data from inside markup declarations. Quoted '>' inside markup dcls may confuse the parser. Reads a line at a time, so a big file with no breaks could blow out memory. =head1 Ownership This work by Steven J. DeRose is licensed under a Creative Commons Attribution-Share Alike 3.0 Unported License. For further information on this license, see http://creativecommons.org/licenses/by-sa/3.0/. The author's present email is sderose at acm.org. For the most recent version, see http://www.derose.net/steve/utilities/. =cut "; }