#!/usr/bin/perl -w # # FakeParser.pm # # Written 2011-03-11 by Steven J. DeRose, sderose@acm.org. # 2011-03-14 sjd: lookupEntity(). Empty elements generate ETAG events. Help. # 2011-03-17 sjd: Start adding SAX API. Entities in attributes. Built-ins. # 2011-05-13 sjd: More work on SAx API. Add isXmlName, addentity, reset, # getText, openElement, closeElement, getDepth, getFQGI, getCurrentType, # getCurrentLang, addEntity. Add setXmlEntities. Return array instead of # packed string from pull_more. Generalize pendingEvents. # Break out parseAttributeString(); use HTML::Entities. # 2011-05-25 sjd: getCurrentNsList(). A few bugs with pending events. # Add setNoNest(). Big CDATA MS's; DOCTYPE, XML Decl. Better queueing. # 2011-06-02 sjd: Don't return the quotes on attribute values. Add support # for value-only attributes like HTML border. Improve DOCTYPE parsing. # Support -returnForm for pending events. # 2012-05-10 sjd: Re-org a little and break down by first char in pull_more. # Work on parseToDom methods. Ditch 'prefixForm' option. Move options # into {options} hash. Break out ElementManager as internal package. # Support 'Default' event-handler. Finish attribute defaulting. # 2012-05-14ff sjd: Fix bug in omitted end tags. Testing. Rename NotationDecl # to Notation. Clean up markup dcl handling. Simplify ElementManager. # Implement attrsInPis. # 2012-05-18ff sjd: Syncing API to XML::Parser. Integrating temp EntityManager. # Add 'draconian' feature. Use sjdUtils to expand entities. Debugging pull. # 2013-01-21 sjd: Add cantRecurse notion. # Move ElementManager package to separate file. # # To do: # pull_more always gets buffer '8'. # Switch to EntityManager.pm. # Finish parse_ and pull_ interfaces (see TabularFormats). # Tweak comment regex to avoid internal "--" # Catch and return more WF errors # (see -help section and Checklists/wellFormednessErrors.txt) # Make parse...toDOM a package? # use strict; use HTML::Entities; use XML::DOM; use sjdUtils; use ElementManager; #use EntityManager; our $VERSION = "1.30"; ############################################################################### ############################################################################### ############################################################################### # Trivial, temporary version of Entity Manager package. # Doesn't know about %entities, catalogs, paths, resolver callbacks, notations, # char-level input, pushback, external entity refs, location reports, zips.... # # ### Switch to EntityManager.pm. # package EntityManager; sub new { my $class = shift @_; my $self = { theText => "", current_file => undef, FH => undef, curLine => 1, curCharInLine => 1, curCharInSource => 0, }; bless $self, $class; } sub reset { my ($self, $s) = @_; $self->{theText} = ""; $self->{curCharInLine} = 1; $self->{curLine} = 1; $self->{curCharInSource} = 0; } sub pushback { my ($self, $s) = @_; $self->{theText} = $s . $self->{theText}; (my $copy = $s) =~ s/[^\n]+//gs; $self->{curCharInSource} -= length($s); $self->{curLine} -= length($copy); $self->{curCharInLine} = 1; # a guess... } sub addText { my ($self, $s) = @_; $self->{theText} .= $s; } sub addFile { my ($self, $path) = @_; open(F, "<$path") || return(0); while (my $rec = ) { $self->{theText} .= $rec; } close(F); return(1); } sub readLine { my ($self) = @_; my $rawLine = ""; if (!$self->{theText}) { my $fh = $self->{FH}; if ($self->{FH}) { $rawLine = <$fh>; return($rawLine); } else { return(undef); # EOD } } else { my $eol = index($self->{theText}, "\n"); if ($eol>=0) { $rawLine = substr($self->{theText}, 0, $eol+1); $self->{theText} = substr($self->{theText}, $eol+1); } else { $rawLine = $self->{theText}; $self->{theText} = ""; } } $self->{curCharInSource} += length($rawLine); $self->{curLine}++; $self->{curCharInLine} = 1; return($rawLine); } sub getLinesUpto { # Typically scanning to ">", "?>", etc) my ($self, $delim) = @_; my $buf = ""; #$self->{theText}; while (defined(my $line = $self->readLine())) { #warn " getLinesUpto($delim): '$line'\n"; $buf .= $line; (index($line, $delim) >= 0) && last; } #$buf = $self->expandEntities($buf); #warn " expanded to: '$buf'\n"; return($buf); } sub getExpandedLine { my ($self) = @_; return($self->expandEntities($self->readLine())); } # Temp: sub expandEntities { my ($self, $s) = @_; return(sjdUtils::expandXml($s)); } # End of EntityManager Package ############################################################################### ############################################################################### ############################################################################### # Main package # package FakeParser; my $xname = "[_\\w][-_.:\\w\\d]*"; # Regex to match XML NAME (approx) my $qs = "\".*?\"|'.*?'"; # Regex to match quoted string my %dclEvents = ( # Reserved XML markup dcl names "ELEMENT" => "Element", "ATTLIST" => "Attlist", "ENTITY" => "Entity", "NOTATION" => "Notation", # "DOCTYPE" => "Doctype", # (handled separately) ); my $dclName = join("|",keys(%dclEvents)); # Same event names as CPAN XML::Parser, except additions where marked. # (see http://search.cpan.org/~msergeant/XML-Parser-2.36/Parser.pm) # my %eventNames = ( # Based on XML::Parser; see also tupleSets/, YMLParser. "Init" => 1, "Final" => 1, "Start" => 1, "End" => 1, "Char" => 1, "Proc" => 1, "Comment" => 1, "CdataStart" => 1, "CdataEnd" => 1, "Default" => 1, "Unparsed" => 1, "ExternEnt" => 1, "ExternEntFin" => 1, "Element" => 1, # Dcl "Attlist" => 1, # Dcl -- once per *attribute*! "Entity" => 1, # Dcl "Notation" => 1, # Dcl "Doctype" => 1, "DoctypeFin" => 1, "XMLDecl" => 1, "Attr" => 1, # Extension "AttrFin" => 1, # Extension "ProcAttr" => 1, # Extension "ProcAttrFin" => 1, # Extension "ERROR" => 1, # Extension ); my %theirOptions = ( "Style" => undef, # "Debug" "Subs" "Tree" "Objects" "Stream" # Handlers (separate) "Pkg" => undef, # Calling package "ErrorContext" => undef, # Number of lines of context to show "ProtocolEncoding" => "", # UTF-8 ISO-8859-1 UTF-16 US-ASCII "Namespaces" => 0, # Do namespace processing "NoExpand" => 0, # call Default handler for ents in conent "Stream_Delimiter" => undef, # Magic line, when read counts as EOF "ParseParamEnt" => undef, # If standalong and this, do it "NoLWP" => undef, # Forces use of file-based extern ent handler "Non-Expat-Options" => undef, # Hash of other options for subclassers ); sub new { my $class = shift @_; my $self = { version => "2013-01-21", options => { quiet => 0, verbose => 0, draconian => 0, # XML draconian error handling attrEvents => 0, # Return separate event per attribute attrsInPis => 0, # Parse inside of PIs like attrs normalize => 0, # Normalize white space useHtmlEntities => 0, # Support HTML named entities useXmlEntities => 1, # Support the 5 XML built-ins expandEntities => 1, # Handle entities transparently coalesce => 0, # Never return adjacent text nodes }, theirOptions => \%theirOptions, # Schema information and Entity Manager handlers => {}, # Event callbacks for 'push' parsing attrDefaults => {}, # Dft values for attrs by gi and name noNest => {}, # {child} => disallowed parent(s) entManager => new EntityManager(), # Parser element context and other state es => new ElementManager(), inCDATA => 0, # Are we inside a CDATA M S? inDoctype => 0, # Are we inside the DOCTYPE? seenTag => 0, # Have we seen a tag yet? current_file => "", # Where we are current_line => 1, pendingEvents => [], # Events waiting to be returned died => 0, # 'draconian' set, and seen a WF error }; # self bless $self, $class; $self->queueEvent("Init"); while (scalar(@_) > 1) { my $name = shift @_; my $value = shift @_; if ($name eq "Handlers") { $self->setHandlers($value); } elsif (defined $theirOptions{$name}) { $self->setHandlers($value); } else { die "FakeParser: Unknown constructor option '$name'.\n"; } } return $self; } # new FakeParser sub reset { my ($self) = @_; $self->{es} = new ElementManager(); $self->{entManager} = new EntityManager(); $self->clearPendingEvents(); $self->queueEvent("Init"); $self->{inCDATA} = 0; $self->{inDoctype} = 0; $self->{seenTag} = 0; $self->{current_file} = ""; $self->{current_line} = 1; } sub setOption { my ($self, $name, $value) = @_; (defined $self->{options}->{$name}) || die "Bad option name '$name'\n"; $self->{options}->{$name} = $value; } sub getOption { my ($self, $name) = @_; return($self->{options}->{$name}); } sub setHandlers { my ($self, $hhRef) = @_; my %hh = %$hhRef; for my $handlerName (keys %hh) { if (!defined $eventNames{$handlerName}) { die "FakeParser: Unknown handler '$handlerName'.\n"; } $self->{handlers}->{$handlerName} = $hh{$handlerName}; } } sub resetHandlers { my ($self) = @_; $self->{handlers} = {}; } # Maintain a list of what parent types must be closed before opening # a given element type. Parents are just a tab-separated list. # sub setNoNest { my ($self, $parent, $child) = @_; my $curList = $self->{noNest}->{$child}; $self->noNest->{$child} = ($curList) ? "$curList\t$parent" : $parent; } ############################################################################### ############################################################################### # Variations on parsing, first to produce a DOM. # sub parsefiletoDOM { my ($self, $file) = @_; $self->{current_file} = $file; $self->addFile($file); my $theDOM = $self->parsestringtoDOM(); return($theDOM); } sub parsestringtoDOM { my ($self, $text) = @_; my $theDoc = new XML::DOM::Document(); my $curNode = undef; my $newNode = undef; my $unhandled = 0; $self->pull_start(); while (my @args = @{$self->pull_more()}) { my $eType = shift @args; if ($eType eq "XMLDecl") { # XMLDecl # $newNode = $theDoc->createXMLDecl(shift @args); # $theDoc->setXMLDecl($newNode); } elsif ($eType eq "Doctype") { # Doctype # $newNode = $theDoc->createDocumentType(shift @args); # $theDoc->setDoctype($newNode); } elsif ($eType eq "Start") { # Start $newNode = $theDoc->createElement(shift @args); while (@args) { my $name = shift; my $value = shift; $newNode->setAttribute($name,$value); } if ($curNode) { $curNode->appendChild($newNode);} else { $theDoc->setRoot($newNode); } $curNode = $newNode; } elsif ($eType eq "Attr") { # Attr my $name = shift; my $value = shift; $curNode->setAttribute($name,$value); } elsif ($eType eq "End") { # End $curNode = $curNode->getParent(); } elsif ($eType eq "Char") { # Char $newNode = $theDoc->createTextNode(shift @args); $curNode->appendChild($newNode); } elsif ($eType eq "Proc") { # Proc my $tgt = shift @args; my $txt = shift @args; $newNode = $theDoc->createProcessingInstruction($tgt,$txt); $curNode->appendChild($newNode); } elsif ($eType eq "Comment") { # Comment $newNode = $theDoc->createComment(shift @args); $curNode->appendChild($newNode); } elsif ($eType eq "Unparsed") { # Unparsed $newNode = $theDoc->createEntityReference(shift @args); $curNode->appendChild($newNode); } elsif ($eType eq "ExternEnt") { # ExternEnt $newNode = $theDoc->createEntityReference(shift @args); $curNode->appendChild($newNode); } elsif ($eType eq "ERROR") { # ERROR if ($self->{died}) { return(undef); } } elsif (defined $eventNames{$eType}) { $unhandled++; # "AttrFin" "Init" "Final" "CdataStart" "CdataEnd" "ExternEntFin" # "Default" "Element" "Attlist" "Entity" "Notation" # "DoctypeFin" "ERROR" } else { die "Unknown event type '$eType'"; } } # while pull_more return($theDoc); } # parsestringtoDOM ############################################################################### ############################################################################### # More variations on parsing, using SAX callback interface # (cf SAX and TabularFormats.pm). # sub parsefile { my ($self, $file, $optionsHash) = @_; #warn "FakeParser::parseFile for '$file'\n"; #$self->{entManager}->addFile($file); open(my $fh, "<$file") || return(undef); $self->parse($fh, $optionsHash); close($fh); return(1); # (or result of Final() handler) } sub parsestring { my ($self, $s, $optionsHash) = @_; return($self->parse($s, $optionsHash)); } sub parse { my ($self, $fhOrString, $optionsHash) = @_; my $nEvents = 0; #warn "ref of that: '" . ref($fhOrString) . "'\n"; if (ref($fhOrString) eq "IO" || ref($fhOrString) eq "GLOB") { while (defined(my $rec = <$fhOrString>)) { #warn "parse: from file, addText('$rec')\n"; $self->{entManager}->addText($rec); } } else { warn "parse: from string, addText('$fhOrString')\n"; $self->{entManager}->addText($fhOrString); } $self->parse_run($optionsHash); } sub parse_run { # Run the actual parse, via pull_more. my ($self, $optionsHash) = @_; $self->pull_start(); my $nEvents = 0; while (my $eref = $self->pull_more()) { my @e = @$eref; shift @e; my $type = shift @e; if (defined ($self->{handlers}->{$type})) { $self->{handlers}->{$type}->($self,@e); } elsif (defined ($self->{handlers}->{"Default"})) { $self->{handlers}->{"Default"}->($self,@e); } $nEvents++; ($type eq "Final") && last; if ($self->{died}) { return(undef); } } return($nEvents); } ############################################################################### # Add parse_start/more/done here. # ############################################################################### # Provide a "pull" style parser (which SAX doesn't). # # As we're parsing, we simply push the event(s) we find or infer. # When there are such pending events available, we just return them; only # when they're exhausted, do we go back to actual parsing again. # # Note: We don't do things like push/pop the open element stack, until the # events are actually issued, that way our state is in sync w/ caller. # sub pull_start { my ($self, $optionsHash) = @_; warn "pull_start()\n"; } sub pull_more { my ($self) = @_; if ($self->nPendingEvents()>0) { return($self->shiftEvent()); } # Once there's nothing pending, do real parsing # my $raw .= $self->{entManager}->getLinesUpto(">"); # Meh.... $self->vMsg(1,"pull_more, we got buffer: '$raw'\n"); if (!$raw) { # EOF my $d = $self->{es}->getDepth(); if ($d > 0) { $self->queueEvent("End", $self->{es}->getCurrentType()); } else { $self->queueEvent("Final"); } return($self->shiftEvent()); } my $c1 = substr($raw,0,1); my $lengthUsed = 0; # How much parsed away? if ($self->{inDoctype}) { # IN DOCTYPE if ($raw =~ m/^(\s*\]?\s*>)/) { # DoctypeFin $lengthUsed = length($1); $self->queueEvent("DoctypeFin"); $self->{inDoctype} = 0; } elsif ($raw =~ m/^(\s*%$xname;\s*)/) { # Parameter entity $lengthUsed = length($1); # no-op for now } } elsif ($self->{inCDATA}) { # IN CDATA MS if ($raw =~ m/^(.*?)(]]>)/s) { # CDATA MS END $lengthUsed = length($1.$2); if ($1) { $self->queueEvent("Char", $1); } $self->queueEvent("CdataEnd"); } else { # raw content $lengthUsed = length($raw); $self->queueEvent("Char", $raw); } } elsif ($c1 eq "]" && $raw =~ m/^(]]>)/s) { # CDATA MS END $lengthUsed = length($1); my $msg = "Found ]]> not inside CDATA marked section\n"; $self->queueEvent("ERROR", $msg); $self->queueEvent("Char", $1); } elsif ($c1 eq "<") { # Any "<" $lengthUsed = $self->doPointy($raw); } elsif ($c1 eq "&") { # Any "&" $lengthUsed = $self->doAmpersand($raw); } ################################################## Text # Catch stream delimiter in entity manager? elsif ($raw =~ # EOF m/^([^>&]*\n)(\s*Stream_Delimiter\n)/) { $self->queueEvent("Char", $1); $self->{entManager}->reset(); $self->queueEvent("Final"); } else { # Text $raw =~ m/^([^<&]+)/s; $lengthUsed = (defined $1) ? length($1) : 1; my $txt = substr($raw,0,$lengthUsed); if ($self->{normalize}) { $txt =~ s/\s\s+/ /g; $txt =~ s/^\s+//; $txt =~ s/\s+$//; } $self->queueEvent("Char", $txt); } # Whatever we parsed, we end up here. Consume it and return some event. # if ($lengthUsed <= 0) { die "lengthUsed <= 0!\n"; } $self->{entManager}->pushback(substr($raw, $lengthUsed)); return($self->shiftEvent()); } # pull_more sub pull_done { my ($self) = @_; $self->{DONE} = 1; } ############################################################################### # Handle anything beginning with "<". # Return the number of characters consumed (so caller can updated buf). # sub doPointy { my ($self, $raw) = @_; if (length($raw)<2) { $self->queueEvent("ERROR", "Found '<' at EOF.\n"); $self->queueEvent("Char", "<"); return(1); } my $lengthUsed = 0; my $c2 = substr($raw,1,1); ######################################################### ")/s) { # End tag $lengthUsed = length($1); my $gi = ($2) ? $2:""; my $where = $self->{es}->findOpen($gi); if ($where < 0) { warn "Error: End-tag for '$gi', which is not open. " . "Open elements: " . $self->{es}->getCurrentFQGI() . ".\n"; $self->queueEvent("ERROR", "End-tag for non-open '$gi'"); } else { my $d = $self->{es}->getDepth(); for (my $i=$d-1; $i>=$where; $i--) { my $t = $self->{es}->getCurrentType($i); ($i>$where) && $self->queueEvent("ERROR", "Omitted end tag for '$t'."); $self->queueEvent("End", $t); } } } ######################################################### "queueEvent("CdataStart"); } elsif ($raw =~ m/^()/s) { # COMMENT $lengthUsed = length($1); $self->queueEvent("Comment", $2); } elsif ($raw =~ m/^queueEvent("Doctype", $dtname, $public, $system); if ($bracket) { $self->{inDoctype} = 1; } else { $raw =~ m/^(.*?>)/s; $lengthUsed = length($1); } } elsif ($raw =~ # MARKUP DCLS m/^()/s) { $lengthUsed = length($1); # Should generate one event per *attribute* in an ATTLIST. $self->queueEvent($dclEvents{$1}, $2, $3); } else { # Bad "]+?)/; $lengthUsed = length($1); $self->queueEvent("ERROR", "Unrecognized markup declaration"); } } # ")/si) { # XML DECL $lengthUsed = length($1); if ($self->{seenTag}) { $self->queueEvent( "ERROR", "XML declaration found after seeing tag(s)."); } else { $self->queueEvent("XMLDecl", $2); } } elsif ($raw =~ m/^(<\?(.*?)\?>)/s) { # PROCESSING INSTR $lengthUsed = length($1); my $pi = $2; $pi =~ s/^($xname)\s*//; my $target = ($1) ? ($1) : ""; if ($self->{attrsInPis} && $pi) { $self->queueEvent("Proc", $target, ""); my %attrs = %{$self->parseAttributeString($pi)}; for my $a (keys %attrs) { $self->queueEvent("ProcAttr", $a, $attrs{$a}); } $self->queueEvent("ProcAttrFin", "", ""); } else { $self->queueEvent("Proc", $target, $pi); } } } # "?" ######################################################### "]*)?(\/?>))/s) { # Start/empty tag $lengthUsed = length($1); $self->{seenTag} = 1; my $gi = $2; my $attrString = $3 ? $3:""; my $close = $4; if (my $closeEm = $self->{noNest}->{$gi}) { my $out = $self->{es}->findOutermost($closeEm); my $d = $self->{es}->getDepth(); for (my $i=$d-1; $i>=$out; $i--) { $self->queueEvent("End", $self->{es}->{tagStack}->[$i]); } } my $attrHashRef = $self->parseAttributeString($attrString); if (my $defaults = $self->{attrDefaults}->{gi}) { for my $dAttr (keys(%{$defaults})) { if (!defined $attrHashRef->{$dAttr}) { $attrHashRef->{$dAttr} = $defaults->{$dAttr}; } } } if (!$self->{attrEvents}) { # Start tag my @args = ("Start", $gi, $attrHashRef); $self->queueEvent(@args); } else { $self->queueEvent("Start", $gi); for my $a (sort keys(%{$attrHashRef})) { $self->queueEvent("Attr", $a, $attrHashRef->{$a}); } $self->queueEvent("AttrFin"); } if ($close eq "/>") { # Empty element $self->queueEvent("End", $gi); } } ######################################################### Syntax error else { $lengthUsed = 1; $self->queueEvent("ERROR", "Syntax error after '<'."); $self->queueEvent("Char", "<"); } return($lengthUsed); } # doPointy ############################################################################### # Handle anything beginning with "&". # Return the number of characters consumed (so caller can updated buf). # sub doAmpersand { my ($self, $raw) = @_; my $lengthUsed = 0; if ($raw =~ m/^(&(#|#x)?$xname;)/si) { # Entity/Char Ref $lengthUsed = length($1); if ($self->{options}->{expandEntities}) { $self->queueEvent("Char",$self->{entManager}->expandEntities($2)); } else { $self->queueEvent("Unparsed",$1); } } else { $lengthUsed = 1; $self->queueEvent("ERROR", "Syntax error after '&'."); $self->queueEvent("Char", "&"); } return($lengthUsed); } # doAmpersand ############################################################################### # Pending Events queue # # These are the only methods that should mess with the pendingEvents queue, # or implement side-effects of the events such as openElement/closeElement, # or setting inCDATA (??). # # sub clearPendingEvents { my ($self) = @_; $self->{pendingEvents} = []; } sub nPendingEvents { my ($self) = @_; return(scalar(@{$self->{pendingEvents}})); } sub queueEvent { my $self = shift; my @pe = @_; push @{$self->{pendingEvents}}, \@pe; return unless ($pe[0] eq "ERROR"); $self->vMsg(0,"ERROR: $pe[1]"); if ($self->{draconian}) { my @foo = ("FATAL ERROR"); push @{$self->{pendingEvents}}, \@foo; $self->{died} = 1; } } # This method also adds $self to the front of the returned event, so event # callbacks will have it available like in XML::Parser. # sub shiftEvent { my ($self) = @_; if ($self->nPendingEvents() <= 0) { return(undef); } my $eventRef = shift @{$self->{pendingEvents}}; my @event = @$eventRef; #vMsg(0,"shiftEvent: returning a $event[0].\n"); if ($event[0] eq "End") { my $gi = $event[1]; $self->{es}->closeElement($gi); } elsif ($event[0] eq "Start") { my $gi = $event[1]; my $attrHashRef = pop @event; #warn("shiftEvent: gi '$gi', attrs $attrHashRef\n"; $self->{es}->openElement($gi,$attrHashRef); for my $a (sort keys(%{$attrHashRef})) { push @event, $a; push @event, $attrHashRef->{$a}; } } elsif ($event[0] eq "CdataStart") { $self->{inCDATA} = 1; } elsif ($event[0] eq "CdataEnd") { $self->{inCDATA} = 0; } elsif (defined $eventNames{$event[0]}) { } else { $self->vMsg(0,"Undefined pending event type '$event[0]'.\n Known: (" . join(", ",keys(%eventNames)) . ")\n"); } unshift @event, $self; return(\@event); } # shiftEvent ############################################################################### # Anything from the element type up to "/?>" on start-tags, gets passed # here. We parse and expand the individual attributes. # Can an attribute refer to an external entity? # # Returns: a hash of the attributes. # # Allows unquoted attribute values. # Allow HTML-style bare attribute names? # Is also used for contents of PIs when attrsInPis option is in effect. # sub addAttributeDefault { my ($self, $gi, $attrName, $defaultValue) = @_; if (!defined $self->{attrDefaults}->{$gi}) { $self->{attrDefaults}->{$gi} = {}; } } sub parseAttributeString { my ($self, $attrString) = @_; my $orig = $attrString; my %atHash = (); while ($attrString) { $attrString =~ s/^\s+//; my $an = ""; my $av = ""; # Check for a bare name token attribute (e.g. HTML border) my $n = $attrString =~ s/^($xname)(\s+[^\s=]|[\/>])/$2/; if ($n>0) { $an = $av = $1; } else { $n = ($attrString =~ s/^($xname)\s*=\s*(".*?"|'.*?'|$xname)\s*//); if ($n<=0) { $self->vMsg(0,"Bad attribute syntax in '$attrString'\n"); last; } $an = $1; $av = $2; $av =~ s/^['"]//; $av =~ s/['"]$//; } if (!$an) { $self->vMsg(0,"Missing attribute name in '$orig'\n"); } else { (defined $atHash{$an}) && $self->vMsg(0,"Duplicate attribute '$an'\n"); $atHash{$an} = $self->{entManager}->expandEntities($av); } } return(\%atHash); } # parseAttributeString ############################################################################### # Forwarding to ElementManager package. # sub getDepth { my ($self) = @_; return($self->{es}->getDepth()); } sub getCurrentFQGI { my ($self) = @_; return($self->{es}->getCurrentFQGI()); } ############################################################################### # Forwarding to EntityManager package # sub addText { my ($self, $s) = @_; $self->{entManager}->addText($s); } ############################################################################### ############################################################################### # Utilities # # Return true if the name passed is a legitimate XML NAME. # (Perl \w is not exactly the same as needed for XML NAMEs, but it's close). # sub isXmlName { my ($self, $name) = @_; return(($name =~ m/^$xname$/) ? 1:0); } sub vMsg { my ($self, $level, $msg) = @_; chomp $msg; return if ($self->{quiet} && $level<=0); return if ($self->{options}->{verbose} < $level); warn "FakeParser: $msg\n"; } ############################################################################### ############################################################################### ############################################################################### # =pod =head1 Usage use FakeParser; Parse a string as XML, but don't fail if it's not WF. Used almost exactly like C from CPAN (except that it doesn't support various "Styles"). This is I a conforming XML parser. But it's close, small, fairly fast, and handy. It should produce the same results as any SAX parser, for any well-formed XML document; but it will also survive a variety of errors and correct some. The event structure is basically the same as the SAX interface used (for example) by XML::Parser (see http://search.cpan.org/~msergeant/XML-Parser-2.36/Parser.pm), though you can also (optionally) get events for individual attributes. If desired, you can instead use the "pull" interface, where you start up the parser and then ask for events one at a time, rather than getting callbacks. =head2 Why/how this is not a real XML parser For good reasons, the XML standard requires that a conforming XML parser terminate if it finds a well-formedness error. If you have "sort of XML" data (for example, if some end-tags are missing, or some attributes are not quoted, etc.), this means you can't use a conforming XML parser to clean it up. You I use a standalone program like the excellent I; or you can use I, which is meant to be "plugged in" as a compatible replacement for I in such situations. =head3 Special (non-XML-like) parsing behaviors Most of these cases will also generate an "ERROR" event preceding whatever they return (which you can just ignore if desired). It should be the case that you can create a conforming XML parser merely by causing your application to die on seeing an ERROR event; however, this is probably only I true. That is, not all well-formedness errors are guaranteed to be caught. =over =item * Multiple root-level elements are allowed. =item * Missing end-tags are provided when an outer end-tag or EOF is found. =item * End-tags for non-open elements are ignored. =item * Element and attribute names that start with C<[-.0-9]>, or that contain characters in Perl \w (a broader class than XML permits) are allowed. =item * Unquoted attribute values are ok if they only contain XML NAME characters. =item * SGML/HTML style bare-NAME-token attributes will be accepted, such as in "". However, they are treated as equivalent to (for example) border="border", rather than checking for an enumerated type declaration in a DTD. =item * Attribute lists that can't be figured out, are survived. =item * "&" or "<" characters that don't begin markup constructs will be returned as text. =item * Late, case-varying, repeated, and/or otherwise erroneous XML declarations are allowed. =item * Encoding declarations are ignored (the input is expected to be UTF-8). Consider using C if needed. =item * Unknown named entity references are allowed, and treated as text. =item * There is an option to not expand any entity references at all. =item * You can set up your own entity definitions and attribute defaults, or HTML's set, regardless of DTD. =item * Spurious marked-section ends ("]]>") are reported and treated as text. =item * Although it does not (yet) parse external DTDs, it does parse DOCTYPE internal subsets (and is not picky about the '[' and ']'). =item * The caller will soon be able to set attribute defaulting via method I. =back =for nobody =================================================================== =head1 Example (push) use FakeParser; $fp = new FakeParser(); $fp->setHandlers( { "Start" => \&myStartHandler, "End" => \&myEndHandler, "Char" => \&myCharHandler, "ERROR" => \&myErrorHandler, } $fp->parseText($myText); =head1 Example (pull) use FakeParser; $fp = new FakeParser(); $fp->addText($myText); while (my @args = @{$fp->pull_more()}) { my $eType = shift @args; if ($eType eq "Start") { ... } elsif ($eType eq "End") { ... } elsif ($eType eq "Char") { ... } else { ... } } =for nobody =================================================================== =head1 Methods =over =item * B() =item * B() Clear any document-specific state (keep declared entities, etc). This does not reset options, attribute defaults, etc; the idea is to handle a new document of the same kind you were just doing. =item * B(I) Options available (all Boolean) include: =over =item * I (Boolean) -- Provide a separate "Attr" event for each attribute, and an "AttrFin" event to mark the end of all attributes for the preceding start-tag. =item * I (Boolean) -- Never return adjacent text nodes (such as for a character reference in the middle of text content), but combine them into a single text event. Not yet supported. =item * I (Boolean) -- Normalize white space in text, and don't return any white-space-only text nodes. =item * I (Boolean) -- Entities are quietly expanded (the default). If this option is turned off, they will be returned as "Unparsed" events instead. =item * I (Boolean) -- Support HTML named character entities. =item * I (Boolean) -- Support the 5 XML built-in entities. =item * I (Boolean) -- parse the contents of PIs as if they were attribute specification lists, and return them that way, followed by an C event (see also I). =item * I (Boolean) -- suppress zero-level messages. =item * I (integer) -- how many warning messages to provide. =item * I (Boolean) -- enables XML-style error handling, where the parse is terminated at the first WF error. This is B the default behavior, because the point of this parser is to help get through lame data. If you set this option, many (probably not all) XML Well-formedness errors will be caught, and result in termination of the parse. In that case, C will be returned from calls such as I, I, and so on. =back =item * B(I) Return the value of the named option (or undef if I is unknown). =item * B(hash) Like the corresponding method in XML::Parser. I maps SAX event names to the Perl procedures to be called when each one happens (see above for a list and an exmple). There are a couple extra events unlike XML::Parser. =item * B() Remove all attached event handlers (see I). =item * B(I) Assert that elements of type I may never occur within elements of type I (directly or indirectly). Thus, if the parse sees a I element, it will first force all applicable I elements closed. Repeatable. =for nobody =================================================================== =item * B(I) Append the text to the buffer to be parsed. =item * B(I) Append the contents of the file at I to the buffer to be parsed. =for nobody =================================================================== =item * B(I,optionsHash?) Given either an open file handle or a string, parse and call event handlers (see I). =item * B(I,optionsHash?) Parse the string I and call event handlers (see I). =item * B(I,optionsHash?) Open the file at I and call I on it. =back =head2 Methods for "pull" parsing =over =item * B(optionsHash?) Return a handle to an ExpatNB object, which lets you parse chunks at a time. Call its I() method with a new chunk of XML, and it will parse just that piece, calling event handlers as usual. When done, call its I() method. =item * B() Parse off the next XML construct from the parse buffer, removing it. The event will be returned as a reference to an array, whose elements are essentially the same as the arguments passed to a callback when using Perl C, except that one extra argument is inserted, which is the event type name (the same as you would pass to XML::Parser to register a handler); this argument is [1], after [0] which is a reference to the FakeParser object itself. The rest of the arguments depend on the event type, and are the same as with C. =item * B() Indicates that the document being parsed via I is finished. =item * B(I) Parse the file I and return a reference to an XML::DOM structure. For the moment this just loads the file into a string and calls I. =item * B(I) Parse the string I and return a reference to an XML::DOM structure. Unfinished. =back =for nobody =================================================================== =head2 Schema-related methods =over =item * B(I) (experimental) Define a default value (similar to what you can do with an ATTLIST declaration in an SGML or XML DTD), for a particular attribute. With SGML (but not XML, if I remember right), a default value can be associated with a named attribute for a separate element type, or shared across the same-named attributes on multiple element types, such as I and I here: B, with this parser you cannot share a default like that. =back =for nobody =================================================================== =head1 SAX event types These are the same as for Perl's XML::Parser, plus I and the optional I and I. The names shown below are the keys to use when passing a hash to I(). Each key's data should be a reference to the Perl procedure to be called when the event occurs; it will be passed the parameters shown. =over =item B(I) =item B(I) =item B(I) =item B(I) =item B(I) Iff you set the I option, then instead of attributes being packed into the I event, they will follow it, one event per attribute, in alphabetical order. An I event will then follow to mark the end of attributes. =item B() See I. =item B(I) =item B(I) =item B(I) =item B(I) =item B(I) Message: Printable text for the message =item B(I) Indicates the start of a CDATA marked section. Content will be returned via following I events. =item B(I) Indicates the end of a CDATA marked section. =item B(I) An event for which no handler has been installed. =back =head3 B =over =item B(I) A reference to an unparse external entity (such as an image). =item B(I) An (unexpanded?) external entity reference. =item B(I) =item B(I) An ENTITY declaration in the DTD. =item B(I) An ELEMENT declaration in the DTD. =item B(I) An ATTLIST declaration in the DTD. =item B(I) =item B(I) The DOCTYPE declaration in the DTD. =item B(I) The end of the DOCTYPE declaration in the DTD. =back =for nobody =================================================================== =head1 Known bugs and limitations Does not die on all WF errors, and therefore is not a conforming XML parser. This is intentional. By setting the I option, all WF errors which I caught, will terminate parsing instead of just issuing an C event (some WF errors might not be caught, however). Does not coalesce entities that produce text, or CDATA marked sections, into the surrounding text to make a single text event/node. Does not do anything with namespaces other than accept names that include ":". Does not try to parse the inside of DTDs or external entities. It will properly skip past most DOCTYPE declarations and internal subsets. =for nobody =================================================================== =head1 Related commands C and C do similar repairs, but don't (so far as I know) expose a regular SAX interface. C is somewhat similar. Any XML parser is quite similar, except that it will terminate on various errors which this pseudo-parser will survive and often correct. C converts between countless character sets, so is excellent for getting input ready for I. C uses I to parse the guts of wiki pages, since they often have incomplete or incorrect. C, C, C. =for nobody =================================================================== =head1 Ownership This work by Steven J. DeRose is licensed under a Creative Commons Attribution-Share Alike 3.0 Unported License. For further information on this license, see L. B This software must not be used as an "XML" parser unless it is modified so that it conforms to XML's explicit requirement that XML parsers not accept non-well-formed XML. This requires trapping all of the WF errors this software catches, plus any more that it doesn't, and terminating normal processing at that point. See the XML Recommendation at L for details. The author's present email is sderose at acm.org. For the most recent version, see L. =cut