#!/usr/bin/perl -w # # YMLParser.pm # # Written 2011-03-11 by Steven J. DeRose, sderose@acm.org. # Based on xmlparser and fakeparser. # 2011-03-14 sjd: lookupEntity(). Empty elements generate ETAG events. Help. # 2011-03-17 sjd: Start adding SAX API. Entities in attributes. Built-ins. # 2011-05-13 sjd: More work on SAx API. Add isXmlName, addentity, reset, # getText, openElement, closeElement, getDepth, getFQGI, getCurrentGI, # getCurrentLang, defineEntity. Add setXmlEntities. Return array instead of # packed string from nextEvent. Generalize pendingEvents. # Break out parseAttributeString(); use HTML::Entities. # 2011-05-25 sjd: getCurrentNsList(). A few bugs with pending events. # Add setNoNest(). Big CDATA MS's; DOCTYPE, XML Decl. Better queueing. # 2011-06-02 sjd: Don't return the quotes on attribute values. Add support # for value-only attributes like HTML border. Improve DOCTYPE parsing. # Support -returnForm for pending events. # 2011-07-16 sjd: Start implementing parseToDOM. Fix bugs in entity-handling. # 2011-07-25 sjd: Add -entitiesInPIs for entity expansion inside PIs. Debug. # Don't let text-matching eat YML chars. Swap ^R and ^Q. # 2011-07-26ff sjd: Bunch of debugging. Pull in and packagize EntityManager. # Separate short-attribute features and make optional. Add ymlElementInfo. # Option to split pseudo-attributes out of PI content. Add packages # for ElementDcl, AttlistDcl, NotationDcl. Support lastClosed. Fix reopen. # 2011-08-03 sjd: Still fixing ymlQuad. Move penultimate element items into # element object. Merge base and yml attrs right. # 2011-08-04 sjd: Track/return default namespace, add getDefaultNameSpace(). # Return prefix and URI on Attr events. Add ProcAttr, ProcAttrFin events. # 2011-09-08 sjd: Catch some more WF errors. Normalize to LF in input. # Start on parameter entity support. # 2012-05-18 sjd: Split EntityManager to external package. # 2013-01-21 sjd: Finish switching to external EntityManager. Start Attribute. 
# 2013-06-19 sjd: Change ymlQuad to handle first row / first col better. # Add changeDelimiter(). Ditch "PREFIX" form. Add nextEventArray(). # Start on regex precompiling. # # To do: # Finish compiling regexes.... # Attribute defaults (integrate ElementManager.pm). # Long comments, PIs, etc. Just keep fetching til done. # Finish parseToDom methods # Handle XML decl 'encoding' # Allow <[tag]> and <{tag}> to signal array and hash semantics? # External ent can start w/ XML Decl (w/ charset!) # Finish ATTLIST, NOTATION, etc. dcls # Add rabbit-duck support? # Mod CDATA to be: # # Low priority: # Support local/cumulative IDs # Swap order of parseAttributeStringToArray and parseAttributeString? # Options re. parameter entity expansion? # Factor errors to method to both queue and warn? # Support Stax API? # Test reading straight from a zip files # Return more WF errors # PE refs in markup dcls in internal subset # PE replacement text must match extSubsetDecl # Ref to unparsed entity (but can be named in ENTITY(IES) attrs # attrs can't refer to external ents # Synchronous entities # Split out Element/Attribute Managers? # use strict; use XML::DOM; use HTML::Entities; # http://search.cpan.org/~adamk/Archive-Zip-1.30/lib/Archive/Zip/MemberRead.pm #use Archive::Zip; #use Archive::Zip::MemberRead; use EntityManager; #use ElementManager; our $VERSION = "0.9"; ############################################################################### ############################################################################### # Maintain various information about a single YML element instance. # Cf ElementManager.pm. # # For YML, esp. ymlQuad, we need more information than in XML. If you're in # a TR's 10th TD, you need to still have the *previous* TR's 10th child too # (or at least that child's type and attributes). # # Therefore, as long as an element is on the stack we need its prior sibling # object, which in turn keeps its children's types and attrs. 
# So every element object has arrays for {childTags} and {childAttrs}.
# When we pop an element off the stack we don't just discard it. Instead, we
# put a reference to it in the parent element object as {penultimate}.
# When we close an Element, it's no longer the current child, so it moves over.
#
package ymlElementInstanceInfo;

# Create the record for one open element instance.
#   $tag          - element type name.
#   $attrArrayRef - ref to a flat (name1, value1, name2, value2, ...) array.
#                   When omitted, an empty list is stored so that the
#                   attribute accessors below never dereference undef.
sub ymlElementInstanceInfo::new {
    my ($class, $tag, $attrArrayRef) = @_;
    my $self = {
        tag           => $tag,                  # What XML element type?
        attrs         => $attrArrayRef || [],   # (name1,value1,name2,value2,...)
        lang          => "",                    # Inherited xml:lang code
        newNameSpaces => [],                    # nss declared *here*
        dftNameSpace  => "",                    # (only if set here)

        # Following are needed mainly for ymlQuad. We keep the full node
        # for the preceding-(element)-sibling of each open element; but
        # for those we only need the list of child types and attrs.
        #
        penultimate   => undef,                 # Ref to 2nd-last child
        childTags     => [],                    # Tags of all the element children
        childAttrs    => [],                    # Attrs of same
    };
    bless $self, $class;
    return($self);
}

# Private: position of attribute name $aname in the flat {attrs} list
# (always an even index), or -1 when the attribute is not present.
sub ymlElementInstanceInfo::_attrIndex {
    my ($self, $aname) = @_;
    my $attrs = $self->{attrs};
    for (my $i=0; $i<scalar(@{$attrs}); $i+=2) {
        if ($attrs->[$i] eq $aname) { return($i); }
    }
    return(-1);
}

sub ymlElementInstanceInfo::getTag {
    my ($self) = @_;
    return($self->{tag});
}

sub ymlElementInstanceInfo::setTag {
    my ($self, $tag) = @_;
    $self->{tag} = $tag;
}

# Return the ref to the flat (name, value, ...) attribute list itself
# (not a copy).
sub ymlElementInstanceInfo::getAttrArray {
    my ($self) = @_;
    return($self->{attrs});
}

# Return the value of attribute $aname, or undef if it is not set.
sub ymlElementInstanceInfo::getAttr {
    my ($self, $aname) = @_;
    my $i = $self->_attrIndex($aname);
    return(($i >= 0) ? $self->{attrs}->[$i+1] : undef);
}

# Set attribute $aname to $avalue, appending a new pair if it is not there.
sub ymlElementInstanceInfo::setAttr {
    my ($self, $aname, $avalue) = @_;
    my $i = $self->_attrIndex($aname);
    if ($i >= 0) {
        # The value lives in the slot *after* the name; writing to slot $i
        # would replace the name and leave the old value in place.
        $self->{attrs}->[$i+1] = $avalue;
        return;
    }
    push @{$self->{attrs}}, $aname, $avalue;
    return;
}

# Remove attribute $aname (name and value) if present.
sub ymlElementInstanceInfo::delAttr {
    my ($self, $aname) = @_;
    my $i = $self->_attrIndex($aname);
    if ($i >= 0) {
        # splice keeps the list dense; delete() on array elements would
        # leave undef holes and break the name/value pairing.
        splice(@{$self->{attrs}}, $i, 2);
    }
    return;
}

sub ymlElementInstanceInfo::getLang {
    my ($self) = @_;
    return($self->{lang});
}

sub ymlElementInstanceInfo::setLang {
    my ($self,$lang) = @_;
    $self->{lang} = $lang;
}

# Record (or replace) a namespace declaration made on this element.
# {newNameSpaces} is a flat (abbr1, uri1, abbr2, uri2, ...) list.
sub ymlElementInstanceInfo::setNameSpace {
    my ($self, $abbr, $uri) = @_;
    my $nss = $self->{newNameSpaces};
    for (my $i=0; $i<scalar(@{$nss}); $i+=2) {
        if ($nss->[$i] eq $abbr) {
            $nss->[$i+1] = $uri;
            return;
        }
    }
    push @{$nss}, $abbr, $uri;
    return;
}

# Note that a child element of type $tag, with attributes $attrsRef
# (an array ref, or a false value for none), was opened inside this element.
# Dies if $attrsRef is true but not a reference.
sub ymlElementInstanceInfo::appendChild {
    my ($self, $tag, $attrsRef) = @_;
    if (!$tag) { $tag = "_UNKNOWN"; }
    if ($attrsRef && !ref($attrsRef)) {
        die "Non-ref passed for attrs to appendChild.\n";
    }
    push @{$self->{childTags}}, $tag;
    push @{$self->{childAttrs}}, $attrsRef;
    # The two lists are parallel; catch any outside tampering early.
    if (scalar(@{$self->{childTags}}) != scalar(@{$self->{childAttrs}})) {
        die "tags and attrs are out of sync.\n";
    }
} # appendChild


###############################################################################
###############################################################################
#
package YMLParser;

my $badControls = "[";                  # Make list of non-XML chars
for (my $i=1; $i<32; $i++) {
    ($i==9 || $i==10 || $i==13) && next;    # TAB, LF and CR are legal in XML
    $badControls .= chr($i);
}
$badControls .= "]";

# Bind control characters for the special YML shorthand (cf changeDelimiter())
#
my $ymlQuad                = chr(17);   # ^Q DC1
my $ymlReopen              = chr(18);   # ^R DC2
my $ymlStartAttributes     = chr(19);   # ^S DC3 (after ymlReopen or ymlQuad)
my $ymlTerminateAttributes = chr(20);   # ^T DC4 (after ymlStartAttributes)
my $ymlUp                  = chr(21);   # ^U NAK
my $ymlChars = "$ymlUp$ymlReopen$ymlQuad"; # Not the attr ones!

# Compiled regexes # # XML delimiters my $como = qr/^/; my $mdo = qr/^/; my $Xpi = qr/^$pio(.*?)$pic/; my $Xcom = qr/^$como(.*?)$comc/; my $XstartTag = qr/^<($xname)(\s.*?)?(\/?>)/; my $XendTag = qr/^<\/(.*?)\s*>/; my $XymlChars = qr/^([$ymlChars]|<\/>)/; my $XcdataStart = qr/^/; # Entity refs my $XhexRef = qr/^(&#x[\da-f]+;)/; my $XdecRef = qr/^(&#\d+;)/; my $XentRef = qr/^(&$xname;)/; my $XpentRef = qr/^%($xname;)/; # Markup declarations my $XdoctypeO = qr/^)?/; my $XsysIdent2 = qr/^(SYSTEM|PUBLIC)\s*$qlit\s*$qlit?/; my $Xnotation = qr/^\s*NOTATION\s+(\w+)/; my $XdclElement = qr/^/; my $XdclEntity = qr/^/; my $XdclAttlist = qr/^/; my $XdclNotation = qr/^/; # Based on XML::Parser (cf SAX). See also tupleSets/, fakeParser. # my %eventNames = ( "Init" => "RUN+", # "Final" => "RUN-", # "Start" => "ELM+", # "End" => "ELM-", # "Char" => "CHAR", # "Proc" => "PINS", # "Comment" => "COMM", # "CdataStart" => "CDT+", # "CdataEnd" => "CDT-", # "Default" => "DEFT", # "Unparsed" => "UNPA", # "ExternEnt" => "EXT+", # "ExternEntFin" => "EXT-", # # Markup declarations, from the DTD "Entity" => "ENT:", # "Element" => "ELM:", # "Attlist" => "ATT:", # "Notation" => "NOT:", # "Doctype" => "DOC+", # "DoctypeFin" => "DOC-", # "XMLDecl" => "XDCL", # "Attr" => "ATT+", # Extension "AttrFin" => "ATT-", # Extension "ProcAttr" => "PI@+", # Extension "ProcAttrFin" => "PI@-", # Extension "ERROR" => "ERRR", # ); sub YMLParser::new { my ($class) = @_; my $s = ""; my %ha = (); my @pe = ("Init"); my $self = { version => "2013-06-19", # Options verbose => 0, attrEvents => 0, # Return separate event per attribute? normalize => 0, # Normalize white space? useHtmlEnties => 1, # Support HTML named entities? useXmlEnties => 1, # Support the 5 XML built-ins? expandEntities => 1, # Handle entities transparently? entitiesInPIs => 0, # Expand ent. refs in PIs? attrsInPIs => 0, # Treat PI content as pseudo-attributes? coalesce => 0, # Never return adjacent text nodes? yml => 0, # Support my YML proposal? 
shortAttrs => 0, # Allow special short-attribute forms? defaultLang => "EN", # In case no xml:lang on document element cantGoInside => {}, # {child} => disallowed parent(s). # Markup declaration stuff entManager => new EntityManager, elementDcls => {}, notationDcls => {}, # Data to be worked on theText => $s, # Text being parsed pendingEvents => [ \@pe ], # Events waiting to be returned # Parse state theStack => [], # A stack of ymlElementInstanceInfo objects inCDATA => 0, # Are we inside a CDATA M S? inDoctype => 0, # Are we inside the DOCTYPE? seenTag => 0, # Have we seen a tag yet? current_file => "", # Where we are current_line => 1, handlers => \%ha, # Event callbacks for 'push' parsing }; # self bless $self, $class; return $self; } # new sub YMLParser::yMsg { my ($self, $level, $msg) = @_; warn("YMLParser: $msg\n"); } sub YMLParser::identifier { my ($self) = @_; return("YMLparser.pl $self->{version}"); } sub YMLParser::reset { my ($self, $n, $v) = @_; $self->{theText} = ""; $self->resetStack(); my @pe = ( "Init" ); $self->{pendingEvents} = [ \@pe ]; $self->{inCDATA} = 0; $self->{inDoctype} = 0; $self->{seenTag} = 0; $self->{current_file} = ""; $self->{current_line} = 1; } # reset sub YMLParser::changeDelimiter { my ($self, $name, $v) = @_; if ($name eq "ymlQuad") { $ymlQuad = $v || chr(17); } elsif ($name eq "ymlReopen") { $ymlReopen = $v || chr(18); } elsif ($name eq "ymlStartAttributes") { $ymlStartAttributes = $v || chr(19); } elsif ($name eq "ymlTerminateAttributes") { $ymlTerminateAttributes = $v || chr(20); } elsif ($name eq "ymlUp") { $ymlUp = $v || chr(21); } else { die "Unknown delimiter '$name'.\n"; } $ymlChars = "$ymlUp$ymlReopen$ymlQuad"; # Not the attr ones! 
}   # closes YMLParser::changeDelimiter, whose body precedes this span

# Set the named parser option to $v. Dies if the name is not a known option.
sub YMLParser::setOption {
    my ($self, $optionName, $v) = @_;
    my $shown = (defined $optionName) ? $optionName : "";
    # "div" and "h" stay accepted only because earlier versions accepted them.
    if ($shown !~ m/^(verbose|attrEvents|normalize|div|h|expandEntities|entitiesInPIs|attrsInPIs|coalesce|yml|shortAttrs|defaultLang)$/) {
        die "Bad option name '$shown'\n";
    }
    $self->{$shown} = $v;
}

# Pass an additional entity search path on to the entity manager.
sub YMLParser::appendEntityPath {
    my ($self, $path) = @_;
    $self->{entManager}->appendEntityPath($path);
}

# See http://search.cpan.org/~msergeant/XML-Parser-2.36/Parser.pm
#
# Register callbacks for 'push' parsing. $hhRef maps event names (the keys
# of %eventNames) to code refs. Dies on an unknown event name.
sub YMLParser::setHandlers {
    my ($self, $hhRef) = @_;
    for my $handlerName (keys %{$hhRef}) {
        if (!defined $eventNames{$handlerName}) {
            die "YMLParser: Unknown handler '$handlerName'.\n";
        }
        $self->{handlers}->{$handlerName} = $hhRef->{$handlerName};
    }
}

# Record that element type $child may not occur inside $parent.
# Disallowed parents for one child accumulate as a tab-separated string.
sub YMLParser::setNoNest {
    my ($self, $parent, $child) = @_;
    # {cantGoInside} is a hash slot on the object, not a method.
    my $curList = $self->{cantGoInside}->{$child};
    $self->{cantGoInside}->{$child} =
        ($curList) ? "$curList\t$parent" : $parent;
}

# Load $file completely, then parse it, calling the registered handlers.
# A name ending in ".zip" is read from the first member of that archive
# (experimental). Returns the number of events seen, or undef if the input
# could not be opened. Dies if no handlers have been set.
sub YMLParser::parsefile {
    my ($self, $file) = @_;
    $self->{current_file} = $file;
    (scalar keys %{$self->{handlers}}) ||
        die "No handlers set for YMLParser.\n";

    my $fh    = undef;
    my $isZip = ($file =~ m/\.zip$/) ? 1 : 0;
    if ($isZip) {
        # Loaded on demand: the matching 'use' lines are commented out.
        require Archive::Zip;
        require Archive::Zip::MemberRead;
        my $zip = Archive::Zip->new($file);
        ($zip) || return(undef);
        my @members = $zip->members();
        (scalar(@members)) || return(undef);
        $fh = Archive::Zip::MemberRead->new($zip, $members[0]);
        $self->yMsg(0, "parsing directly from zip files is experimental.");
    }
    else {
        (open($fh, "<", $file)) || return(undef);
    }

    while (1) {
        my $rec;
        if ($isZip) {
            # NOTE(review): MemberRead's getline() is documented as returning
            # the line without its record separator, so LF is put back here;
            # confirm against the installed Archive::Zip version.
            $rec = $fh->getline();
            (defined $rec) && ($rec .= "\n");
        }
        else {
            $rec = <$fh>;
        }
        (defined $rec) || last;         # undef means end of input
        $self->addText($rec);
    }
    if ($isZip) { $fh->close(); }
    else        { close($fh); }

    $self->yMsg(0, "parsefile: loaded, starting parse, self: $self");
    my $nEvents = 0;
    while (my $epacked = $self->nextEvent()) {
        my @e = split(/\t/, $epacked);
        #shift @e;
        my $type = shift @e;
        if (!defined $type) {
            die "Bad event. " . $epacked . "\n";
        }
        # Events with no handler of their own go to "Default", if any.
        my $hand = $self->{handlers}->{$type};
        if (!defined $hand) { $hand = $self->{handlers}->{"Default"}; }
        if ($hand) {
            $hand->($self, @e); # Is this right?
        }
        $nEvents++;
        ($type eq "Final") && last;
    }
    return($nEvents);
}

# Parse $text, calling the registered handlers. Returns the number of events
# seen. Dies if no handlers are set, or if an event has neither a handler of
# its own nor a "Default" handler.
sub YMLParser::parsestring {
    my ($self, $text) = @_;
    (scalar keys %{$self->{handlers}}) ||
        die "No handlers set for YMLParser.\n";
    my $nEvents = 0;
    $self->addText($text);
    while (my $epacked = $self->nextEvent()) {
        my @e = split(/\t/, $epacked);
        #shift @e;
        my $type = shift @e;
        if (!defined $type) {
            die "Bad event. " . $epacked . "\n";
        }
        my $hand = $self->{handlers}->{$type};
        if (!defined $hand) { $hand = $self->{handlers}->{"Default"}; }
        if (defined $hand) {
            $hand->($self, @e);
        }
        else {
            die "Bad event type '$type'\n";
        }
        $nEvents++;
        ($type eq "Final") && last;
    }
    return($nEvents);
}

# Not yet implemented: always dies. The code after the die is an unfinished
# draft, kept for whoever completes it.
sub YMLParser::parsefiletoDOM {
    my ($self, $file) = @_;
    die "YMLParser: parsing file to a DOM is not yet implemented.\n";

    $self->{current_file} = $file;
    my $xf;
    (open($xf, "<", $file)) || return(undef);
    while (defined(my $rec = <$xf>)) {
        $self->addText($rec);
    }
    close($xf);
    return($self->parsestringtoDOM());
} # parseFileToDOM

# Not yet implemented: always dies. The code after the die is an unfinished
# draft that turns the event stream into DOM nodes.
# NOTE(review): the XML::DOM constructor and make*() calls below are kept
# from the draft and have not been checked against the XML::DOM API.
sub YMLParser::parsestringtoDOM {
    my ($self, $text) = @_;
    die "YMLParser: parsing string to a DOM is not yet implemented.\n";

    if (defined $text) { $self->addText($text); }
    my $theDOM = XML::DOM->new();
    my $currentElement = $theDOM->getDocumentElement();
    $self->{attrEvents} = 1;

    # nextEventArray() returns an array ref whose first item is the event
    # type; nextEvent() returns a packed string and cannot be dereferenced.
    while (my $evRef = $self->nextEventArray()) {
        my @eventRef = @{$evRef};
        (scalar(@eventRef)) || last;
        my $eventType = $eventRef[0];

        if ($eventType eq "Final") {
            last;
        }
        elsif ($eventType eq "Start") {
            my $newel = $theDOM->makeElement($eventRef[1]);
            $currentElement->appendChild($newel);
            $currentElement = $newel;
        }
        elsif ($eventType eq "Attr") {
            $currentElement->setAttribute($eventRef[1],$eventRef[2]);
        }
        elsif ($eventType eq "End") {
            $currentElement = $currentElement->getParentNode();
        }
        elsif ($eventType eq "Char") {
            my $newel = $theDOM->makeTextNode($eventRef[1]);
            $currentElement->appendChild($newel);
        }
        elsif ($eventType eq "Comment") {
            my $newel = $theDOM->makeCommentNode($eventRef[1]);
            $currentElement->appendChild($newel);
        }
        elsif ($eventType eq "Proc") {
            my $newel = $theDOM->makeProcNode($eventRef[1],$eventRef[2]);
            $currentElement->appendChild($newel);
        }
        # Every other event type (Init, XMLDecl, Doctype, DoctypeFin,
        # AttrFin, ProcAttr, ProcAttrFin, CdataStart, CdataEnd, Default,
        # Unparsed, Notation, ExternEnt, ExternEntFin, Entity, Element,
        # Attlist, ERROR) adds nothing to the DOM.
    } # while
    $theDOM->normalize();
    return(\$theDOM);
} # parsestringtoDOM


###############################################################################
# Manage the text to be parsed (replace with real entity manager).
#
# Append $s to the text waiting to be parsed (undef is ignored).
sub YMLParser::addText {
    my ($self, $s) = @_;
    if (defined $s) { $self->{theText} .= $s; }
}

# Return the text still waiting to be parsed.
sub YMLParser::getText {
    my ($self, $s) = @_;
    return($self->{theText});
}

# Empty the waiting text and return what was there.
sub YMLParser::clearText {
    my ($self) = @_;
    my $rc = $self->{theText};
    $self->{theText} = "";
    return($rc);
}


###############################################################################
# Do a "pull" style parse. As we're parsing, we simply push the event(s) we
# find or infer (each is a reference to an array, which is the same as the
# array of arguments (not counting the parser itself) that XML::Parser would
# pass to a handler); when events are already queued, we just return them.
# sub YMLParser::nPendingEvents { my ($self) = @_; return(scalar(@{$self->{pendingEvents}})); } sub YMLParser::nextEvent { my ($self) = @_; my $e = $self->nextEventArray(); shift @{$e}; return(join("\t", @{$e})); } sub YMLParser::nextEventArray { my ($self) = @_; ($self->{verbose}) && $self->yMsg(0, "Entering nextEvent."); # If there's a pending event(s) already, just return the next one. # if ($self->nPendingEvents()>0) { return($self->shiftEvent()); } # Once there's nothing pending, do real parsing # #if (!$self->{theText}) { $self->reload(); } # ENTITY MGR my $raw = $self->{theText}; if (!$raw) { # EOF my $d = $self->getDepth(); if ($d > 0) { $self->pushEvent("End", $self->getCurrentTag()); } else { $self->pushEvent("Final"); } } # In constructs that can persist across events! # "]]>" outside CDATA MS is an error, but we ignore it. # elsif ($self->{inCDATA}) { # in CDATA MS if ($raw =~ s/^(.*?)]]>//s) { if ($1) { $self->pushEvent("Char", $1); } $self->pushEvent("CdataEnd"); } else { $self->pushEvent("Char", $raw); } } elsif ($self->{inDoctype} && $raw =~ s/\s*\]?\s*>//) { # Doctype Fin $self->pushEvent("DoctypeFin"); $self->{inDoctype} = 0; } # Potentially long constructs elsif ($raw =~ s/^pushEvent("CdataStart"); } elsif ($raw =~ s/^]]>//s) { # CDATA MS END my $msg = "WF: Found ]]> not inside CDATA marked section\n"; $self->pushEvent("ERROR", $msg); $self->vMsg(0,$msg); } elsif ($raw =~ s/^<\?(XML) (.*?)\?>//si) { # XML DECL ($1 eq "xml") || $self->pushEvent( "ERROR", "WF: XML declaration target is not lower-case."); if ($self->{seenTag}) { my $msg = "WN: Late XML declaration found\n"; $self->pushEvent("ERROR", $msg); $self->vMsg(0,$msg); } else { $self->pushEvent("XMLDecl", $2); } } elsif ($raw =~ s/^<\?(.*?)\?>//s) { # PROCESSING INSTR my $pi = $1; $pi =~ s/^($xname)\s*//; my $target = ($1) ? 
($1) : ""; if ($target =~ m/^yml/) { # YML PI } if ($self->{entitiesInPIs}) { $pi = $self->expandEntities($pi); } $self->pushEvent("Proc", $target, $pi); if ($self->{attrsInPIs}) { my $aaRef = $self->parseAttributeStringToArray($pi); for (my $i=0; $aaRef && $ipushEvent("ProcAttr",$aaRef->[$i],$aaRef->[$i+1]); } $self->pushEvent("ProcAttrFin"); } } # PI elsif ($raw =~ s/^//s) { # COMMENT my $text = $1; ($text =~ m/-$/) && pushEvent( "ERROR", "WF: Comment cannot end with '--->'."); $self->pushEvent("Comment", $text); } elsif ($raw =~ s/^"); } sub YMLParser::setXmlEntities { my ($self, $flag) = @_; if (defined $flag && !$flag) { $self->useXmlEntities = 0; } else { $self->useXmlEntities = 1; } } # setXmlEntities sub YMLParser::addTextEntity { my ($self, $aName, $value) = @_; if (defined $self->{textEntities}->{$aName} || defined $self->{fileEntities}->{$aName}) { $self->yMsg(0, "Entity '$aName' redefined.\n"); } $self->{textEntities}->{$aName} = $value; $self->{ecount}++; } sub YMLParser::setHtmlEntities { my ($self,$flag) = @_; if (defined $flag && !$flag) { $self->useHtmlEntities = 0; } else { $self->useHtmlEntities = 1; } } # setHtmlEntities ############################################################################### # The parser just scans for the closing "/?>" on start-tags. Anything ahead # of that gets passed here to parse out the individual attributes. # Allows several minimization options. # ####### CHANGE this around -- build array, then make hash from that. 
#
# Parse an attribute-specification string and return a ref to a flat
# (name1, value1, name2, value2, ...) array, ordered by attribute name.
sub YMLParser::parseAttributeStringToArray {
    my ($self, $attrString) = @_;
    my $ahref = $self->parseAttributeString($attrString);
    my @aarray = ();
    for my $k (sort keys %{$ahref}) {
        push @aarray, $k, $ahref->{$k};
    }
    return(\@aarray);
}

# Parse an attribute-specification string (everything between the element
# name and the closing "/?>") and return a ref to a hash of name => value.
# Quotes are removed from values, entity references are expanded, and white
# space characters are each turned into a space. With the "shortAttrs"
# option, the +name / -name, unquoted, and bare-name forms are accepted too.
# On a syntax error an ERROR event is queued and whatever was parsed so far
# is returned.
sub YMLParser::parseAttributeString {
    my ($self, $attrString) = @_;
    my %atHash = ();
    if (!defined $attrString) { return(\%atHash); }
    my $orig = $attrString;

    while (1) {
        $attrString =~ s/^\s+//;
        # Test the length, not truth: a remainder of "0" is still input, and
        # an all-white-space remainder is simply the end, not a syntax error.
        (length($attrString) > 0) || last;
        my $lengthBefore = length($attrString);

        my $an = "";
        my $av = "";
        if ($attrString =~                                      # Normal
            s/^($xname)\s*=\s*(".*?"|'.*?')\s*//s) {            # /s: values may span lines
            $an = $1;
            $av = substr($2, 1, -1);                            # drop the quotes
        }
        elsif ($self->{shortAttrs} &&                           # +/-
            $attrString =~ s/^([-+])($xname)\s*//) {            # anchored like the others
            $an = $2;
            $av = ($1 eq '+') ? "1" : "0";
            # $self->yMsg(0, "+/- attr, '$1' for '$2'.\n");
        }
        elsif ($self->{shortAttrs} &&                           # Unquoted
            $attrString =~ s/^($xname)\s*=\s*($xname)\s*//) {
            $an = $1;
            $av = $2;
            # $self->yMsg(0, "unquoted attr, '$an' = '$av'.\n");
        }
        elsif ($self->{shortAttrs} &&                           # Bare
            $attrString =~ s/^($xname)\s*//) {
            $an = $av = $1;
            # $self->yMsg(0, "bare attr, '$an'.\n");
        }
        else {                                                  # Failed
            my $msg = "WF: Bad attribute syntax in '$attrString'\n";
            $self->vMsg(0,$msg);
            $self->pushEvent("ERROR", $msg);
            last;
        }

        if (!$an) {
            $self->vMsg(0,"YMLWF: Missing attribute name in '$orig'\n");
        }
        else { # Normalize the value per XML 3.3.3
            (exists $atHash{$an}) &&
                $self->vMsg(0,"WF: Duplicate attribute '$an'\n");
            $atHash{$an} = $self->expandEntities($av); # Should be recursive
            $atHash{$an} =~ s/[ \r\n\t]/ /g;
            if (0) { # non-CDATA attrs, if we saw the declaration
                $atHash{$an} =~ s/ +/ /g;
                $atHash{$an} =~ s/^ //;
                $atHash{$an} =~ s/ $//;
            }
        }

        # A pattern that matched without consuming anything would loop forever.
        (length($attrString) < $lengthBefore) || last;
    } # while
    return(\%atHash);
} # parseAttributeString


###############################################################################
#
# Record the declaration of element type $ename with content model $model.
sub YMLParser::dclElement {
    my ($self, $ename, $model) = @_;
    $self->{elementDcls}->{$ename} = ElementDcl->new($ename, $model);
}

# Record attribute $aname of element type $ename as "type<TAB>default".
# An element type with no entry yet gets an empty declaration first.
sub YMLParser::dclAttr {
    my ($self, $ename, $aname, $atype, $adefault) = @_;
    $self->{elementDcls}->{$ename} ||= ElementDcl->new($ename, "");
    if (!defined $atype)    { $atype = ""; }
    if (!defined $adefault) { $adefault = ""; }
    $self->{elementDcls}->{$ename}->{attrs}->{$aname} = "$atype\t$adefault";
}

###############################################################################
# Return true if the name passed is a legitimate XML NAME.
# (I'm not certain Perl \w is exactly the same as needed for XML NAMEs,
# but it's close).
#
sub YMLParser::isXmlName {
    my ($self, $theName) = @_;
    if (!defined $theName) { return(0); }
    return(($theName =~ m/^$xname$/) ? 1:0);
}

# Return 1 if the (first) character of $c is in the XML Char ranges, else 0.
sub YMLParser::isXmlChar {
    my ($self, $c) = @_;
    if (!defined $c || $c eq "") { return(0); }
    my $n = ord($c);
    if ($n == 0x0009 || $n == 0x000A || $n == 0x000D ||
        ($n >= 0x0020 && $n <= 0xD7FF) ||
        ($n >= 0xE000 && $n <= 0xFFFD) ||
        ($n >= 0x10000 && $n <= 0x10FFFF)) {
        return(1);
    }
    return(0);
}

# Issue $msg through yMsg(), but only when the "verbose" option is on.
sub YMLParser::vMsg {
    my ($self, $level, $msg) = @_;
    if (!$msg) { $msg = ""; }
    chomp $msg;
    if ($self->{verbose}) {
        # yMsg() adds the "YMLParser: " prefix and the newline itself.
        $self->yMsg(0, $msg);
    }
}


###############################################################################
# Manage the open-element stack and various state information.
# For YML, we have to keep the previous sibling of each open element, too.
#
sub YMLParser::resetStack {
    my ($self) = @_;
    $self->{theStack} = [];
}

# Push a new element onto the open-element stack.
# Arguments after $self are the tag, then a flat (name, value, ...) attribute
# list. xml:lang and namespace declarations among the attributes are recorded
# on the new stack frame. Returns the new stack depth.
sub YMLParser::openElement {
    my $self = shift;
    my $tag = shift;
    if (!$tag) {
        $self->yMsg(0, "openElement: nil tag arg\n");
        $tag = "_UNKNOWN";
    }
    my @attrArray = @_;

    my $curFrame = $self->{theStack}->[-1];
    my $newEI = ymlElementInstanceInfo->new($tag, \@attrArray);
    # Language is inherited from the parent unless xml:lang overrides it.
    $newEI->setLang($curFrame ? $curFrame->{lang} : $self->{defaultLang});

    for (my $i=0; $i<scalar(@attrArray); $i+=2) {
        my $aname  = $attrArray[$i];
        my $avalue = $attrArray[$i+1];
        if ($aname eq "xml:lang") {
            $newEI->setLang($avalue);
        }
        elsif ($aname eq "xmlns" || $aname eq "xmlns:") {
            # The default namespace is declared with plain "xmlns".
            $newEI->{dftNameSpace} = $avalue;
        }
        elsif ($aname =~ m/^xmlns:/) { # should we inherit default?
            # NOTE(review): the whole attribute name ("xmlns:pfx") is stored
            # as the key, as before; confirm what the lookup callers pass.
            $newEI->setNameSpace($aname,$avalue);
        }
    }

    # Possible bug: We don't track multiple children of #ROOT
    if ($curFrame && $self->{yml}) {
        $curFrame->appendChild($tag,\@attrArray);
    }
    push @{$self->{theStack}}, $newEI;
    return($self->getDepth());
} # openElement

# Pop the innermost open element. $tag, if given, is only compared with the
# element actually open, and a mismatch is reported. Returns the popped
# element object, or nothing if no element was open.
sub YMLParser::closeElement {
    my ($self, $tag) = @_;
    my $curFrame = $self->{theStack}->[-1];
    if (!$curFrame) {
        $self->yMsg(0, "closeElement called with nothing open.\n");
        return();
    }
    my $curTag = $curFrame->{tag};
    if ($tag && $curTag ne $tag) {
        $self->yMsg(0, "closeElement called for '$tag', expected '" .
            $curFrame->{tag} . "'.");
    }

    # Record where this element type last ended, for later error msgs.
    $self->{elementDcls}->{$curTag}->{lastClosed} =
        $self->{entManager}->getWholeEntityLoc();

    # ymlQuad needs the element as long as its 1st following-sibling is open.
    if ($self->getDepth()>=2) {
        $self->{theStack}->[-2]->{penultimate} = $curFrame;
    }
    return(pop @{$self->{theStack}});
} # closeElement


# Searching/counting along the theStack
#
# Number of elements currently open.
sub YMLParser::getDepth {
    my ($self) = @_;
    return(scalar(@{$self->{theStack}}));
}

# Tags of all open elements, outermost first, each preceded by "/".
sub YMLParser::getFQGI {
    my ($self) = @_;
    return(join("", map { "/" . $_->{tag} } @{$self->{theStack}}));
}

# Return 1 if an element of type $tag is open anywhere on the stack, else 0.
sub YMLParser::isOpen {
    my ($self, $tag) = @_;
    return(($self->nOpen($tag) > 0) ? 1 : 0);
}

# Number of open elements of type $tag.
sub YMLParser::nOpen {
    my ($self, $tag) = @_;
    return(scalar(grep { $_->{tag} eq $tag } @{$self->{theStack}}));
}

# Stack index (0 = outermost) of the outermost open element whose tag occurs
# as a whole word in the string $tags; undef if there is none.
sub YMLParser::findOutermost { # Largest of any of types named
    my ($self, $tags) = @_;
    if (!defined $tags) { return(undef); }
    my $d = $self->getDepth();
    for (my $i=0; $i<$d; $i++) {
        my $curTag = $self->{theStack}->[$i]->{tag};
        # \Q..\E: characters such as "." in a tag must match literally.
        if ($tags =~ m/\b\Q$curTag\E\b/) { return($i); }
    }
    return(undef);
}


# Get information about current (innermost) element.
#
# Each getCurrent...() accessor returns undef when no element is open,
# instead of calling a method on an undefined stack frame.

# Tag of the innermost open element.
sub YMLParser::getCurrentTag {
    my ($self) = @_;
    my $curFrame = $self->{theStack}->[-1];
    return($curFrame ? $curFrame->getTag() : undef);
}

# Value of attribute $aname on the innermost open element.
sub YMLParser::getCurrentAttr {
    my ($self,$aname) = @_;
    my $curFrame = $self->{theStack}->[-1];
    return($curFrame ? $curFrame->getAttr($aname) : undef);
}

# Ref to the flat (name, value, ...) attribute list of the innermost element.
sub YMLParser::getCurrentAttrArray {
    my ($self) = @_;
    my $curFrame = $self->{theStack}->[-1];
    return($curFrame ? $curFrame->getAttrArray() : undef);
}

# Language code in effect for the innermost open element.
sub YMLParser::getCurrentLang {
    my ($self) = @_;
    my $curFrame = $self->{theStack}->[-1];
    return($curFrame ? $curFrame->getLang() : undef);
}

# Ref to the list of namespaces declared on the innermost open element.
sub YMLParser::getCurrentNewNameSpaces {
    my ($self) = @_;
    my $curFrame = $self->{theStack}->[-1];
    return($curFrame ? $curFrame->{newNameSpaces} : undef);
}

# Ref to the list of child tags recorded on the parent of the innermost
# element (the innermost element and its siblings); undef without a parent.
sub YMLParser::getCurrentSiblingTagArray {
    my ($self) = @_;
    my $parentFrame = $self->{theStack}->[-2];
    return($parentFrame ? $parentFrame->{childTags} : undef);
}

# Ref to the list of attribute-array refs parallel to the sibling tag list.
sub YMLParser::getCurrentSiblingAttrArray {
    my ($self) = @_;
    my $parentFrame = $self->{theStack}->[-2];
    return($parentFrame ? $parentFrame->{childAttrs} : undef);
}

# Get any open element info object, from which you can then extract stuff.
# $n counts inward-to-outward: 1 is the innermost open element.
#
sub YMLParser::getElementInfo {
    my ($self,$n) = @_;
    return($self->{theStack}->[-$n]);
}


###############################################################################
# Namespace mapping. Note that only the *new* declarations are stored at
# each level (element), since ns prefixes can be overridden. So we search
# upward from the current element toward the root, and use the innermost
# declaration in effect.
#
# Return the default namespace in effect: the one set on the innermost open
# element that sets one, or "" if no open element does.
sub YMLParser::getDefaultNameSpace {
    my ($self) = @_;
    for (my $i=$#{$self->{theStack}}; $i>=0; $i--) {
        # Look at level $i, not always at the innermost element.
        my $dns = $self->{theStack}->[$i]->{dftNameSpace};
        if ($dns) { return($dns); }
    }
    return("");
}

# Return the URI bound to $prefix by the innermost declaration in effect,
# the default namespace for an empty prefix, or undef if it is not declared.
sub YMLParser::getNameSpaceUriFromPrefix {
    my ($self, $prefix) = @_;
    if (!$prefix) { return($self->getDefaultNameSpace()); }
    for (my $i=$#{$self->{theStack}}; $i>=0; $i--) {
        my $nss = $self->{theStack}->[$i]->{newNameSpaces};
        # {newNameSpaces} is a flat (prefix, uri, prefix, uri, ...) list.
        for (my $j=0; $j<scalar(@{$nss}); $j+=2) {
            if ($nss->[$j] eq $prefix) { return($nss->[$j+1]); }
        }
    }
    return(undef);
}

# Return the prefix bound to $uri by the innermost declaration in effect,
# "" for an empty $uri, or undef if no open element declares that URI.
sub YMLParser::getNameSpacePrefixFromUri {
    my ($self, $uri) = @_;
    if (!$uri) { return(""); }
    for (my $i=$#{$self->{theStack}}; $i>=0; $i--) {
        my $nss = $self->{theStack}->[$i]->{newNameSpaces};
        for (my $j=0; $j<scalar(@{$nss}); $j+=2) {
            if ($nss->[$j+1] eq $uri) { return($nss->[$j]); }
        }
    }
    return(undef);
}


###############################################################################
###############################################################################
# Move to SchemaManager.pm?
#
package ElementDcl;

# Create the declaration record for element type $ename with content
# model $model.
sub ElementDcl::new {
    my ($class, $ename, $model) = @_;
    my $self = {
        ename      => $ename,       # Name of the element
        model      => $model,       # Content model
        attrs      => {},           # Attrs declared
        lastClose  => "",           # (older spelling, kept for existing users)
        lastClosed => "",           # Where did last instance end?
    }; # self

    bless $self, $class;
    return $self;
} # new

# Not yet implemented: always dies.
sub ElementDcl::getContentModel {
    my ($self) = @_;
    die "No getContentModel.\n";
}

# Declare attribute $aname with type $atype and default $adefault.
# Returns 1 on success; warns and returns 0 if it is already declared.
sub ElementDcl::addAttr {
    my ($self, $aname, $atype, $adefault) = @_;
    if (defined $self->{attrs}->{$aname}) {
        warn "Duplicate attribute definition";
        return(0);
    }
    # The attribute class in this file is AttributeDef.
    $self->{attrs}->{$aname} = AttributeDef->new($aname, $atype, $adefault);
    return(1);
}

# Return 1 if attribute $aname has been declared for this element, else 0.
sub ElementDcl::isAttrDefined {
    my ($self, $aname) = @_;
    if (!defined $aname) { return(0); }
    return((defined $self->{attrs}->{$aname}) ? 1 : 0);
}

# Not yet implemented: always dies.
sub ElementDcl::getAttrDefault {
    my ($self, $aname) = @_;
    die "No getAttrDefault.\n";
}

# Not yet implemented: always dies.
sub ElementDcl::isSequenceValid {
    my ($self, $ename, $childTypeArrayRef) = @_;
    die "isSequenceValid is not yet supported.\n";
}


###############################################################################
package AttributeDef;

# Create the record for one declared attribute: its name, declared type,
# and default. Nothing is validated yet; the notes below list the checks
# that are intended.
sub AttributeDef::new {
    my ($class, $name, $type, $default) = @_;
    # xmlname($name)
    # m/^(ID|IDREFS?|ENTITY|ENTITIES|CDATA|NAMES?|NMTOKENS?)$/
    # qlit
    my $self = {
        name    => $name,
        type    => $type,
        default => $default,
    };
    bless $self, $class;
    return($self);
}

1;


###############################################################################
###############################################################################
###############################################################################
#

=pod

=head1 Usage

  use YMLParser;

Parse a string as XML, but handle non-WF and YML extensions.

This is especially useful for dealing with natural language corpora,
since quite a number of them claim to be in XML, but aren't.

This is I<not> a conforming XML parser, though very similar. It should
produce the same results as a conforming SAX parser for any well-formed XML
document. But it will also survive a variety of errors and correct some,
and it supports some optional extensions (more below), some of which address
XML's oft-criticised verbosity for tabular structures.

It can do either "push" or "pull" parsing.
The event structure is almost identical to the SAX interface used
(for example) by XML::Parser (see L<XML::Parser>), though you can also
(optionally) get events for individual attributes, and errors are returned
as additional events.

=head2 Why/how this is not a real XML parser

For good reasons, the XML standard requires that a conforming XML parser
terminate if it finds a well-formedness error. If you have "sort of XML" data
(for example, if some end-tags are missing, or some attributes are not quoted,
etc), this means you can't use a conforming XML parser to clean it up.
You I<can> use a standalone clean-up program such as I<tidy>;
or you can use I<YMLParser>.

=head1 Error-recovery special behaviors

=over

=item * Multiple root-level elements and text outside the root element(s)
are allowed.

=item * End-tags for non-open elements are ignored (a warning is issued).

=item * Element and attribute names that start with [.-\d] are allowed.

=item * Attribute lists that can't be figured out are at least survived.

=item * "&" or "<" characters that don't begin WF XML markup constructs
will be returned as text.

=item * Late, case-varying, repeated, and/or otherwise erroneous XML
declarations are allowed. However, the input I<must> be UTF-8; encoding
declarations are ignored; consider converting the input first (for example
with C<iconv>) if needed.

=item * Unknown named entity references are allowed, and treated as text.

=item * There is an option to not expand any entity references at all.

=item * You can set up your own entity definitions and attribute defaults,
or HTML's set, regardless of DTD.

=item * Marked-section ends ("]]>") that are not in the scope of a CDATA
marked section, are reported and treated as text.

=item * Although it does not (yet) parse external DTDs, it does parse
DOCTYPE internal subsets (and is not picky about the '[' and ']').

=back

=head1 Handy special API features

=over

=item * Optionally, it can expand entities within the content of
Processing Instructions.

=item * Optionally, it can parse and return content of Processing Instructions that (like XML declarations) follow the syntax of XML attribute specification lists. =item * Optionally, the caller can get attributes as separate "Attr" events after a Start event, instead of packed into additional arguments to the Start handler. =item * The previous two features can be used together, so that the pseudo-attributes within a PI can be returned as separate events. =back =head1 Short attribute special behaviors These are enabled via the "shortAttrs" option. =over =item * Unquoted attributes are ok if they only contain XML NAME characters. They need not be XML NAMES, and thus may begin with a digit. For example, = . =item * SGML/HTML style bare-NAME-token attributes will be accepted, and the value is made the same as the name. For example, =
. =item * Boolean-valued attributes can be shortened to "+" or "-" and their name. For example, = . =back =head1 YML special behaviors If you set the "yml" option, special syntax is supported for making more compact files, especially in the case of repetitive or tabular data. YML markup is only recognized in content, not within a tag, PI, comment, etc. A YML parser handles any WF XML document in the normal way. However, an XML parser will I correctly handle YML documents (well, if they use any of the YML-specific constructs). By design, if you use an XML parser on a YML document by mistake, it will not get it wrong; instead, it will reliably terminate with a Well-Formedness error. This prevents the confusion that would result if a YML document could be parsed successfully but incorrectly by an XML parser. The special YML syntax features are: =over =item ymlOmitEnd (I) Missing end-tags are provided when an outer end-tag or EOF is found, in the same way as the SGML OMITTAG feature works for end-tags. This is illegal in XML. =item ymlEmptyEnd ("") This is just the "empty end tag" syntax from SGML. It closes the current (innermost) open element, regardless of type. This and I are the only YML constructs that do not involve a non-XML control character. However, this is still illegal in XML. =item ymlUp (0x15, d21, NAK, ^U) I has the same effect as "", but via a single character. Mnemonic: ^U for "Up". =item ymlReopen (0x12, d18, DC2, ^R) Closes the current element and then opens a new element of the same type. The new element is said to be "based on" the one that was current just before the I character was encountered. The new element gets all the same attributes as the one it is based on, except for any that are overridden using I...I. Mnemonic: ^R for "Reopen". 
=item ymlQuad (0x11, d17, DC1, ^Q) This is mainly intended to save a great deal of space versus XML in tables, and in other structures where many successive siblings (like tr) have the same sequence of types for their child elements (like td). Mnemonic: I is probably most important for tabular data: so ^Q for "Quadrille", as in quadrille-ruled (graph) paper; or "Quadding", a typography term related to horizontal alignment of text. I enables table markup very much like MediaWiki: ||cell|cell|cell... If all of the cells are marked up the same way (for example, if all are I or
John Doe 555-1234 FL
Jane Buck 999-1235 MA
I handles such cases by basing the new element on the I child element of the I, instead of on the I child element of the I. The I child element is the one which has the same child element number (position in order), as the new element about to be opened will have. A preceding container must be the immediately-preceding sibling of the current container, and of the same element type. If there is no such element, then I behaves exactly the same as I. Or, the I I or similar element can have its children marked up in full form or with I. For example, consider a context like the example above, except that a I occurs immediately after the text C<999-1235>, instead of C<< ... >>. At that point: the I is the I the I is the third child element of its parent the I is the second I
the I is the first I
element Text nodes don't count. The I closes the current (I) element. Then it opens a new element, which will of course be the I child of its parent. The new element is based on the corresponding (fourth) child element of the I, and so will be a I element as desired (with any attribute that the earlier I element may have had -- in this case none). This exact algorithm wouldn't let you use ymlQuad to start the I I
(or similar), because the I would be current, rather than a I. YML closes it and re-opens it (essentially like I), and then additionally opens a first element child of that new element, based on the first child of the element that it just closed (which it assumed functions like a I). =head2 ymlQuad algorithm For example, in a table the first row could tag all its children (say, as TD, TH, and/or other types). All following rows can just use the I character to separate their fields, much like "|" is used in many Wiki markup systems (they must still open the I field in each row explicitly). This reduces the overhead for tabular or other highly-repetitive structures to essentially the same as in the ever-popular "CSV" file: 1 byte per field. It also makes the data self-documenting (in the manner of CSV files with "header" records), and allows more flexibility because any deviations from total regularity can be handled, merely by using full XML-style tags instead of I for the exceptions, and then going back to I again. A YML parser keeps track of the sequence of element types (and attributes, including namespace attributes) for the already-seen children of each open element, I for each open element's preceding-sibling element (if any). Taking HTML table markup as a simple example, the I character may be used within the children (cells) of any but the first TR. If it is, then the parser: =over =item 1: figures out the current element's I, that is, its number of preceding-sibling elements plus 1 (PIs, comments, and text nodes do not count). Call that number I. =item 2: finds the parent's nearest preceding-sibling element. Call that element PRE. If there is no such element, a YML Well-formedness error is signaled. =item 3: looks up the element type and attributes of the Ith child of PRE. If there is no such element, a YML Well-formedness error is signaled. If there is, that element is the one that the new element will be based on. 
=item 4: checks for I<ymlStartAttributes> immediately following the I<ymlQuad>, and if found, parses the subsequent attribute list as described below. =item 5: closes the current element (perhaps it is a TD). =item 6: opens the new element. =back B<Note>: I<ymlQuad> can't start the I<first> cell in a row, because it is defined to close the current element and then open something. Suggestions for an improved semantic are nevertheless welcome. Likewise, a single ^U at the end of a row will close the cell, not the row. To close both, use (for example) ^U^U or ^UE<lt>/E<gt> or E<lt>/E<gt>E<lt>/E<gt> or ^U^R or similar. Just be sure to close them both. I<ymlQuad> also doesn't do anything special in case of things like row or column spans, so a row whose cells don't correspond neatly to the prior row, should typically be tagged in full (this doesn't seem that bad to me, because it is a case specific to tabular I<layout>, that doesn't arise in tabular or relational I<data>). =item ymlStartAttributes (0x13, d19, DC3, ^S) This may occur only as the very next character after I<ymlReopen> or I<ymlQuad> (not even whitespace between). It indicates the start of an attribute list for the element which the I<ymlReopen> or I<ymlQuad> will open. The attribute list must end with the I<ymlTerminateAttributes> character. Attributes specified in this attribute list override any of the same name on the element the new element is I<based on>. Mnemonic: ^S for "Start Attributes". =item ymlTerminateAttributes (0x14, d20, DC4, ^T) This may only occur as described under I<ymlStartAttributes>, to mark the end of an attribute list for an element to be opened due to I<ymlReopen> or I<ymlQuad>. Mnemonic: ^T for "Terminate Attributes" (or just "the character after ^S"). =back =head1 NOTES =head2 Example document (control characters are shown here as ^Q...^T)

A paragraph^U

Another paragraph^U

Para 1, reopen: ^RPara 2, reopen: ^RPara 3, up:^U

elements with the same (or no) attributes, then you can get this effect using I as already described. However, (in effect) copying the start-tag of the current element isn't good enough in general. For example:
2011: $12 Green
2012: $1200 Orange
in a I
(and therefore would be closed, which is not the idea). Therefore, I behaves specially when it is doubled at the start of a line, in shameless imitation of the MediaWiki convention. In that case the sequence of a line break and two Is signals that the current element is functioning like a I
cell1 cell2 cell3
cell10^Qcell20^Qcell30^U
cell11^Qcell21^Sid="12" class="middle"^T^Qcell31^U ^Rcell12^Qcell22^Qcell32^U^U
=head2 Example code (pull) use YMLParser; $fp = new YMLParser(); $fp->addText($myText); while (my $e = $fp->nextEvent()) { $e =~ s/^(.*?)\t//; my $type = ($1) ? $1:"????"; if ($type eq "Start") { ... } elsif ($type eq "End") { ... } elsif ($type eq "Char") { ... } else { ... } } =head2 Example code (push) use YMLParser; $fp = new YMLParser(); $fp->setHandlers( { "Start" => \&myStartHandler, "End" => \&myEndHandler, "Char" => \&myCharHandler, "ERROR" => \&myErrorHandler, } $fp->parsestring($myText); =head1 Methods =over =item * B() =item * B() Identifies this particular implementation, as "YMLparser.pl". =item * B() Clear any document-specific state (keep declared entities, etc). =item * B(I) Change the named YML delimiter to a new character. If I is undefined, reset it to its default. Accepted names are: I, I, I, I, I. The main use for this is probably to change I to C<|> like MediaWiki table markup. =item * B(I) Options available include: =over I -- (integer) issue various messages to STDERR. I -- (Boolean) after a I event, return a separate I event for each attribute (if any), and then an I event. I -- (Boolean) treat the contents of processing instructions as a list of (pseudo-) attributes, and return them in the same manner as real attributes are returned with I, as separate events after the PI. I -- (Boolean) if turned off, entities will be returned as I events with a single argument, which is the original form of the entity (or numeric character) reference. I -- (Boolean) if set, the content of PIs (but not the I) will be parsed in search of entity and character references, which will be expanded. I -- enables the short attribute features described above. I -- enables the YML features described above. =back =item * B(path) Add I to the end of the list of directories, in which to search for external entities. First added, is first searched. =item * B(hash) Like the corresponding method in XML::Parser. 
I<hash> maps SAX event names to the Perl procedures to be called when each one happens (see above for a list and an example). There are a few extra events, most notably ERROR, and (optional) separate events for attributes. =item * B<setNoNest>(I) Assert that elements of type I may never occur within elements of type I (directly or indirectly). Thus, if the parser sees a I element, it will first force all open I elements closed. Repeatable (but not yet working). =item * B<addText>(I<text>) Append the text to the buffer to be parsed. =item * $s = B<getText>() Return the current contents of the buffer to be parsed (that is, any pending text that has not yet been parsed). This does not clear the buffer; to do that use I(). =item * B() Remove any remaining text in the parse buffer (the text is returned). =item * B<nextEvent>() Parse off the next XML construct from the parse buffer, removing it. The event will be returned as a reference to an array, whose elements are essentially the same as the arguments passed to a callback when using Perl C<XML::Parser>, except that one extra argument is inserted, which is the event type name (the same as you would pass to XML::Parser to register a handler); this argument is [1], after [0] which is a reference to the YMLParser object itself. The rest of the arguments depend on the event type, and are the same as with C<XML::Parser>. =item * B<parsestring>(I) Parse the string I and call event handlers (see I<setHandlers>). =item * B(I) Parse the file at I and call event handlers (see I<setHandlers>). If the path ends with C<.zip>, I() will try to use the first member of the zip file transparently. (experimental) =item * B(I) Parse the string I and return a reference to an XML::DOM structure. (experimental) =item * B(I) Parse the file I and return a reference to an XML::DOM structure. (experimental) =item * B(I) (experimental) Define a default value (similar to what you can do with an ATTLIST declaration in an SGML or XML DTD), for a particular attribute.
With SGML (but not XML, if I remember right), a default value can be associated with a named attribute for a separate element type, or shared across the same-named attributes on multiple element types, such as I and I here: B, with this parser you cannot share a default like that. =item * B(I) Define the entity I to the given (string) value. If I() is in effect, defining an entity via overrides any HTML entity of the same name. Entities that refer to files, URIs, etc. are not yet supported. =item * B(I) Determine whether to recognize the standard HTML named character entities. They are off by default; this calls turns them on unless I is present and has the value 0 (if I is entirely absent, they are turned I!) Individual entities can be overridden via I(). =item * B(I) Determine whether to recognize the five XML built-in entities. They are on by default; this calls turns them on unless I is present and has the value 0 (if I is entirely absent, they are turned I!) These entities cannot be overridden when on. =item * B(I) This is the internal method called when a start (or empty) tag is parsed. It stacks the element, xml:lang, and some other information, then issues the start-element event (and if the I option is set, possibly also some number of following attribute events). =item * B(I) This is the internal method called when a end tag is parsed. It pops the information that I pushed. =item * B() Returns the number of open elements. =item * B(type) Returns true if at least one element of the specified I is open. =item * B(typelist) Returns the index into the stack of open elements (0 is the document element), of the outermost (largest) instance of any of the types listed in the string I (types must be separated by whitespace). This is used internally to implement the I feature, since everything out to this index has to be closed on finding a conflict. =item * B() Returns the list of all open element types, separated by "/". 
=item * B<getCurrentGI>() Returns the innermost/current open element type. =item * B<getCurrentLang>() Returns the innermost/current/inherited value of I<xml:lang>. =item * B<getCurrentNsList>() Returns a reference to a hash that maps all currently-defined namespace prefixes to URIs (this includes all namespaces defined on the current element or any of its ancestors). =item * B<getDefaultNameSpace>() Return the current default NameSpace URI. If none, return "". =item * B(I) Return the XML namespace prefix currently in effect, that maps to the given I. If there are several, it is undefined which one will be returned; if none, return "". =item * B(I) Return a string containing the URI currently assigned to the given XML namespace I; if none, return "". =back =head1 SAX event types These are the same as for Perl's C<XML::Parser> package (see CPAN), plus I and the optional I and I. The names shown below are the keys to use when passing a hash to I<setHandlers>(). Each key's data should be a reference to the Perl procedure to be called when the event occurs; it will be passed the parameters shown. =over =item B(I) The very start of parsing. =item B(I) The very end of parsing. =item B(I) The start of an element. However, if the I option is set, then I arguments will not be provided. Instead, the attributes will be provided via separate I events, and after all the I events (whether or not there are any), an I event is provided. =item B(I) If you set the I option, then instead of attributes being packed into the I event, they will follow it, one event per attribute, in alphabetical order. An I event will then follow to mark the end of attributes (that event always happens, even if there are no attributes on a particular start-tag). See also the I option, and the I and I events. =item B(I) Marks the end of the attributes for a given start-tag, if the I option is set. See I for the analogous event when I is set. =item B(I) The end of an element. =item B(I) Text content. B<Note>: There is no guarantee that a I<Char> event will not be followed immediately by another I<Char> event.
=item B(I) =item B(I) This event occurs if a significant error is found in parsing. The intent is that all XML and YML Well-Formedness (WF) errors will be reported, but that is not yet complete. It is generally returned I the any event that occurs despite the error. For example, if an end-tag is found for an element that is not the innermost open element, an ERROR event is returned, followed by as many End events as it takes to close all the elements out to the one named in the end-tag that was actually found. I: Printable text for the message. =item B(I) Indicates the start of a CDATA marked section. Content will be returned via following I events. =item B(I) Indicates the end of a CDATA marked section. =item B(I) A processing instruction. However, if the I and I options are set, then I will not be returned. Instead, the I content will be parsed as if it were a list of attributes, and those attributes will be provided via I events (and finally a I event). =item B(I) If you set the I and I options, then a I event will be returned for each pseudo-attribute found within the content of a processing instruction. A I event will then follow. =item B(I) Marks the end of the pseudo-attributes for a given PI), when the I option is set. =item B(I) The XML declaration. =item B(I) The DOCTYPE declaration in the DTD. =item B(I) The end of the DOCTYPE declaration in the DTD. =back =head2 The following events are not yet fully supported =over =item B(I) An event for which no handler has been installed. =item B(I) A reference to an unparsed external entity (such as an image). =item B(I) An (unexpanded?) external entity reference. =item B(I) =item (all events representing markup declarations) B(I) B(I) B(I) B(I<...>) =back =head1 Known bugs and limitations This is intentionally B a conforming XML parser. 
It is intended for dealing with broken data and helping you fix it, somewhat like the extraordinarily useful C<tidy>; and for encoding highly-repetitive structures more compactly while remaining true to the ideas of XML. It may be better for I<ymlQuad> to require that the parent and aunt be of the same type (for example, successive table rows), thus allowing even the first cell in each row to be started with I<ymlQuad>. ??? The semantics of default attributes may not be completely correct. When using ymlQuad and ymlReopen, attributes are copied only from the I element (including any attributes which that element defaulted from the DTD). However, there is no way to "reset to default"; should there be? Parsing DTDs and external entities is unfinished (therefore, so is attribute-defaulting). Does not die on all WF errors, even some where perhaps it should. Gets unhappy with very long comments, PIs, etc. This is an outright bug (not hard to fix, I just haven't gotten around to it yet). Does not coalesce entities that produce text, or CDATA marked sections, into the surrounding text. So you can get extra text nodes. I<-canonical> is imperfect (see above). =head1 Related commands C and C do similar repairs, but don't (so far as I know) expose a regular SAX interface, and don't have the YML constructs. C<iconv> converts between countless character sets, so is excellent for getting input ready for I<YMLParser>. C provides an API similar to several functions implemented here; the ones here are specialized because to support YML (and specifically ymlQuad) we must keep track of extra state information, such as the list of previous sibling types and attributes for each still-open element. =head1 Ownership This work by Steven J.
DeRose is licensed under a Creative Commons Attribution-Share Alike 3.0 Unported License, with one additional restriction: B If a user agent is to incorporate this software, it must require the user to opt-in, not opt-out; that is, the user agent software must not use this software to "correct" non-WF XML, or to parse YML, unless the user has explicitly consented first. For further information on the CCLI license, see L. The author's present email is sderose at acm.org. For the most recent version, see L. =cut