#!/usr/bin/perl -w
#
# YMLParser.pm
#
# Written 2011-03-11 by Steven J. DeRose, sderose@acm.org.
#     Based on xmlparser and fakeparser.
# 2011-03-14 sjd: lookupEntity(). Empty elements generate ETAG events. Help.
# 2011-03-17 sjd: Start adding SAX API. Entities in attributes. Built-ins.
# 2011-05-13 sjd: More work on SAx API. Add isXmlName, addentity, reset,
#     getText, openElement, closeElement, getDepth, getFQGI, getCurrentGI,
#     getCurrentLang, defineEntity. Add setXmlEntities. Return array instead of
#     packed string from nextEvent. Generalize pendingEvents.
#     Break out parseAttributeString(); use HTML::Entities.
# 2011-05-25 sjd: getCurrentNsList(). A few bugs with pending events.
#     Add setNoNest(). Big CDATA MS's; DOCTYPE, XML Decl. Better queueing.
# 2011-06-02 sjd: Don't return the quotes on attribute values. Add support
#     for value-only attributes like HTML border. Improve DOCTYPE parsing.
#     Support -returnForm for pending events.
# 2011-07-16 sjd: Start implementing parseToDOM. Fix bugs in entity-handling.
# 2011-07-25 sjd: Add -entitiesInPIs for entity expansion inside PIs. Debug.
#     Don't let text-matching eat YML chars. Swap ^R and ^Q.
# 2011-07-26ff sjd: Bunch of debugging. Pull in and packagize EntityManager.
#     Separate short-attribute features and make optional. Add ymlElementInfo.
#     Option to split pseudo-attributes out of PI content. Add packages
#     for ElementDcl, AttlistDcl, NotationDcl. Support lastClosed. Fix reopen.
# 2011-08-03 sjd: Still fixing ymlQuad. Move penultimate element items into
#     element object. Merge base and yml attrs right.
# 2011-08-04 sjd: Track/return default namespace, add getDefaultNameSpace().
#     Return prefix and URI on Attr events. Add ProcAttr, ProcAttrFin events.
# 2011-09-08 sjd: Catch some more WF errors. Normalize to LF in input.
#     Start on parameter entity support.
# 2012-05-18 sjd: Split EntityManager to external package.
# 2013-01-21 sjd: Finish switching to external EntityManager. Start Attribute.
# 2013-06-19 sjd: Change ymlQuad to handle first row / first col better.
#     Add changeDelimiter(). Ditch "PREFIX" form. Add nextEventArray().
#     Start on regex precompiling.
#
# To do:
#     Finish compiling regexes....
#     Attribute defaults (integrate ElementManager.pm).
#     Long comments, PIs, etc. Just keep fetching til done.
#     Finish parseToDom methods
#     Handle XML decl 'encoding'
#     Allow <[tag]> and <{tag}> to signal array and hash semantics?
#     External ent can start w/ XML Decl (w/ charset!)
#     Finish ATTLIST, NOTATION, etc. dcls
#     Add rabbit-duck support?
#     Mod CDATA to be: <![CDATA:anything[ ... ]anything]>
#
# Low priority:
#     Support local/cumulative IDs
#     Swap order of parseAttributeStringToArray and parseAttributeString?
#     Options re. parameter entity expansion?
#     Factor errors to method to both queue and warn?
#     Support Stax API?
#     Test reading straight from zip files
#     Return more WF errors
#         PE refs in markup dcls in internal subset
#         PE replacement text must match extSubsetDecl
#         Ref to unparsed entity (but can be named in ENTITY(IES) attrs)
#         attrs can't refer to external ents
#         Synchronous entities
#     Split out Element/Attribute Managers?
#
use strict;
use XML::DOM;
use HTML::Entities;
# http://search.cpan.org/~adamk/Archive-Zip-1.30/lib/Archive/Zip/MemberRead.pm
#use Archive::Zip;
#use Archive::Zip::MemberRead;

use EntityManager;
#use ElementManager;

our $VERSION = "0.9";

###############################################################################
###############################################################################
# Maintain various information about a single YML element instance.
#     Cf ElementManager.pm.
#
# For YML, esp. ymlQuad, we need more information than in XML. If you're in
# a TR's 10th TD, you need to still have the *previous* TR's 10th child too
# (or at least that child's type and attributes).
#
# Therefore, as long as an element is on the stack we need its prior sibling
# object, which in turn keeps its children's types and attrs. So every
# element object has arrays for {childTags} and {childAttrs}.
# When we pop an element off the stack we don't just discard it. Instead, we
# put a reference to it in the parent element object as {penultimate}.
# When we close an Element, it's no longer the current child, so it moves over.
#
package ymlElementInstanceInfo;

# Constructor: create the bookkeeping record for one element instance.
#   $class        - package to bless into
#   $tag          - the element type name
#   $attrArrayRef - reference to a flat (name1,value1,name2,value2,...) list
# Returns the new blessed object.
#
sub ymlElementInstanceInfo::new {
    my ($class, $tag, $attrArrayRef) = @_;

    # Identity and inherited/declared properties of this element.
    my %own = (
        tag             => $tag,           # What XML element type?
        attrs           => $attrArrayRef,  # (name1,value1,name2,value2,...)
        lang            => "",             # Inherited xml:lang code
        newNameSpaces   => [],             # nss declared *here*
        dftNameSpace    => "",             # (only if set here)
    );

    # Child tracking, needed mainly for ymlQuad: the full node is kept for
    # the preceding element sibling, plus just the types and attributes
    # of every element child.
    my %children = (
        penultimate     => undef,          # Ref to 2nd-last child
        childTags       => [],             # Tags of all the element children
        childAttrs      => [],             # Attrs of same
    );

    return bless({ %own, %children }, $class);
}

# Return the element type name stored in {tag}.
sub ymlElementInstanceInfo::getTag {
    my $self = shift;
    return $self->{tag};
}
# Store $tag as this element's type name (overwrites {tag}).
sub ymlElementInstanceInfo::setTag {
    my $self  = shift;
    my ($tag) = @_;
    $self->{tag} = $tag;
}

# Return the reference held in {attrs}: the flat name/value list itself,
# not a copy.
sub ymlElementInstanceInfo::getAttrArray {
    my $self = shift;
    return $self->{attrs};
}

# Look up one attribute by name in the flat (name,value,...) list.
# Returns the value paired with the first matching name, or undef if the
# name is not present.
sub ymlElementInstanceInfo::getAttr {
    my ($self, $aname) = @_;
    my $pairs  = $self->{attrs};
    my $tokens = @{$pairs};
    my $at     = 0;
    while ($at < $tokens) {
        return $pairs->[$at+1] if $pairs->[$at] eq $aname;
        $at += 2;                       # names sit at the even positions
    }
    return undef;
}
# Set attribute $aname to $avalue.
# If the name is already present in the flat (name,value,...) list its
# value slot is overwritten; otherwise the pair is appended.
# An object built without an attribute list gets an empty one first.
sub ymlElementInstanceInfo::setAttr {
    my ($self, $aname, $avalue) = @_;
    $self->{attrs} ||= [];
    my $attrs = $self->{attrs};
    my $nAttrTokens = scalar(@{$attrs});
    for (my $i=0; $i<$nAttrTokens; $i+=2) {
        if ($attrs->[$i] eq $aname) {
            # The value lives one slot after its name; writing to [$i]
            # would replace the name itself.
            $attrs->[$i+1] = $avalue;
            return;
        }
    }
    push @{$attrs}, $aname, $avalue;
}
# Remove attribute $aname (first occurrence) and its value from the flat
# (name,value,...) list. Does nothing if the name is not present or the
# object has no attribute list.
sub ymlElementInstanceInfo::delAttr {
    my ($self, $aname) = @_;
    my $attrs = $self->{attrs};
    ($attrs) || return;
    my $nAttrTokens = scalar(@{$attrs});
    for (my $i=0; $i<$nAttrTokens; $i+=2) {
        if ($attrs->[$i] eq $aname) {
            # splice closes the gap; 'delete' on array elements would leave
            # undef holes and break the name/value pairing for later scans.
            splice(@{$attrs}, $i, 2);
            return;
        }
    }
}

# Return the language code stored in {lang}.
sub ymlElementInstanceInfo::getLang {
    my $self = shift;
    return $self->{lang};
}
# Store $lang as this element's language code (overwrites {lang}).
sub ymlElementInstanceInfo::setLang {
    my $self   = shift;
    my ($lang) = @_;
    $self->{lang} = $lang;
}

# Record a namespace declaration made on this element.
# {newNameSpaces} is a flat (prefix,uri,prefix,uri,...) list: an existing
# prefix gets its URI replaced, a new prefix is appended with its URI.
sub ymlElementInstanceInfo::setNameSpace {
    my ($self, $abbr, $uri) = @_;
    my $declared = $self->{newNameSpaces};
    my $tokens   = @{$declared};
    my $at       = 0;
    while ($at < $tokens) {
        if ($declared->[$at] eq $abbr) {
            $declared->[$at+1] = $uri;
            return;
        }
        $at += 2;
    }
    push @{$declared}, $abbr, $uri;
}

# Note a new element child of this element.
#   $tag      - child's element type; a false value is stored as "_UNKNOWN"
#   $attrsRef - reference to the child's attribute list (may be false)
# Dies if $attrsRef is true but not a reference, or if the two parallel
# child lists end up with different lengths.
sub ymlElementInstanceInfo::appendChild {
    my ($self, $tag, $attrsRef) = @_;
    $tag ||= "_UNKNOWN";
    die "Non-ref passed for attrs to appendChild.\n"
        if ($attrsRef && !ref($attrsRef));

    my ($tags, $attrs) = ($self->{childTags}, $self->{childAttrs});
    push @{$tags},  $tag;
    push @{$attrs}, $attrsRef;

    # The lists are parallel: entry N of each describes the same child.
    die "tags and attrs are out of sync.\n"
        if (scalar(@{$tags}) != scalar(@{$attrs}));
} # appendChild


###############################################################################
###############################################################################
#
package YMLParser;

# Character class (as regex source text) holding the control characters
# chr(1)..chr(31), minus TAB (9), LF (10) and CR (13).
my $badControls =
    "[" .
    join("", map { chr($_) } grep { $_ != 9 && $_ != 10 && $_ != 13 } (1..31)) .
    "]";

# Bind control characters for the special YML shorthand (cf changeDelimiter())
#
my $ymlQuad                = chr(17);  # ^Q DC1
my $ymlReopen              = chr(18);  # ^R DC2
my $ymlStartAttributes     = chr(19);  # ^S DC3 (after ymlReopen or ymlQuad)
my $ymlTerminateAttributes = chr(20);  # ^T DC4 (after ymlStartAttributes)
my $ymlUp                  = chr(21);  # ^U NAK

my $ymlChars = "$ymlUp$ymlReopen$ymlQuad"; # Not the attr ones!

# Compiled regexes
#
# XML delimiters. Each of these is anchored with '^', so it only matches
# at the very start of the string it is applied to; that also means the
# closers cannot be interpolated after other pattern text (see $Xpi/$Xcom).
my $como         = qr/^<!--/;
my $comc         = qr/^-->/;
my $mdo          = qr/^<!/;
my $pio          = qr/^<\?/;
my $pic          = qr/^\?>/;            # PI close is '?>' (was a copy of $pio)
# NOTE: $qlit carries its own capture group, so wrapping it (or $qs) in
# parentheses yields TWO consecutive groups holding the same literal.
my $qlit         = qr/('[^']*'|"[^"]*")/;
my $qs           = $qlit;
my $xname        = "[-_.:\\w\\d]+";

# Document instance
my $XxmlDecl     = qr/^<\?(XML) (.*?)\?>/;
# The closers are spelled out here: interpolating the '^'-anchored $pic or
# $comc after "(.*?)" produces a pattern that can never match. /s lets the
# content span lines.
my $Xpi          = qr/^<\?(.*?)\?>/s;
my $Xcom         = qr/^<!--(.*?)-->/s;
my $XstartTag    = qr/^<($xname)(\s.*?)?(\/?>)/;
my $XendTag      = qr/^<\/(.*?)\s*>/;
my $XymlChars    = qr/^([$ymlChars]|<\/>)/;
my $XcdataStart  = qr/^<!\[CDATA\[/;
my $XcdataEnd    = qr/^]]>/;

# Entity refs
my $XhexRef      = qr/^(&#x[\da-f]+;)/;
my $XdecRef      = qr/^(&#\d+;)/;
my $XentRef      = qr/^(&$xname;)/;
my $XpentRef     = qr/^%($xname;)/;

# Markup declarations
my $XdoctypeO    = qr/^<!DOCTYPE\s+/;
# Groups here: 1=name 2=keyword 3,4=first literal 5,6=second literal 7=closer
# (the doubled groups come from $qs, see the note on $qlit).
my $XsydIdent    = qr/($xname)\s+(PUBLIC|SYSTEM)?\s*($qs)\s*($qs)?\s*(\[\s*|>)?/;
my $XsysIdent2   = qr/^(SYSTEM|PUBLIC)\s*$qlit\s*$qlit?/;
my $Xnotation    = qr/^\s*NOTATION\s+(\w+)/;
my $XdclElement  = qr/^<!ELEMENT\s+($xname)\s+(.*?)\s*>/;
my $XdclEntity   = qr/^<!ENTITY\s+(%\s+)?($xname)\s+(.*?)\s*>/;
my $XdclAttlist  = qr/^<!ATTLIST\s+($xname)\s+(.*?)\s*>/;
my $XdclNotation = qr/^<!NOTATION\s+($xname)\s+(.*?)\s*>/;

# Based on XML::Parser (cf SAX). See also tupleSets/, fakeParser.
#
# Table of the event type names known to this parser, each paired with a
# four-character code. setHandlers() accepts exactly these keys.
my %eventNames = (
    # Whole parse
    "Init"         => "RUN+",
    "Final"        => "RUN-",

    # Document instance
    "Start"        => "ELM+",
    "End"          => "ELM-",
    "Char"         => "CHAR",
    "Proc"         => "PINS",
    "Comment"      => "COMM",
    "CdataStart"   => "CDT+",
    "CdataEnd"     => "CDT-",
    "Default"      => "DEFT",

    # Entities
    "Unparsed"     => "UNPA",
    "ExternEnt"    => "EXT+",
    "ExternEntFin" => "EXT-",

    # Markup declarations, from the DTD
    "Entity"       => "ENT:",
    "Element"      => "ELM:",
    "Attlist"      => "ATT:",
    "Notation"     => "NOT:",

    # Prolog
    "Doctype"      => "DOC+",
    "DoctypeFin"   => "DOC-",
    "XMLDecl"      => "XDCL",

    # Extensions
    "Attr"         => "ATT+",
    "AttrFin"      => "ATT-",
    "ProcAttr"     => "PI@+",
    "ProcAttrFin"  => "PI@-",

    # Problems found while parsing
    "ERROR"        => "ERRR",
);

# Constructor: build a parser with default options, a fresh EntityManager,
# no text loaded, and a single "Init" event already queued.
#
sub YMLParser::new {
    my ($class) = @_;

    my %options = (
        verbose        => 0,
        attrEvents     => 0,         # Return separate event per attribute?
        normalize      => 0,         # Normalize white space?
        useHtmlEnties  => 1,         # Support HTML named entities?
        useXmlEnties   => 1,         # Support the 5 XML built-ins?
        expandEntities => 1,         # Handle entities transparently?
        entitiesInPIs  => 0,         # Expand ent. refs in PIs?
        attrsInPIs     => 0,         # Treat PI content as pseudo-attributes?
        coalesce       => 0,         # Never return adjacent text nodes?
        yml            => 0,         # Support my YML proposal?
        shortAttrs     => 0,         # Allow special short-attribute forms?
        defaultLang    => "EN",      # In case no xml:lang on document element

        cantGoInside   => {},        # {child} => disallowed parent(s).
    );

    # Markup declaration stuff
    my %declarations = (
        entManager     => EntityManager->new(),
        elementDcls    => {},
        notationDcls   => {},
    );

    my %state = (
        # Data to be worked on
        theText        => "",            # Text being parsed
        pendingEvents  => [ [ "Init" ] ],# Events waiting to be returned

        # Parse state
        theStack       => [],        # A stack of ymlElementInstanceInfo objects
        inCDATA        => 0,         # Are we inside a CDATA M S?
        inDoctype      => 0,         # Are we inside the DOCTYPE?
        seenTag        => 0,         # Have we seen a tag yet?

        current_file   => "",        # Where we are
        current_line   => 1,

        handlers       => {},        # Event callbacks for 'push' parsing
    );

    my $self = {
        version        => "2013-06-19",
        %options,
        %declarations,
        %state,
    };
    return bless($self, $class);
} # new

# Emit a diagnostic with warn(), prefixed with the package name.
# $level is accepted but not consulted here.
sub YMLParser::yMsg {
    my $self  = shift;
    my $level = shift;
    my ($msg) = @_;
    warn("YMLParser: $msg\n");
}

# Return a short identification string that includes {version}.
sub YMLParser::identifier {
    my $self = shift;
    my $id = "YMLparser.pl $self->{version}";
    return $id;
}

# Return the parser to its just-constructed parse state: no text, the
# stack reset via resetStack(), one "Init" event queued, flags cleared,
# and the file/line position rewound. Options, handlers and declarations
# are not touched here.
sub YMLParser::reset {
    my ($self) = @_;

    $self->{theText} = "";
    $self->resetStack();
    $self->{pendingEvents} = [ [ "Init" ] ];

    @{$self}{qw(inCDATA inDoctype seenTag)} = (0, 0, 0);

    $self->{current_file} = "";
    $self->{current_line} = 1;
} # reset

# Rebind one of the YML shorthand delimiters.
#   $name - which delimiter: ymlQuad, ymlReopen, ymlStartAttributes,
#           ymlTerminateAttributes or ymlUp
#   $v    - the new delimiter; a false value restores the built-in default
# Dies on an unknown $name. The delimiters are file-level variables, so a
# change applies to every parser object, and $ymlChars is rebuilt after
# each change.
sub YMLParser::changeDelimiter {
    my ($self, $name, $v) = @_;

    # name => [ where it is stored, built-in default ]
    my %slotFor = (
        "ymlQuad"                => [ \$ymlQuad,                chr(17) ],
        "ymlReopen"              => [ \$ymlReopen,              chr(18) ],
        "ymlStartAttributes"     => [ \$ymlStartAttributes,     chr(19) ],
        "ymlTerminateAttributes" => [ \$ymlTerminateAttributes, chr(20) ],
        "ymlUp"                  => [ \$ymlUp,                  chr(21) ],
    );

    my $slot = $slotFor{$name};
    if (!$slot) {
        die "Unknown delimiter '$name'.\n";
    }
    my ($storage, $default) = @{$slot};
    ${$storage} = $v || $default;

    $ymlChars = "$ymlUp$ymlReopen$ymlQuad"; # Not the attr ones!
}

# Set a named parser option to $v.
# Accepts every option that new() initialises, plus the names 'div' and
# 'h', which are kept only because earlier versions accepted them.
# Dies with "Bad option name" for anything else.
sub YMLParser::setOption {
    my ($self, $optionName, $v) = @_;
    my %settable = map { $_ => 1 } qw(
        verbose attrEvents normalize
        useHtmlEnties useXmlEnties expandEntities
        entitiesInPIs attrsInPIs
        coalesce yml shortAttrs defaultLang
        div h
    );
    if (!defined $optionName || !$settable{$optionName}) {
        my $shown = (defined $optionName) ? $optionName : "";
        die "Bad option name '$shown'\n";
    }
    $self->{$optionName} = $v;
}

# Forward $path to the entity manager's appendEntityPath() and hand back
# whatever that call returns.
sub YMLParser::appendEntityPath {
    my ($self, $path) = @_;
    my $manager = $self->{entManager};
    return $manager->appendEntityPath($path);
}

# See http://search.cpan.org/~msergeant/XML-Parser-2.36/Parser.pm
#
# Install event callbacks for 'push' parsing.
#   $hhRef - hash reference mapping event type name => code reference
# Each name must be a key of %eventNames; the first unknown name dies
# (names processed before it remain installed).
sub YMLParser::setHandlers {
    my ($self, $hhRef) = @_;
    my %requested = %{$hhRef};
    foreach my $eventType (keys %requested) {
        defined($eventNames{$eventType})
            or die "YMLParser: Unknown handler '$eventType'.\n";
        $self->{handlers}->{$eventType} = $requested{$eventType};
    }
}


# Declare that element type $child may not appear inside $parent.
# The disallowed parents for each child are kept in the {cantGoInside}
# hash as a single tab-separated string; calling this again for the same
# child appends another parent.
sub YMLParser::setNoNest {
    my ($self, $parent, $child) = @_;
    # {cantGoInside} is a plain hash member, not a method.
    my $curList = $self->{cantGoInside}->{$child};
    $self->{cantGoInside}->{$child} =
        (defined $curList && $curList ne "") ? "$curList\t$parent" : $parent;
}


# Load an entire file into the parse buffer, then run a 'push' parse,
# calling the registered handler for each event.
#   $file - path to read; a name ending in ".zip" is read (experimentally)
#           from the first member of the archive, which needs Archive::Zip.
# Dies if no handlers are registered. Returns undef if the file cannot be
# opened, otherwise the number of events processed. An event with no
# handler of its own goes to the "Default" handler if there is one, and
# is otherwise skipped.
sub YMLParser::parsefile {
    my ($self, $file) = @_;
    $self->{current_file} = $file;
    (scalar keys %{$self->{handlers}}) || die
        "No handlers set for YMLParser.\n";

    my $fh = undef;
    if ($file =~ m/\.zip$/) {
        # The 'use Archive::Zip' lines at the top of this file are commented
        # out, so load the modules only when a zip is actually requested.
        eval { require Archive::Zip; require Archive::Zip::MemberRead; 1; }
            || die "YMLParser: Archive::Zip is required to read '$file': $@";
        my $zip = Archive::Zip->new($file);
        ($zip) || return(undef);
        my @members = $zip->members();
        (scalar @members) || return(undef);
        $fh = Archive::Zip::MemberRead->new($zip,$members[0]);
        ($fh) || return(undef);
        $self->yMsg(0, "parsing directly from zip files is experimental.");
    }
    else {
        (open($fh, "<", $file)) || return(undef);
    }
    my $plainFile = (ref($fh) eq "GLOB") ? 1 : 0;

    my $rec;
    while (1) {
        if ($plainFile) { $rec = <$fh>; }
        # NOTE(review): the member reader's line method is believed to be
        # getline(); readline() is kept as a fallback -- confirm.
        elsif ($fh->can("getline")) { $rec = $fh->getline(); }
        else { $rec = $fh->readline(); }
        (defined $rec) || last;          # undef means end of input
        $self->addText($rec);
    }
    if ($plainFile) { close $fh; }
    elsif ($fh->can("close")) { $fh->close(); }
    $self->yMsg(0, "parsefile: loaded, starting parse, self: $self");

    my $nEvents = 0;
    while (my $epacked = $self->nextEvent()) {
        my @e = split(/\t/, $epacked);
        my $type = shift @e;
        if (!defined $type) {
            die "Bad event. " . $epacked . "\n";
        }
        my $hand = $self->{handlers}->{$type};
        if (defined $hand) {
            $hand->($self, @e);
        }
        elsif ($hand = $self->{handlers}->{"Default"}) {
            $hand->($self, @e);
        }
        $nEvents++;
        ($type eq "Final") && last;
    }
    return($nEvents);
}

# Append $text to the parse buffer and run a 'push' parse over it,
# calling the registered handler for each event.
# Dies if no handlers are registered. As in parsefile(), an event with no
# handler of its own goes to the "Default" handler if there is one, and is
# otherwise skipped. Returns the number of events processed.
sub YMLParser::parsestring {
    my ($self, $text) = @_;
    (scalar keys %{$self->{handlers}}) || die
        "No handlers set for YMLParser.\n";
    my $nEvents = 0;
    $self->addText($text);
    while (my $epacked = $self->nextEvent()) {
        my @e = split(/\t/, $epacked);
        my $type = shift @e;
        if (!defined $type) {
            die "Bad event. " . $epacked . "\n";
        }
        my $hand = $self->{handlers}->{$type};
        if (defined $hand) {
            $hand->($self, @e);
        }
        elsif ($hand = $self->{handlers}->{"Default"}) {
            $hand->($self, @e);
        }
        $nEvents++;
        ($type eq "Final") && last;
    }
    return($nEvents);
}

# Parse a file into a DOM. NOT YET IMPLEMENTED: this always dies.
# The code after the die is the staged implementation and cannot run
# until the die is removed: it loads $file into the parse buffer and
# delegates to parsestringtoDOM().
sub YMLParser::parsefiletoDOM {
    my ($self, $file) = @_;

    die "YMLParser: parsing file to a DOM is not yet implemented.\n";

    $self->{current_file} = $file;
    open(my $xf, "<", $file) || return(undef);
    while (my $rec = <$xf>) {
        $self->addText($rec);
    }
    close $xf;
    # The text is already buffered, so nothing is passed along.
    return($self->parsestringtoDOM());
} # parseFileToDOM

# Parse a string into a DOM. NOT YET IMPLEMENTED: this always dies on its
# first statement, so everything after the die is unreachable draft code.
#
# The draft pulls events one at a time and builds nodes under a "current
# element" pointer: Start descends into a new element, End climbs back to
# the parent, Attr sets an attribute on the current element, and Char,
# Comment and Proc append leaf nodes. All other event types are skipped.
#
# NOTE(review), to resolve before enabling the draft:
#   * $text is never added to the parse buffer.
#   * The loop dereferences the result of nextEvent(), which returns a
#     tab-joined string; nextEventArray() is the variant returning a ref.
#   * The "End" and "Notation" cases each appear twice; the second copy of
#     each can never be reached.
#   * The XML::DOM calls used here (makeElement, makeTextNode, ...) have
#     not been checked against that module's API -- confirm.
sub YMLParser::parsestringtoDOM {
    my ($self, $text) = @_;
    die "YMLParser: parsing string to a DOM is not yet implemented.\n";
    my $theDOM = new XML::DOM();
    my $currentElement = $theDOM->getDocumentElement();

    # Attributes are wanted as separate events so they can be attached
    # one at a time below.
    $self->{attrEvents} = 1;

    while (my @eventRef = @{$self->nextEvent()}) {
        my $eventType = $eventRef[0];
        if    ($eventType eq "Init")         { next; }
        elsif ($eventType eq "Final")        { last; }
        elsif ($eventType eq "XMLDecl")      { next; }
        elsif ($eventType eq "Doctype")      { next; }
        elsif ($eventType eq "DoctypeFin")   { next; }
        elsif ($eventType eq "Start")        {
            # Descend: the new element becomes the current one.
            my $newel = $theDOM->makeElement($eventRef[1]);
            $currentElement->appendChild($newel);
            $currentElement = $newel;
        }
        elsif ($eventType eq "Attr")         {
            $currentElement->setAttribute($eventRef[1],$eventRef[2]);
        }
        elsif ($eventType eq "AttrFin")      {
            next;
        }
        elsif ($eventType eq "End")          {
            # Climb back up to the parent.
            $currentElement = $currentElement->getParentNode();
        }
        elsif ($eventType eq "Char")         {
            my $newel = $theDOM->makeTextNode($eventRef[1]);
            $currentElement->appendChild($newel);
        }
        elsif ($eventType eq "Comment")      {
            my $newel = $theDOM->makeCommentNode($eventRef[1]);
            $currentElement->appendChild($newel);
        }
        elsif ($eventType eq "Proc")         {
            my $newel = $theDOM->makeProcNode($eventRef[1],$eventRef[2]);
            $currentElement->appendChild($newel);
        }
        elsif ($eventType eq "ProcAttr")     {
            # Intentionally empty: PI pseudo-attributes are not stored.
        }
        elsif ($eventType eq "ProcAttrFin")  {
            next;
        }
        elsif ($eventType eq "End")          {
            # Unreachable duplicate of the "End" case above.
            $currentElement = $currentElement->getParentNode();
        }

        elsif ($eventType eq "CdataStart")   { next; }
        elsif ($eventType eq "CdataEnd")     { next; }
        elsif ($eventType eq "Default")      { next; }
        elsif ($eventType eq "Unparsed")     { next; }
        elsif ($eventType eq "Notation")     { next; }

        elsif ($eventType eq "ExternEnt")    { next; }
        elsif ($eventType eq "ExternEntFin") { next; }
        elsif ($eventType eq "Entity")       { next; }
        elsif ($eventType eq "Element")      { next; }
        elsif ($eventType eq "Attlist")      { next; }
        elsif ($eventType eq "Notation") { next; }

        elsif ($eventType eq "ERROR")        { next; }
    } # while
    $theDOM->normalize();
    return(\$theDOM);
} # parsestringtoDOM


###############################################################################
# Manage the text to be parsed (replace with real entity manager).
#
# Append $s to the end of the text still waiting to be parsed.
sub YMLParser::addText {
    my $self = shift;
    my ($s)  = @_;
    $self->{theText} .= $s;
}
# Return the text still waiting to be parsed, without consuming it.
# Any extra argument is ignored.
sub YMLParser::getText {
    my $self = shift;
    return $self->{theText};
}
# Empty the buffer of text waiting to be parsed, returning what it held.
sub YMLParser::clearText {
    my $self = shift;
    my $discarded = $self->{theText};
    $self->{theText} = "";
    return $discarded;
}


###############################################################################
# Do a "pull" style parse. As we're parsing, we simply push the event(s) we
# find or infer (each is a reference to an array, which is the same as the
# array of arguments (not counting the parser itself) that XML::Parser would
# pass to a handler); when events are already queued, we just return them.
#
# Return how many events are queued in {pendingEvents}.
sub YMLParser::nPendingEvents {
    my $self  = shift;
    my $queue = $self->{pendingEvents};
    return scalar(@{$queue});
}

# Return the next event as one tab-separated string.
# The event is fetched with nextEventArray(); its FIRST element is
# discarded and the remaining elements are joined with tabs.
# NOTE(review): discarding the first element means a one-element event
# such as the initial ("Init") packs to the empty string, which the
# callers' while-loops treat as end of input -- confirm this is intended.
sub YMLParser::nextEvent {
    my $self   = shift;
    my $fields = $self->nextEventArray();
    shift(@{$fields});
    my $packed = join("\t", @{$fields});
    return $packed;
 }
# Do one step of a "pull" parse and return the next event.
#
# If events are already queued, the oldest is returned. Otherwise the
# start of the remaining text is examined, exactly one construct is
# consumed from it, the event(s) it gives rise to are queued with
# pushEvent(), and the first of them is returned via shiftEvent().
#
# Constructs are tried in a fixed order: end of input; content of an open
# CDATA section; end of the DOCTYPE; CDATA start / stray "]]>"; XML
# declaration; processing instruction; comment; DOCTYPE and the markup
# declarations; character and entity references; parameter entity
# references; end tag; start/empty tag; YML shorthand (only with the
# 'yml' option); unrecognized '<' or '&'; and finally plain text.
#
# Problems are reported as "ERROR" events rather than by dying.
#
sub YMLParser::nextEventArray {
    my ($self) = @_;

    ($self->{verbose}) && $self->yMsg(0, "Entering nextEvent.");

    # If there's a pending event(s) already, just return the next one.
    #
    if ($self->nPendingEvents()>0) {
        return($self->shiftEvent());
    }

    # Once there's nothing pending, do real parsing
    #
    #if (!$self->{theText}) { $self->reload(); } # ENTITY MGR
    my $raw = $self->{theText};
    if (!defined $raw) { $raw = ""; }

    # Compare against "" rather than testing truth, so that remaining
    # text of just "0" is still parsed.
    if ($raw eq "") {                                        # EOF
        my $d = $self->getDepth();
        if ($d > 0) {
            $self->pushEvent("End", $self->getCurrentTag());
        }
        else {
            $self->pushEvent("Final");
        }
    }

    # In constructs that can persist across events!
    # "]]>" outside CDATA MS is an error, but we ignore it.
    #
    elsif ($self->{inCDATA}) {                               # in CDATA MS
        if ($raw =~ s/^(.*?)]]>//s) {
            my $content = $1;
            if ($content ne "") { $self->pushEvent("Char", $content); }
            $self->pushEvent("CdataEnd");
            $self->{inCDATA} = 0;
        }
        else {
            # No terminator in the buffer yet: return what there is and
            # consume it, so the same text is not returned again.
            $self->pushEvent("Char", $raw);
            $raw = "";
        }
    }
    # Anchored: only a closer at the *current* position ends the DOCTYPE.
    # Unanchored, this would delete the first '>' anywhere in the text.
    elsif ($self->{inDoctype} && $raw =~ s/^\s*\]?\s*>//) {  # Doctype Fin
        $self->pushEvent("DoctypeFin");
        $self->{inDoctype} = 0;
    }

    # Potentially long constructs
    elsif ($raw =~ s/^<!\[CDATA\[//s) {                      # CDATA MS START
        $self->pushEvent("CdataStart");
        $self->{inCDATA} = 1;    # so the content is not parsed as markup
    }
    elsif ($raw =~ s/^]]>//s) {                              # CDATA MS END
        my $msg = "WF: Found ]]> not inside CDATA marked section\n";
        $self->pushEvent("ERROR", $msg);
        $self->vMsg(0,$msg);
    }

    elsif ($raw =~ s/^<\?(XML) (.*?)\?>//si) {                 # XML DECL
        my ($declTarget, $declBody) = ($1, $2);
        ($declTarget eq "xml") || $self->pushEvent(
            "ERROR", "WF: XML declaration target is not lower-case.");
        if ($self->{seenTag}) {
            my $msg = "WN: Late XML declaration found\n";
            $self->pushEvent("ERROR", $msg);
            $self->vMsg(0,$msg);
        }
        else {
            $self->pushEvent("XMLDecl", $declBody);
        }
    }
    elsif ($raw =~ s/^<\?(.*?)\?>//s) {                      # PROCESSING INSTR
        my $pi = $1;
        # Only trust $1 if the target actually matched; after a failed
        # match it would still hold the whole PI content.
        my $target = "";
        if ($pi =~ s/^($xname)\s*//) {
            $target = $1;
        }
        if ($target =~ m/^yml/) {                                # YML PI
        }
        if ($self->{entitiesInPIs}) {
            $pi = $self->expandEntities($pi);
        }
        $self->pushEvent("Proc", $target, $pi);
        if ($self->{attrsInPIs}) {
            my $aaRef = $self->parseAttributeStringToArray($pi);
            for (my $i=0; $aaRef && $i<scalar(@{$aaRef}); $i+=2) {
                $self->pushEvent("ProcAttr",$aaRef->[$i],$aaRef->[$i+1]);
            }
            $self->pushEvent("ProcAttrFin");
        }
    } # PI

    elsif ($raw =~ s/^<!--(.*?)-->//s) {                     # COMMENT
        my $text = $1;
        ($text =~ m/-$/) && $self->pushEvent(
            "ERROR", "WF: Comment cannot end with '--->'.");
        $self->pushEvent("Comment", $text);
    }
    elsif ($raw =~ s/^<!--//s) {                             # COMMENT
        $self->pushEvent("ERROR", "WF: Unterminated comment");
        $self->yMsg(0, "Unterminated comment");
    }

    elsif ($raw =~ s/^<!DOCTYPE\s+//) {                      # DOCTYPE DCL
        # A local group-free literal pattern keeps the capture numbers
        # simple: 1=name 2=keyword 3=first literal 4=second literal 5=closer.
        # The identifiers are optional, and the match is anchored so it
        # cannot pick up a name and literal from later in the text.
        my $lit = qr/'[^']*'|"[^"]*"/;
        my ($dtname, $pskey, $public, $system, $closer) = ("", "", "", "", "");
        if ($raw =~ s/^($xname)\s*(PUBLIC|SYSTEM)?\s*($lit)?\s*($lit)?\s*(\[\s*|>)?//) {
            $dtname  = $1;
            $pskey   = (defined $2) ? $2 : "";
            my $lit1 = (defined $3) ? $3 : "";
            my $lit2 = (defined $4) ? $4 : "";
            $closer  = (defined $5) ? $5 : "";
            if ($pskey eq "SYSTEM") {
                $system = $lit1;            # SYSTEM has only a system id
            }
            else {
                $public = $lit1; $system = $lit2;
            }
        }
        # The identifiers are passed on with their quotes still attached.
        $self->pushEvent("Doctype", $dtname, $public, $system);
        if ($closer eq ">") {
            $self->pushEvent("DoctypeFin");
            $self->{inDoctype} = 0;
        }
        else {
            $self->{inDoctype} = 1;
        }
    }
    elsif ($raw =~ s/^<!ELEMENT\s+($xname)\s+(.*?)\s*>//s) {      # ELEMENT DCL
        my ($elName, $elModel) = ($1, $2);
        $self->pushEvent("Element", $elName, $elModel);
        $self->dclElement($elName,$elModel);
    }
    elsif ($raw =~ s/^<!ENTITY\s+(%\s+)?($xname)\s+(.*?)\s*>//s) { # ENTITY DCL
        my $param = $1;
        my $name = $2;
        my $notation = "";
        my $rest = $3;
        my $value = my $sysid = my $pubid = "";
        my $valueType = "";
        if ($rest =~ s/^$qlit$//) {
            $valueType = "LITERAL";
            $value = $1;
            $value = substr($value, 1, length($value)-2);    # drop the quotes
        }
        elsif ($rest =~ s/^(SYSTEM|PUBLIC)\s*$qlit\s*$qlit?//) {
            $valueType = $1;
            my $id1 = $2;
            my $id2 = (defined $3) ? $3 : "";
            $id1 = substr($id1, 1, length($id1)-2);
            # Strip the quotes from the second literal itself (and only
            # when there is one).
            if ($id2 ne "") {
                $id2 = substr($id2, 1, length($id2)-2);
            }
            if ($valueType eq "PUBLIC") {
                $pubid = $id1; $sysid = $id2;
            }
            else {
                $sysid = $id1;
            }
            if ($rest =~ s/^\s*NOTATION\s+(\w+)//) {
                $notation = $1;
            }
        }
        $self->{entManager}->defineEntity(
            $name,$valueType,$value,$sysid,$pubid,$notation,$param?1:0);
        if ($valueType eq "LITERAL") {
            $self->pushEvent("Entity", $name, $value);
        }
        else {
            $self->pushEvent("Entity", $name, $sysid, $pubid);
        }
    } # ENTITY

    # Make the arg lists work just like SAX
    #
    elsif ($raw =~ s/^<!ATTLIST\s+($xname)\s+(.*?)\s*>//s) { # ATTLIST DCL
        # $2 is like ID ID #REQUIRED  CLASS NAMES #IMPLIED  FOO (x|y) "x"...
        $self->pushEvent("Attlist", $1, $2);
    } # ATTLIST
    elsif ($raw =~ s/^<!NOTATION\s+($xname)\s+(.*?)\s*>//s) {# NOTATION DCL
        my ($notName, $notBody) = ($1, $2);
        (defined $self->{notationDcls}->{$notName}) && $self->pushEvent(
            "ERROR", "VC: Duplicate declaration for NOTATION '$notName'.");
        $self->{notationDcls}->{$notName} = $notBody;
        $self->pushEvent("Notation", $notName, $notBody);
    } # NOTATION
    elsif ($raw =~ s/^<!(.*?)>//s) {                         # BAD DCL
        $self->pushEvent("ERROR", "VC: Bad markup declaration: $1");
    }

    elsif ($raw =~ s/^(&#x[\da-f]+;)//si ||                  # Entity/Char Ref
           $raw =~ s/^(&#\d+;)//s ||
           $raw =~ s/^(&$xname;)//si) {
        my $ref = $1;
        if ($self->{expandEntities}) {
            $self->pushEvent("Char", $self->expandEntity($ref));
        }
        else {
            $self->pushEvent("Unparsed", $ref);
        }
    } # "&"

    elsif ($raw =~ s/^%($xname;)//si) {                      # Parameter Entity
        my $peRef = $1;
        ($self->{inDoctype}) ||
            $self->pushEvent(
                "ERROR", "WF: Parameter Entity reference outside DTD");
        if ($self->{expandEntities}) {
            $self->pushEvent("Char", $self->openParameterEntity($peRef));
        }
        else {
            $self->pushEvent("Unparsed", $peRef);
        }
    } # "%"

    elsif ($raw =~ s/^<\/(.*?)\s*>//s) {                     # End tag
        my $gi = ($1) ? $1:"";
        #$self->vMsg(1,"Calling queueCloseEvent($gi).");
        $self->queueCloseEvent($gi);
    } # End tag
    elsif ($raw =~ s/^<($xname)(\s.*?)?(\/?>)//s) {          # Start/empty tag
        $self->{seenTag} = 1;
        my $gi = $1;
        my $attrString = $2 ? $2:"";
        my $close = $3;
        # If this element type may not nest inside some open element,
        # close elements down to the outermost such ancestor first.
        if (my $closeEm = $self->{cantGoInside}->{$gi}) {
            my $out = $self->findOutermost($closeEm);
            my $d = $self->getDepth();
            for (my $i=$d-1; $i>=$out; $i--) {
                $self->pushEvent("End", $self->getCurrentTag());
            }
        }
        my $attrArrayRef =
            $self->parseAttributeStringToArray($attrString);
        $self->queueOpenEvent($gi,@{$attrArrayRef || []});
        if ($close eq "/>") {                                # Empty element
            $self->pushEvent("End", $gi);
        }
    } # Start tag

    # No /o here: the delimiters can be changed with changeDelimiter().
    elsif ($self->{yml} &&                                   # YML
           $raw =~ m/^([$ymlChars]|<\/>)/) {
        $raw = $self->handleYml($raw);
        #$self->vMsg(0, "YML is not fully supported.");
    } # YML stuff

    # Check for [&<] that didn't start recognizable markup
    #
    elsif ($raw =~ s/^([<&])//) {                            # ??
        my $stray = $1;
        $self->vMsg(0,"WF: Bad parse at: '$stray$raw'");
        $self->pushEvent("ERROR", $stray);
    }

    # Otherwise we're in content, not markup.
    #
    else {                                                   # Text
        my $txt;
        if ($raw =~ s/^([^<&$ymlChars]+)//s) {
            $txt = $1;
        }
        elsif ($raw =~ s/^(.)//s) {
            # A YML delimiter while the 'yml' option is off: treat it as
            # one character of text (reported below if it is prohibited).
            $txt = $1;
        }
        else {
            die "What? '$raw'\n";
        }
        if ($self->{normalize}) {
            $txt =~ s/\s\s+/ /g;
            $txt =~ s/^\s+//;
            $txt =~ s/\s+$//;
        }
        if ($txt =~ m/$badControls/) {
            $self->vMsg(0, "WF: Prohibited characters found in: $txt");
            $self->pushEvent("ERROR", $txt);
        }
        $self->pushEvent("Char", $txt);
    } # Text

    # Whatever we parsed, we end up here....
    #
    $self->{theText} = $raw;
    return($self->shiftEvent());
} # nextEvent


###############################################################################
# Do additional parsing when YML-specific markup is detected.
#
# Handle one piece of YML shorthand at the start of $raw, queue the
# close/open events it stands for, and return $raw with the shorthand
# (and any attribute group that follows it) removed.
#
#   ymlUp or "</>"  - close the current element.
#   ymlReopen       - close the current element and open another of the
#                     same type, with only the attributes given in the
#                     optional ymlStartAttributes...ymlTerminateAttributes
#                     group.
#   ymlQuad         - close the current element and open one modelled on
#                     the element that getCousinTag()/getCousinAttrs()
#                     report for the next child position; attributes from
#                     the optional attribute group override the model's.
#                     If no model is found an ERROR event is queued and
#                     the current element is used as the model instead.
#
sub YMLParser::handleYml {
    my ($self, $raw) = @_;

    # \Q...\E: the delimiters can be changed by the caller, so they must
    # be matched literally even if they are regex metacharacters.
    if ($raw =~ s/^\Q$ymlUp\E// ||                      # Close current
        $raw =~ s/^<\/>//) {
        $self->queueCloseEvent();
    }

    elsif ($raw =~ s/^\Q$ymlReopen\E//) {               # Close and reopen
        my $baseTag = $self->getCurrentTag();
        $self->queueCloseEvent();
        my $ymlAttrsRef = [];
        if ($raw =~ s/^\Q$ymlStartAttributes\E(.*?)\Q$ymlTerminateAttributes\E//) {
            my $attrText = $1;
            ($self->{verbose}) && $self->yMsg(0, "    ymlAttrs '$attrText'\n");
            $ymlAttrsRef = $self->parseAttributeStringToArray($attrText) || [];
        }
        $self->queueOpenEvent($baseTag,@$ymlAttrsRef);
    }

    elsif ($raw =~ s/^\Q$ymlQuad\E//) {                 # ymlQuad
        my $ecn = $self->getElementChildNumber();
        my $baseTag     = $self->getCousinTag($ecn+1);
        my $modelAttrs  = $self->getCousinAttrs($ecn+1);
        if (!$baseTag) {
            # Fall back to the element on top of the stack, which holds
            # both the tag and the attribute list.
            my $top = $self->{theStack}->[-1];
            $baseTag    = ($top) ? $top->{tag}   : "";
            $modelAttrs = ($top) ? $top->{attrs} : undef;
            $self->pushEvent(
                "ERROR",
                "YMLWF: Cannot find based-on element for ymlQuad in FQGI " .
                $self->getFQGI() . " - treated as ymlReopen.");
        }
        elsif ($self->{verbose}) {
            my $bar = ($modelAttrs) ?
                join("|",@{$modelAttrs}) : "(no attributes on base)";
            $self->yMsg(0, "    Base ($baseTag): |$bar|.\n");
        }

        # Work on a copy, so merging in the new attributes below does not
        # alter the attribute list of the element used as the model.
        my $baseAttrsRef = [ @{$modelAttrs || []} ];

        ($self->{verbose}) && $self->yMsg(0, "    queueCloseEvent due to ymlQuad.\n");
        $self->queueCloseEvent();

        # Anchored, like the ymlReopen case: only an attribute group that
        # immediately follows the ymlQuad belongs to it.
        if ($raw =~ s/^\Q$ymlStartAttributes\E(.*?)\Q$ymlTerminateAttributes\E//) {
            my $ymlAttrsRef = $self->parseAttributeStringToArray($1);
            if ($ymlAttrsRef) {
                ($self->{verbose}) && $self->yMsg(0, "baseAttrsRef: " .
                    join("|",@{$baseAttrsRef}) . "\n" .
                    "ymlAttrsRef: " . join("|",@{$ymlAttrsRef}));
                $self->insertAttrs($baseAttrsRef, $ymlAttrsRef);
            }
        }
        $self->queueOpenEvent($baseTag,@{$baseAttrsRef});
    } # ymlQuad

    return($raw);
} # handleYml

# Overlay the name/value pairs in @$new onto those in @$orig, changing
# @$orig in place. Both are flat arrays: (name, value, name, value, ...).
# A name already present in @$orig keeps its position and takes the new
# value (only its first occurrence is touched); any other name is appended
# along with its value.
#
# Note: Duplicates within @$new are not detected; each is applied in turn.
#
# Returns $orig.
#
sub YMLParser::insertAttrs {
    my ($self, $orig, $new) = @_;
    NEWPAIR:
    for (my $n = 0; $n < scalar(@{$new}); $n += 2) {
        my ($name, $value) = ($new->[$n], $new->[$n+1]);
        for (my $o = 0; $o < scalar(@{$orig}); $o += 2) {
            next unless ($orig->[$o] eq $name);
            $orig->[$o+1] = $value;             # overwrite existing value
            next NEWPAIR;
        }
        push @{$orig}, $name, $value;           # not seen before: append
    }
    return($orig);
}


# Return the 1-based position of the current element among the element
# children of its parent. The parent's frame keeps the list of child tags
# seen so far (childTags), so the answer is simply that list's length.
# With fewer than two elements open, 1 is returned.
#
sub YMLParser::getElementChildNumber { # 1-based!
    my ($self) = @_;
    return(1) if ($self->getDepth() < 2);
    my $siblingTags = $self->{theStack}->[-2]->{childTags};
    return(scalar(@{$siblingTags}));
}

# Find the preceding element sibling of the parent of the current element,
# then find that element's $ecn'th child. $ecn will normally be the child
# number of the current element (see getElementChildNumber()), so this
# gets the 'corresponding' element, for example the corresponding table cell
# from the prior row (the penultimate child of the tbody).
#
# The preceding sibling is taken from the grandparent frame's {penultimate}
# entry, and the tag from that entry's childTags list.
#
# Returns undef if no such element can be found, which is a YML error.
# Each level is checked before it is dereferenced, so a failed lookup does
# not create an empty {penultimate} entry on the grandparent as a side effect.
#
sub YMLParser::getCousinTag {
    my ($self,$ecn) = @_;
    my $grandParent = $self->getGrandParent();
    ($grandParent) || return(undef);
    my $aunt = $grandParent->{penultimate};
    ($aunt && $aunt->{childTags}) || return(undef);
    # Subtract one because children are 1-based!
    return($aunt->{childTags}->[$ecn-1]);
}

# Companion to getCousinTag(): return the attribute-array reference stored
# for the $ecn'th (1-based) child of the grandparent frame's {penultimate}
# entry, or undef if any level of that path is missing.
#
# Each level is checked before it is dereferenced, so a failed lookup does
# not create an empty {penultimate} entry on the grandparent as a side effect.
# The reference returned is the stored one, not a copy.
#
sub YMLParser::getCousinAttrs {
    my ($self,$ecn) = @_;
    my $grandParent = $self->getGrandParent();
    ($grandParent) || return(undef);
    my $aunt = $grandParent->{penultimate};
    ($aunt && $aunt->{childAttrs}) || return(undef);
    # Subtract one because children are 1-based!
    return($aunt->{childAttrs}->[$ecn-1]);
}

# Return the stack frame of the parent of the current element, or undef
# when fewer than two elements are open. (The second argument is accepted
# but not used.)
#
sub YMLParser::getParent {
    my ($self,$ecn) = @_;
    if ($self->getDepth() >= 2) {
        return($self->{theStack}->[-2]);
    }
    return(undef);
}

# Return the stack frame of the grandparent of the current element (two
# levels above the innermost open element), or undef when fewer than three
# elements are open. (The second argument is accepted but not used.)
#
# No check is made here that the grandparent's {penultimate} child has the
# same element type as the current parent.
#
sub YMLParser::getGrandParent {
    my ($self,$ecn) = @_;
    ($self->getDepth()<3) && return(undef);
    return($self->{theStack}->[-3]);
}


###############################################################################
# The 'open' and 'close' here just push those events -- the stack is not
# actually modified until those events happen.
#
# Queue the event(s) for opening an element of type $gi.
#
# Arguments: the element type, then the attributes as a flat list
# (name, value, name, value, ...) -- NOT an array reference; dies if the
# first attribute argument is a reference.
#
# If the attrEvents option is set, queues a bare "Start", then one "Attr"
# event per attribute (name, value, namespace prefix, namespace URI), then
# "AttrFin". Otherwise queues a single "Start" carrying all the attributes.
#
# The namespace prefix is whatever precedes the first ":" in the attribute
# name, or "" if there is none. It is taken from the match result directly,
# so a name without a prefix can never pick up a stale $1 left behind by
# some earlier match.
#
sub YMLParser::queueOpenEvent {
    my $self = shift;
    my $gi = shift;
    if (ref($_[0])) { die "queueOpenEvent: bad arg 3!\n"; }
    my @attrArray = @_; # NOT a reference!
    if (scalar keys(%{$self->{attrDefaults}})) {
        $self->vMsg(0,"attr defaults are not yet implemented.\n");
    }
    if ($self->{attrEvents}) {
        $self->pushEvent("Start", $gi);
        for (my $i=0; $i<scalar(@attrArray); $i+=2) {
            my ($aname, $avalue) = @attrArray[$i, $i+1];
            my $nsPrefix = ($aname =~ m/^([^:]+):/) ? $1 : "";
            my $nsURI = $self->getNameSpaceUriFromPrefix($nsPrefix);
            $self->pushEvent("Attr", $aname, $avalue, $nsPrefix, $nsURI);
        }
        $self->pushEvent("AttrFin");
    }
    else {
        # With no attributes this is just ("Start", $gi).
        $self->pushEvent("Start", $gi, @attrArray);
    }
} # queueOpenEvent

# Queue the event(s) for an end-tag. $gi is the element type named by the
# end-tag; if it is omitted (or false), the current element is closed.
#
#   * Nothing open:            report it and queue an ERROR event.
#   * $gi omitted or current:  queue "End" for the current element.
#   * $gi not open at all:     report it and queue an ERROR event.
#   * $gi open further out:    queue "End" for each open element from the
#                              innermost out to (and including) $gi. Unless
#                              the yml option is set, each inferred End is
#                              preceded by an ERROR event.
#
sub YMLParser::queueCloseEvent {
    my ($self, $gi) = @_;
    die "queueCloseEvent: no self.\n" unless ($self);
    $gi ||= "";
    $self->vMsg(1,"In queueCloseEvent for '$gi', depth = " .
                 $self->getDepth());

    if ($self->getDepth()<=0) {
        $self->yMsg(0, "Error: End-tag for '$gi', with nothing open.\n");
        $self->pushEvent("ERROR", "WF: End-tag with nothing open.");
    }
    elsif (!$gi || $gi eq $self->getCurrentTag()) {
        $self->pushEvent("End", $self->getCurrentTag());
    }
    elsif (!$self->isOpen($gi)) {
        $self->yMsg(0, "Error: End-tag for '$gi', which is not open. " .
            "Open elements: " . $self->getFQGI() . ".");
        my $edcl = $self->{elementDcls}->{$gi};
        if ($edcl) {
            $self->yMsg(0, "    Last '$gi' was closed at " .
                $edcl->{lastClosed} . ".");
        }
        $self->pushEvent("ERROR", "WF: End-tag for non-open '$gi'");
    }
    else { # infer end-tags (NON-CONFORMING)
        my $level = $self->getDepth();
        while (--$level >= 0) {
            my $inferred = $self->{theStack}->[$level]->{tag};
            unless ($self->{yml}) {
                $self->pushEvent("ERROR", "WF: End-tag inferred for '$inferred'.");
            }
            $self->pushEvent("End", $inferred);
            if ($self->{verbose}) {
                $self->yMsg(0, "WF: End-tag inferred for '$inferred'.\n");
            }
            last if ($inferred eq $gi);
        }
    }
} # queueCloseEvent

# Append one event to the pendingEvents queue. The arguments after $self
# (event type, then its data) are copied into a new array, and a reference
# to that array is what gets queued.
#
sub YMLParser::pushEvent {
    my ($self, @eventFields) = @_;
    push @{$self->{pendingEvents}}, [ @eventFields ];
}

# Remove and return the oldest event on the pendingEvents queue, applying
# that event's side-effects as it goes:
#     End                   -> closeElement()
#     Start                 -> openElement(), with any attributes
#     CdataStart / CdataEnd -> set / clear the inCDATA flag
# Any other type listed in %eventNames needs no action; an unlisted type
# is reported through vMsg but still returned.
#
# Returns: undef if nothing is pending; otherwise a reference to an array:
#     [0]: $self (added here), so event callbacks have it like XML::Parser.
#     [1]: Event type (like SAX callback distinctions)
#     [2]: Main data (element type, etc.)
#     ...: Additional data (identifiers, attributes, etc.)
#
sub YMLParser::shiftEvent {
    my ($self) = @_;
    return(undef) if ($self->nPendingEvents() <= 0);

    my @event = @{ shift @{$self->{pendingEvents}} };
    my $type = $event[0];

    if ($type eq "End") {
        $self->closeElement($event[1]);
    }
    elsif ($type eq "Start") {
        # Everything after the element type is the attribute list.
        my @attrArray = @event[2 .. $#event];
        $self->openElement($event[1],@attrArray);
    }
    elsif ($type eq "CdataStart") {
        $self->{inCDATA} = 1;
    }
    elsif ($type eq "CdataEnd") {
        $self->{inCDATA} = 0;
    }
    elsif (!defined $eventNames{$type}) {
        $self->vMsg(0,"Funky pending event, type '$type'.\n");
    }
    # Otherwise: PI, comment, char, attr, etc. -- nothing special to do.

    unshift @event, $self;
    return(\@event);
} # shiftEvent


###############################################################################
# Manage attribute defaults
#
# Record $dvalue as the default value for attribute $aname on element type
# $gi. Defaults are stored in the attrDefaults hash under the key
# "<element>@<attribute>".
#
# The key is built by concatenation: inside a double-quoted string,
# "@$aname" would be interpolated as an array dereference, not as a
# literal "@" followed by the attribute name.
#
sub YMLParser::addAttributeDefault {
    my ($self, $gi, $aname, $dvalue) = @_;
    $self->{attrDefaults}->{$gi . '@' . $aname} = $dvalue;
}


###############################################################################
# Manage entities and character references
#
# Return a copy of $s in which every entity or character reference
# (&name;  &#ddd;  &#xhhh;) has been replaced by whatever expandEntity()
# returns for it. Replacement text is not rescanned.
#
# References are expanded wherever they occur in the string, not only at
# its very start, and expandEntity() is handed the complete reference
# ("&...;"), which is the form its own patterns expect.
#
sub YMLParser::expandEntities {
    my ($self, $s) = @_;
    (defined $s) || return($s);
    $s =~ s/&(#[0-9]+|#x[0-9a-fA-F]+|$xname);/$self->expandEntity("&$1;")/ges;
    return($s);
} # expandEntities

# Expand a single reference that begins $raw and return its replacement:
#     &#xhhh;  hexadecimal character reference -> that character
#     &#ddd;   decimal character reference     -> that character
#     &name;   named entity reference          -> lookupEntityName(name)
# Anything else is reported through vMsg and returned unchanged.
#
# A character reference to something that is not an XML Char queues an
# ERROR event, but the character is still returned. The check is made with
# $self->isXmlChar() on the character itself, so a reference such as
# "&#48;" (the digit "0", which is false in Perl) is accepted.
#
sub YMLParser::expandEntity {
    my ($self, $raw) = @_;
    my $buf = "";
    if ($raw =~ m/^&#x([0-9a-f]+);/si) {          # Hexadecimal Char Ref
        my $digits = $1;
        my $c = chr(hex($digits));
        ($self->isXmlChar($c)) || $self->pushEvent(
            "ERROR", "WF: Character reference to non-XML Char 0x$digits\n");
        $buf = $c;
    }
    elsif ($raw =~ m/^&#([0-9]+);/s) {            # Decimal Char Ref
        my $digits = $1;
        my $c = chr($digits);
        ($self->isXmlChar($c)) || $self->pushEvent(
            "ERROR", "WF: Character reference to non-XML Char 0d$digits\n");
        $buf = $c;
    }
    elsif ($raw =~ m/^&($xname);/) {              # Named Entity Ref
        $buf = $self->lookupEntityName($1);
    }
    else {
        $self->vMsg(0,"WF: Bad entity reference syntax: '$raw'.\n");
        $buf = $raw;
    }
    return($buf);
} # expandEntity

# Try to find a definition for a given entity name. Sources are tried in
# this order, and the first one that defines the name wins:
#   1. The five predefined XML entities, if the useXmlEntities option is on.
#   2. Text entities the caller defined (see addTextEntity()).
#   3. File (system) entities: not yet supported, so a message is issued
#      and the reference itself ("&name;") is returned.
#   4. The HTML set (via decode_entities), if useHtmlEntities is on.
#
# An entity counts as defined whenever its value is defined, so values such
# as "" and "0" are honored.
#
# If nothing defines the name, a message is issued via vMsg and an XML
# comment saying so is returned in place of the entity's value.
#
sub YMLParser::lookupEntityName {
    my ($self, $ename) = @_;
    # XML built-in / reserved
    if ($self->{useXmlEntities}) {
        if ($ename eq "lt")   { return("<"); }
        if ($ename eq "gt")   { return(">"); }
        if ($ename eq "amp")  { return("&"); }
        if ($ename eq "apos") { return("'"); }
        if ($ename eq "quot") { return("\""); }
    }
    # Text Entities
    if ($self->{textEntities}) {
        my $evalue = $self->{textEntities}->{$ename};
        if (defined $evalue) {
            return($evalue);
        }
    }
    # File Entities
    if ($self->{fileEntities}) {
        my $evalue = $self->{fileEntities}->{$ename};
        if (defined $evalue) {
            $self->yMsg(0, "File (system) entities are not yet supported.\n");
            return("&$ename;");
        }
    }
    # HTML set
    if ($self->{useHtmlEntities}) {
        my $eref = "&$ename;";
        my $evalue = decode_entities($eref);
        if ($evalue ne $eref) { return($evalue); }
    }
    $self->vMsg(0,"Unrecognized entity name '$ename'.\n");
    return("<!-- Unrecognized entity name '$ename' -->");
}

# Turn recognition of the five predefined XML entities on or off.
# The option is switched off only when $flag is defined and false; with no
# argument (or undef, or any true value) it is switched on.
#
# The setting lives in the useXmlEntities field of the parser object,
# which is what lookupEntityName() consults.
#
sub YMLParser::setXmlEntities {
    my ($self, $flag) = @_;
    $self->{useXmlEntities} = (defined $flag && !$flag) ? 0 : 1;
} # setXmlEntities

# Define (or redefine) the text entity $aName with replacement text $value.
# If a text or file entity of that name is already defined, a message is
# issued first; the new value replaces the old one either way. The ecount
# field is incremented on every call.
#
sub YMLParser::addTextEntity {
    my ($self, $aName, $value) = @_;
    my $alreadyKnown = (defined $self->{textEntities}->{$aName} ||
                        defined $self->{fileEntities}->{$aName});
    if ($alreadyKnown) {
        $self->yMsg(0, "Entity '$aName' redefined.\n");
    }
    $self->{textEntities}->{$aName} = $value;
    $self->{ecount}++;
}

# Turn recognition of the HTML named entities on or off.
# The option is switched off only when $flag is defined and false; with no
# argument (or undef, or any true value) it is switched on.
#
# The setting lives in the useHtmlEntities field of the parser object,
# which is what lookupEntityName() consults.
#
sub YMLParser::setHtmlEntities {
    my ($self,$flag) = @_;
    $self->{useHtmlEntities} = (defined $flag && !$flag) ? 0 : 1;
} # setHtmlEntities


###############################################################################
# The parser just scans for the closing "/?>" on start-tags. Anything ahead
# of that gets passed here to parse out the individual attributes.
# Allows several minimization options.
#
####### CHANGE this around -- build array, then make hash from that.
#
# Parse $attrString with parseAttributeString(), and return the result as
# a reference to a flat array (name, value, name, value, ...) with the
# attributes in sorted order of their names.
#
sub YMLParser::parseAttributeStringToArray {
    my ($self, $attrString) = @_;
    my $valueOf = $self->parseAttributeString($attrString);
    my @pairs = map { ($_, $valueOf->{$_}) } sort keys %{$valueOf};
    return(\@pairs);
}

# Parse the attribute part of a start-tag (or anything with the same
# syntax) and return a reference to a hash mapping attribute names to
# values.
#
# Forms accepted (the last three only if the shortAttrs option is on):
#     name="value" or name='value'   Normal; the value may span lines.
#     +name  /  -name                Value becomes "1" / "0".
#     name=value                     Unquoted value.
#     name                           Bare name; the value equals the name.
# Every form must begin exactly where the previous attribute ended (after
# optional whitespace). At the first thing that fits none of them, a
# message is issued, an ERROR event is queued, and parsing stops; the
# attributes found so far are still returned.
#
# Values have entity references expanded (via expandEntities), and then
# each space, TAB, CR, or LF is replaced by a space. A duplicate attribute
# name is reported, and the later value wins.
#
sub YMLParser::parseAttributeString {
    my ($self, $attrString) = @_;
    my $orig = $attrString;
    my %atHash = ();
    (defined $attrString) || return(\%atHash);

    while (length($attrString)) {
        $attrString =~ s/^\s+//;
        last if ($attrString eq "");        # only whitespace was left
        my $an = "";
        my $av = "";

        if ($attrString =~                                           # Normal
               s/^($xname)\s*=\s*("[^"]*"|'[^']*')\s*//) {
            $an = $1;
            $av = substr($2, 1, -1);        # drop the surrounding quotes
        }
        elsif ($self->{shortAttrs} &&                                # +/-
            $attrString =~ s/^([-+])($xname)\s*//) {
            $an = $2;
            $av = ($1 eq '+') ? "1" : "0";
        }
        elsif ($self->{shortAttrs} &&                                # Unquoted
               $attrString =~ s/^($xname)\s*=\s*($xname)\s*//) {
            $an = $1;
            $av = $2;
        }
        elsif ($self->{shortAttrs} &&                                # Bare
               $attrString =~ s/^($xname)\s*//) {
            $an = $av = $1;
        }
        else {                                                       # Failed
            my $msg = "WF: Bad attribute syntax in '$attrString'\n";
            $self->vMsg(0,$msg);
            $self->pushEvent("ERROR", $msg);
            last;
        }

        if (!$an) {
            $self->vMsg(0,"YMLWF: Missing attribute name in '$orig'\n");
        }
        else {
            # Normalize the value per XML 3.3.3
            (defined $atHash{$an}) &&
                $self->vMsg(0,"WF: Duplicate attribute '$an'\n");
            $atHash{$an} = $self->expandEntities($av); # Should be recursive
            $atHash{$an} =~ s/[ \r\n\t]/ /g;
            if (0) { # non-CDATA attrs, if we saw the declaration
                $atHash{$an} =~ s/  +/ /g;
                $atHash{$an} =~ s/^ //;
                $atHash{$an} =~ s/ $//;
            }
        }
    } # while
    return(\%atHash);
} # parseAttributeString


###############################################################################
#
# Declare element type $ename with content model $model: creates an
# ElementDcl object and stores it in elementDcls under the element name,
# replacing any earlier entry for that name.
#
sub YMLParser::dclElement {
    my ($self, $ename, $model) = @_;
    $self->{elementDcls}->{$ename} = ElementDcl->new($ename, $model);
}

# Declare attribute $aname for element type $ename. The declaration is
# kept under the element's entry in elementDcls, as a single string
# holding the attribute type and the default, separated by a TAB.
#
sub YMLParser::dclAttr {
    my ($self, $ename, $aname, $atype, $adefault) = @_;
    my $declaredAttrs = $self->{elementDcls}->{$ename}->{attrs} ||= {};
    $declaredAttrs->{$aname} = join("\t", $atype, $adefault);
}


###############################################################################
# Return true if the name passed is a legitimate XML NAME.
# (I'm not certain Perl \w is exactly the same as needed for XML NAMEs,
# but it's close).
#
# Return 1 if $theName, taken as a whole, matches the $xname pattern, and
# 0 otherwise (including when $theName is undefined).
#
# \A and \z anchor the match to the whole string -- unlike "$", \z does
# not tolerate a trailing newline -- and the pattern is grouped so the
# anchors apply to all of it.
#
sub YMLParser::isXmlName {
    my ($self, $theName) = @_;
    (defined $theName) || return(0);
    return(($theName =~ m/\A(?:$xname)\z/) ? 1:0);
}

# Return 1 if the first character of $c has a code point in one of the
# ranges listed below (TAB, LF, CR, U+0020-U+D7FF, U+E000-U+FFFD,
# U+10000-U+10FFFF), and 0 otherwise.
#
sub YMLParser::isXmlChar {
    my ($self, $c) = @_;
    my $codePoint = ord($c);
    my @allowedRanges = (
        [ 0x0009,  0x000A   ],
        [ 0x000D,  0x000D   ],
        [ 0x0020,  0xD7FF   ],
        [ 0xE000,  0xFFFD   ],
        [ 0x10000, 0x10FFFF ],
    );
    foreach my $range (@allowedRanges) {
        my ($low, $high) = @{$range};
        return(1) if ($codePoint >= $low && $codePoint <= $high);
    }
    return(0);
}

# Issue a diagnostic message, but only when the verbose option is on.
# The message is passed to yMsg() prefixed with "YMLParser: " and ending
# in exactly one newline. $level is accepted but does not currently
# affect whether the message is shown.
#
sub YMLParser::vMsg {
    my ($self, $level, $msg) = @_;
    $msg ||= "";
    chomp $msg;
    ($self->{verbose}) && $self->yMsg(0, "YMLParser: $msg\n");
}


###############################################################################
# Manage the open-element stack and various state information.
# For YML, we have to keep the previous sibling of each open element, too.
#
# Discard all open-element information by replacing theStack with a new,
# empty array.
#
sub YMLParser::resetStack {
    my ($self) = @_;
    my @noOpenElements = ();
    $self->{theStack} = \@noOpenElements;
}

# Open an element: build its ymlElementInstanceInfo frame and push it onto
# theStack. Arguments are the element type, then the attributes as a flat
# list (name, value, name, value, ...). A missing element type is reported
# and replaced by "_UNKNOWN".
#
# The new frame:
#   * takes the language of the enclosing element (or defaultLang at the
#     top level), unless it has its own xml:lang attribute;
#   * records a default namespace declared with "xmlns" (the form "xmlns:"
#     is also still accepted) in dftNameSpace;
#   * passes each "xmlns:prefix" declaration to setNameSpace(), giving it
#     the full attribute name and the value.
#     NOTE(review): getNameSpaceUriFromPrefix() compares stored names with
#     a bare prefix; confirm that setNameSpace() strips the "xmlns:" part.
#
# When the yml option is on, the element is also recorded as a child of
# the enclosing element (appendChild), for later use by ymlQuad.
#
# Returns the depth of the stack after the push.
#
sub YMLParser::openElement {
    my $self = shift;
    my $tag = shift;
    if (!$tag) {
        $self->yMsg(0, "openElement: nil tag arg\n");
        $tag = "_UNKNOWN";
    }
    my @attrArray = @_;

    my $curFrame = $self->{theStack}->[-1];
    my $newEI = ymlElementInstanceInfo->new($tag, \@attrArray);

    $newEI->setLang($curFrame ?
                    $curFrame->{lang} : $self->{defaultLang});

    for (my $i=0; $i<scalar(@attrArray); $i+=2) {
        my $aname = $attrArray[$i];
        my $avalue = $attrArray[$i+1];
        if ($aname eq "xml:lang") {
            $newEI->setLang($avalue);
        }
        elsif ($aname eq "xmlns" || $aname eq "xmlns:") {
            # Default namespace declaration.
            $newEI->{dftNameSpace} = $avalue;
        }
        elsif ($aname =~ m/^xmlns:/) {
            # should we inherit default?
            $newEI->setNameSpace($aname,$avalue);
        }
    }

    # Possible bug: We don't track multiple children of #ROOT
    if ($curFrame && $self->{yml}) {
        $curFrame->appendChild($tag,\@attrArray);
    }

    push @{$self->{theStack}}, $newEI;
    return($self->getDepth());
} # openElement

# Close the innermost open element and pop its frame off theStack.
# $tag, if given, is the element type the caller expects to be closing; a
# mismatch is reported, but the innermost element is closed regardless.
# With nothing open, a message is issued and nothing else happens.
#
# Before the frame is popped:
#   * the current entity location (from entManager) is saved as lastClosed
#     for this element type, for use in later error messages;
#   * if there is a parent, the frame is saved as the parent's
#     {penultimate}, where ymlQuad can find it while the next sibling
#     is open.
#
sub YMLParser::closeElement {
    my ($self, $tag) = @_;
    my $closing = $self->{theStack}->[-1];
    unless ($closing) {
        $self->yMsg(0, "closeElement called with nothing open.\n");
        return();
    }
    my $closingTag = $closing->{tag};
    if ($tag && $closingTag ne $tag) {
        $self->yMsg(0, "closeElement called for '$tag', expected '" .
            $closing->{tag} . "'.");
    }

    # Record where this element type last ended, for later error msgs.
    $self->{elementDcls}->{$closingTag}->{lastClosed} =
        $self->{entManager}->getWholeEntityLoc();

    # ymlQuad needs the element as long as its 1st following-sibling is open.
    unless ($self->getDepth() < 2) {
        $self->{theStack}->[-2]->{penultimate} = $closing;
    }
    pop @{$self->{theStack}};
} # closeElement


# Searching/counting along theStack
#
# Return the number of elements currently open (the length of theStack).
sub YMLParser::getDepth {
    my ($self) = @_;
    my $nOpenElements = @{$self->{theStack}};
    return($nOpenElements);
}
# Return the types of all open elements, outermost first, each preceded by
# "/" (for example "/html/body/p"); "" if nothing is open.
sub YMLParser::getFQGI {
    my ($self) = @_;
    my @openTags = ();
    foreach my $frame (@{$self->{theStack}}) {
        push @openTags, $frame->{tag};
    }
    return((@openTags) ? "/" . join("/", @openTags) : "");
}
# Return 1 if any open element is of type $tag, else 0.
sub YMLParser::isOpen {
    my ($self, $tag) = @_;
    my $found = 0;
    foreach my $frame (@{$self->{theStack}}) {
        next if ($frame->{tag} ne $tag);
        $found = 1;
        last;
    }
    return($found);
}
# Return how many open elements are of type $tag.
sub YMLParser::nOpen {
    my ($self, $tag) = @_;
    my $matches = 0;
    foreach my $frame (@{$self->{theStack}}) {
        $matches += ($frame->{tag} eq $tag) ? 1 : 0;
    }
    return($matches);
}
# Scan the open elements from the outermost inward, and return the stack
# index (0 = outermost) of the first one whose type occurs in $tags as a
# whole word (per \b). Returns undef if none does.
#
# The element type is quoted (\Q...\E) before being used in the pattern,
# so characters such as "." in an element name are matched literally.
#
sub YMLParser::findOutermost { # Largest of any of types named
    my ($self, $tags) = @_;
    my $d = $self->getDepth();
    for (my $i=0; $i<$d; $i++) {
        my $curTag = $self->{theStack}->[$i]->{tag};
        if ($tags =~ m/\b\Q$curTag\E\b/) { return($i); }
    }
    return(undef);
}

# Get information about current (innermost) element.
#
# Return the innermost open element's type, as reported by its getTag().
sub YMLParser::getCurrentTag {
    my ($self) = @_;
    return($self->{theStack}->[-1]->getTag());
}
# Return whatever the innermost open element's getAttr() gives for $aname.
sub YMLParser::getCurrentAttr {
    my ($self,$aname) = @_;
    return($self->{theStack}->[-1]->getAttr($aname));
}
# Return whatever the innermost open element's getAttrArray() gives.
sub YMLParser::getCurrentAttrArray {
    my ($self) = @_;
    return($self->{theStack}->[-1]->getAttrArray());
}
# Return the innermost open element's language, as reported by its getLang().
sub YMLParser::getCurrentLang {
    my ($self) = @_;
    return($self->{theStack}->[-1]->getLang());
}
# Return the newNameSpaces entry of the innermost open element's frame.
sub YMLParser::getCurrentNewNameSpaces {
    my ($self) = @_;
    my $innermost = $self->{theStack}->[-1];
    my $declaredHere = $innermost->{newNameSpaces};
    return($declaredHere);
}
# Return the childTags entry of the parent of the innermost open element,
# or undef if there is no parent frame.
sub YMLParser::getCurrentSiblingTagArray {
    my ($self) = @_;
    my $parentFrame = $self->{theStack}->[-2];
    return(undef) unless ($parentFrame);
    return($parentFrame->{childTags});
}
# Return the childAttrs entry of the parent of the innermost open element,
# or undef if there is no parent frame.
sub YMLParser::getCurrentSiblingAttrArray {
    my ($self) = @_;
    my $parentFrame = $self->{theStack}->[-2];
    return(undef) unless ($parentFrame);
    return($parentFrame->{childAttrs});
}

# Get the info object (stack frame) for an open element, from which you
# can then extract stuff. $n counts inward-out from the current element:
# 1 is the innermost open element, 2 its parent, and so on.
#
sub YMLParser::getElementInfo {
    my ($self,$n) = @_;
    my $frame = $self->{theStack}->[-$n];
    return($frame);
}


###############################################################################
# Namespace mapping. Note that only the *new* declarations are stored at
# each level (element), since ns prefixes can be overridden. So we search
# upward from the current element toward the root, and use the innermost
# declaration in effect.
#
# Return the default namespace in effect for the current element: the
# dftNameSpace of the innermost open element that has one, searching from
# the current element outward to the root. Returns "" if none is found.
sub YMLParser::getDefaultNameSpace {
    my ($self) = @_;
    for (my $i=$self->getDepth()-1; $i>=0; $i--) {
        my $dns = $self->{theStack}->[$i]->{dftNameSpace};
        if ($dns) { return($dns); }
    }
    return("");
}
# Return the namespace URI bound to $prefix for the current element.
# An empty (or false) prefix yields the default namespace. Otherwise the
# open elements are searched from the current one outward, and the first
# declaration of that prefix found in a frame's newNameSpaces list (a flat
# list: prefix, URI, prefix, URI, ...) supplies the URI.
# Returns undef if the prefix is not declared anywhere.
sub YMLParser::getNameSpaceUriFromPrefix {
    my ($self, $prefix) = @_;
    ($prefix) || return($self->getDefaultNameSpace());

    for (my $level = $self->getDepth()-1; $level >= 0; $level--) {
        my $frame = $self->{theStack}->[$level];
        for (my $k = 0; $k < scalar(@{$frame->{newNameSpaces}}); $k += 2) {
            next if ($frame->{newNameSpaces}->[$k] ne $prefix);
            return($frame->{newNameSpaces}->[$k+1]);
        }
    }
    return(undef);
}
# Return a prefix bound to namespace URI $uri for the current element.
# An empty (or false) URI yields "". Otherwise the open elements are
# searched from the current one outward, and the first entry with that URI
# found in a frame's newNameSpaces list (a flat list: prefix, URI, prefix,
# URI, ...) supplies the prefix.
# Returns undef if no declaration for the URI is found.
sub YMLParser::getNameSpacePrefixFromUri {
    my ($self, $uri) = @_;
    ($uri) || return("");

    for (my $level = $self->getDepth()-1; $level >= 0; $level--) {
        my $frame = $self->{theStack}->[$level];
        for (my $k = 0; $k < scalar(@{$frame->{newNameSpaces}}); $k += 2) {
            next if ($frame->{newNameSpaces}->[$k+1] ne $uri);
            return($frame->{newNameSpaces}->[$k]);
        }
    }
    return(undef);
}


###############################################################################
###############################################################################
# Move to SchemaManager.pm?
#
package ElementDcl;

# Create the declaration record for one element type.
#     $ename  Name of the element type.
#     $model  Its content model (stored as given).
# The record starts with no attributes declared and an empty lastClosed,
# the field in which YMLParser records where the most recent instance of
# the element ended.
#
sub ElementDcl::new {
    my ($class, $ename, $model) = @_;

    my $self = {
        ename      => $ename,   # Name of the element
        model      => $model,   # Content model
        attrs      => {},       # Attrs declared
        lastClosed => "",       # Where did last instance end?
    }; # self

    bless $self, $class;
    return $self;
} # new

# Not yet implemented; always dies.
sub ElementDcl::getContentModel {
    my ($self) = @_;
    die "No getContentModel.\n";
}

# Declare attribute $aname, with type $atype and default $adefault, as an
# AttributeDef object. Returns 1 on success. If the attribute is already
# declared, warns, leaves the existing declaration alone, and returns 0.
#
sub ElementDcl::addAttr {
    my ($self, $aname, $atype, $adefault) = @_;
    if (defined $self->{attrs}->{$aname}) {
        warn "Duplicate attribute definition";
        return(0);
    }
    $self->{attrs}->{$aname} = AttributeDef->new($aname, $atype, $adefault);
    return(1);
}

# Return 1 if attribute $aname has been declared for this element, else 0.
sub ElementDcl::isAttrDefined {
    my ($self, $aname) = @_;
    return((defined $self->{attrs}->{$aname}) ? 1 : 0);
}

# Not yet implemented; always dies.
sub ElementDcl::getAttrDefault {
    my ($self, $aname) = @_;
    die "No getAttrDefault.\n";
}

# Not yet implemented; always dies.
sub ElementDcl::isSequenceValid {
    my ($self, $ename, $childTypeArrayRef) = @_;
    die "isSequenceValid is not yet supported.\n";
}

###############################################################################

package AttributeDef;

# Create the definition record for one attribute, holding its name, its
# declared type, and its default. All three are stored exactly as given;
# no validation is done yet. Checks that could be added:
#     name:    xmlname($name)
#     type:    m/^(ID|IDREFS?|ENTITY|ENTITIES|CDATA|NAMES?|NMTOKENS?)$/
#     default: qlit
#
sub AttributeDef::new {
    my ($class, $name, $type, $default) = @_;
    my %fields = (
        name     => $name,
        type     => $type,
        default  => $default,
    );
    return(bless(\%fields, $class));
}


1;


###############################################################################
###############################################################################
###############################################################################
#

=pod

=head1 Usage

use YMLParser;

Parse a string as XML, but handle non-WF and YML extensions.

This is especially useful for dealing with natural language corpora, since
quite a number of them claim to be in XML, but aren't.

This is I<not> a conforming XML parser, though very similar.
It should produce the same results as a conforming SAX parser
for any well-formed XML document.
But it will also survive a variety of errors and correct some,
and it supports some optional extensions (more below),
some of which address XML's oft-criticised verbosity for tabular structures.

It can do either "push" or "pull" parsing. The event structure is almost
identical to the SAX interface used (for example) by XML::Parser
(see L<http://search.cpan.org/~msergeant/XML-Parser-2.36/Parser.pm>), though
you can also (optionally) get events for individual attributes, and
errors are returned as additional events.


=head2 Why/how this is not a real XML parser

For good reasons, the XML standard requires that a conforming XML parser
terminate if it finds a well-formedness error. If you have "sort of XML"
data (for example, if some end-tags are missing, or some attributes are
not quoted, etc), this means you can't use
a conforming XML parser to clean it up. You I<can> use a standalone
program like the excellent I<xmlTidy>; or you can use I<YMLParser.pm>.


=head1 Error-recovery special behaviors

=over

=item * Multiple root-level elements and text outside the root
element(s) are allowed.

=item * End-tags for non-open elements are ignored (a warning is issued).

=item * Element and attribute names that start with [.-\d] are allowed.

=item * Attribute lists that can't be figured out are at least survived.

=item * "&" or "<" characters that don't begin WF XML markup constructs
will be returned as text.

=item * Late, case-varying, repeated, and/or otherwise erroneous
XML declarations are allowed. However, the input I<must> be UTF-8;
encoding declarations are ignored; consider C<iconv> if needed.

=item * Unknown named entity references are allowed, and treated as text.

=item * There is an option to not expand any entity references at all.

=item * You can set up your own entity definitions and
attribute defaults, or HTML's set, regardless of DTD.

=item * Marked-section ends ("]]>") that are not in the scope of
a CDATA marked section, are reported and treated as text.

=item * Although it does not (yet) parse external DTDs, it does parse
DOCTYPE internal subsets (and is not picky about the '[' and ']').


=back


=head1 Handy special API features

=over

=item * Optionally, it can expand entities within the content of Processing
Instructions.

=item * Optionally, it can parse and return content of Processing
Instructions that (like XML declarations) follow the syntax of
XML attribute specification lists.

=item * Optionally, the caller can get attributes as separate "Attr"
events after a Start event, instead of packed into additional arguments
to the Start handler.

=item * The previous two features can be used together, so that the
pseudo-attributes within a PI can be returned as separate events.

=back


=head1 Short attribute special behaviors

These are enabled via the "shortAttrs" option.

=over

=item * Unquoted attributes are ok if they only contain XML NAME characters.
They need not be XML NAMES, and thus may begin with a digit.
For example, <z class=doc size=12> = <z class="doc" size="12">.

=item * SGML/HTML style bare-NAME-token attributes will be accepted,
and the value is made the same as the name.
For example, <table border> = <table border="border">.

=item * Boolean-valued attributes can be shortened to "+" or "-" and their
name. For example, <z +fine -secret> = <z fine="1" secret="0">.

=back


=head1 YML special behaviors

If you set the "yml" option, special syntax is supported for making
more compact files, especially in the case of repetitive or tabular data.
YML markup is only recognized in content, not within a tag, PI, comment, etc.

A YML parser handles any WF XML document in the normal way. However,
an XML parser will I<not> correctly handle YML documents
(well, if they use any of the YML-specific constructs).

By design, if you use an XML parser on a YML document by mistake, it will not
get it wrong; instead, it will reliably terminate with a Well-Formedness error.
This prevents the confusion that would result if a YML document
could be parsed successfully but incorrectly by an XML parser.

The special YML syntax features are:

=over

=item ymlOmitEnd (I<no syntax>)

Missing end-tags are provided when an outer end-tag or EOF is found,
in the same way as the SGML OMITTAG feature works for end-tags.
This is illegal in XML.

=item ymlEmptyEnd ("</>")

This is just the "empty end tag" syntax from SGML.
It closes the current (innermost) open element, regardless of type.
This and I<ymlOmitEnd> are the only YML
constructs that do not involve a non-XML control character.
However, this is still illegal in XML.

=item ymlUp   (0x15, d21, NAK, ^U)

I<ymlUp> has the same effect as "</>", but via a single character.
Mnemonic: ^U for "Up".

=item ymlReopen  (0x12, d18, DC2, ^R)

Closes the current element and then opens a new element of the same type.
The new element is said to be "based on" the one that was current just
before the I<ymlReopen> character was encountered.
The new element gets all the same attributes as the one it is based on,
except for any that are overridden using I<ymlStartAttributes>...I<ymlTerminateAttributes>.
Mnemonic: ^R for "Reopen".

=item ymlQuad   (0x11, d17, DC1, ^Q)

This is mainly intended to save a great deal of space versus XML in tables,
and in other structures where
many successive siblings (like tr) have the same sequence of types for their
child elements (like td).

Mnemonic: I<ymlQuad> is probably most important for tabular data: so ^Q for
"Quadrille", as in quadrille-ruled (graph) paper;
or "Quadding", a typography term related to horizontal alignment of text.

I<ymlQuad> enables table markup very much like MediaWiki:

    ||cell|cell|cell...

If all of the cells are marked up the same way (for example, if all are
I<td> elements with the same (or no) attributes), then you can get this effect
using I<ymlReopen> as already described. However, (in effect) copying the
start-tag of the current element isn't good enough in general. For example:

    <tr> <th>2011:</th>
         <td class="x1">$12</td>
         <td class="q">Green</td>
    </tr>
    <tr> <th>2012:</th>
         <td class="x1">$1200</td>
         <td class="q">Orange</td>
    </tr>

or

    <address> <fname>John</fname>
              <lname>Doe</lname>
              <phone>555-1234</phone>
              <state>FL</state>
    </address>
    <address> <fname>Jane</fname>
              <lname>Buck</lname>
              <phone>999-1235</phone>
              <state>MA</state>
    </address>

I<ymlQuad> handles such cases by basing the new element on the I<corresponding>
child element of the I<preceding container>, instead of on
the I<preceding> child element of the I<current container>.
The I<corresponding> child element is the one which has the same child element
number (position in order), as the new element about to be opened will have.

A preceding container must be the immediately-preceding sibling of the current container, and of the same element type. If there is no such element, then
I<ymlQuad> behaves exactly the same as I<ymlReopen>. Or, the I<first> I<tr>
or similar element can have its children marked up in full form
or with I<ymlReopen>.

For example, consider a context like the example above, except
that a I<ymlQuad> occurs immediately after the text C<999-1235>, instead of
C<< </phone>...<state> >>. At that point:

    the current element is the phone
    the current element is the third child element of its parent
    the current container is the second address
    the preceding container is the first address element

Text nodes don't count.

The I<ymlQuad> closes the current (I<phone>) element.
Then it opens a new element, which will of course be the I<fourth>
child of its parent. The new element is based on the corresponding (fourth)
child element of the I<preceding container>, and so will be a I<state>
element as desired (with any attribute that the earlier I<state>
element may have had -- in this case none).

This exact algorithm wouldn't let you use ymlQuad to start the I<first>
I<td> in a I<tr> (or similar), because the I<tr> would be current, rather
than a I<td> (and therefore would be closed, which is not the idea).

Therefore, I<ymlQuad> behaves specially when it is doubled at the start of
a line, in shameless imitation of the MediaWiki convention. In that case
the sequence of a line break and two I<ymlQuad>s signals that the
current element is functioning like a I<tr>. YML closes it and re-opens it
(essentially like I<ymlReopen>), and then additionally opens a first element
child of that new element, based on the first child of the element that it
just closed (which it assumed functions like a I<tr>).


=head2 ymlQuad algorithm

For example, in a table the first row could tag all its
children (say, as TD, TH, and/or other types). All following rows can
just use the I<ymlQuad> character to separate their fields, much like
"|" is used in many Wiki markup systems (they must still open the I<first>
field in each row explicitly).
This reduces the overhead for
tabular or other highly-repetitive structures to essentially the same as
in the ever-popular "CSV" file: 1 byte per field. It also makes
the data self-documenting (in the manner of CSV files with "header" records),
and allows more flexibility because any deviations from total
regularity can be handled, merely by using full XML-style tags instead of
I<ymlQuad> for the exceptions, and then going back to I<ymlQuad> again.

A YML parser keeps track of the sequence of element types (and attributes,
including namespace attributes)
for the already-seen children of each open element, I<and> for each open
element's preceding-sibling element (if any).

Taking HTML table markup as a simple example, the I<ymlQuad> character
may be used within the children (cells) of any but the first TR.
If it is, then the parser:

=over

=item 1: figures out the current element's I<child number>,
that is, its number of preceding-sibling elements plus 1
(PIs, comments, and text nodes do not count). Call that number I<N>.

=item 2: finds the parent's nearest preceding-sibling element.
Call that element PRE.
If there is no such element, a YML Well-formedness error is signaled.

=item 3: looks up the element type and attributes of the I<N+1>th child of PRE.
If there is no such element, a YML Well-formedness error is signaled.
If there is, that element is the one that the new element will be based on.

=item 4: checks for I<ymlStartAttributes> immediately following the I<ymlQuad>, and
if found, parses the subsequent attribute list as described below.

=item 5: closes the current element (perhaps it is a TD).

=item 6: opens the new element.

=back

B<Note>: I<ymlQuad> can't start the I<first> cell in a row, because it
is defined to close the current element and then open something.
Suggestions for an improved semantic are nevertheless welcome.

Likewise, a single ^U at the end of a row will close the cell, not the
row. To close both, use (for example)
^U^U or
^U</tr> or
</tr> or
^U^R or
similar. Just be sure to close them both.

I<ymlQuad> also doesn't do anything special in case of things like row or column
spans, so a row whose cells don't correspond neatly to the prior row,
should typically be tagged in full (this doesn't seem that bad to me, because
it is a case specific to tabular I<markup>, that doesn't arise in tabular
or relational I<data>).


=item ymlStartAttributes    (0x13, d19, DC3, ^S)

This may occur only as the very next character after
I<ymlReopen> or I<ymlQuad> (not even whitespace between).
It indicates the start of an attribute list for the element
which the I<ymlReopen> or I<ymlQuad> will open.
The attribute list must end with the I<ymlTerminateAttributes> character.
Attributes specified in this attribute list override any of the same name
on the element the new element is I<based on>.
Mnemonic: ^S for "Start Attributes".

=item ymlTerminateAttributes (0x14, d20, DC4, ^T)

This may only occur as described under I<ymlStartAttributes>, to mark the end of
an attribute list for an element to be opened due to
I<ymlReopen> or I<ymlQuad>.
Mnemonic: ^T for "Terminate Attributes" (or just "the character after ^S").

=back


=head1 NOTES

=head2 Example document

    (control characters are shown here as ^Q...^U)

    <p>A paragraph^U
    <p>Another paragraph^U

    <p class="p1">Para 1, reopen:
    ^RPara 2, reopen:
    ^RPara 3, up:^U

    <table>
    <tbody>
    <tr>
        <td id="base1">cell1</td>
        <td>cell2</td>
        <td>cell3</td>
    </tr>
    <tr><td>cell10^Qcell20^Qcell30^U
    <tr><td>cell11^Qcell21^Sid="12" class="middle"^T^Qcell31^U
    ^R<td>cell12^Qcell22^Qcell32^U^U
    </table>


=head2 Example code (pull)

  use YMLParser;
  $fp = new YMLParser();
  $fp->addText($myText);
  while (my $e = $fp->nextEvent()) {
      $e =~ s/^(.*?)\t//;
      my $type = ($1) ? $1:"????";
      if    ($type eq "Start") { ... }
      elsif ($type eq "End")   { ... }
      elsif ($type eq "Char")  { ... }
      else { ... }
  }

=head2 Example code (push)

  use YMLParser;
  $fp = new YMLParser();
  $fp->setHandlers( {
        "Start"     => \&myStartHandler,
        "End"       => \&myEndHandler,
        "Char"      => \&myCharHandler,
        "ERROR"     => \&myErrorHandler,
  } );
  $fp->parsestring($myText);


=head1 Methods

=over

=item * B<new>()

=item * B<identifier>()

Identifies this particular implementation, as "YMLparser.pl".

=item * B<reset>()

Clear any document-specific state (keep declared entities, etc).

=item * B<changeDelimiter>(I<name, value>)

Change the named YML delimiter to a new character.
If I<value> is undefined, reset it to its default.
Accepted names are: I<ymlQuad>, I<ymlReopen>, I<ymlStartAttributes>,
I<ymlTerminateAttributes>, I<ymlUp>.
The main use for this is probably to change I<ymlQuad> to C<|>
like MediaWiki table markup.

=item * B<setOption>(I<name, value>)

Options available include:

=over

I<verbose> -- (integer) issue various messages to STDERR.

I<attrEvents> -- (Boolean) after a I<Start> event,
return a separate I<Attr> event for each attribute (if any),
and then an I<AttrFin> event.

I<attrsInPIs> -- (Boolean) treat the contents of processing instructions
as a list of (pseudo-) attributes, and return them in the same manner
as real attributes are returned with I<attrEvents>, as separate events
after the PI.

I<expandEntities> -- (Boolean) if turned off, entities will be returned as
I<unparsed> events with a single argument, which is the original form
of the entity (or numeric character) reference.

I<entitiesInPIs> -- (Boolean) if set, the content of PIs (but not the
I<target>) will be parsed in search of entity and character references,
which will be expanded.

I<shortAttrs> -- enables the short attribute features described above.

I<yml> -- enables the YML features described above.

=back


=item * B<appendEntityPath>(path)

Add I<path> to the end of the list of directories, in which to search
for external entities. First added, is first searched.

=item * B<setHandlers>(hash)

Like the corresponding method in XML::Parser. I<hash> maps SAX event names
to the Perl procedures to be called when each one happens (see above for
a list and an example). There are a few extra events, most notably ERROR,
and (optional) separate events for attributes.

=item * B<setNoNest>(I<parent, child>)

Assert that elements of type I<child> may never occur within elements
of type I<parent> (directly or indirectly). Thus, if the parse sees a I<child>
element, it will first force all open I<parent> elements closed.
Repeatable (but not yet working).

=item * B<addText>(I<text>)

Append the text to the buffer to be parsed.

=item * $s = B<getText>()

Return the current contents of the buffer to be parsed (that is, any pending
text that has not yet been parsed). This does not clear the buffer; to do that
use I<clearText>().

=item * B<clearText>()

Remove any remaining text in the parse buffer (the text is returned).


=item * B<nextEvent>()

Parse off the next XML construct from the parse buffer, removing it.
The event will be returned as a reference to an array, whose elements
are essentially the same as the arguments passed to a callback when using
Perl C<XML::Parser>, except that one extra argument is inserted, which is
the event type name (the same as you would pass to XML::Parser to register
a handler); this argument is [1], after [0] which is a reference to the
YMLParser object itself. The rest of the arguments depend on the event
type, and are the same as with C<XML::Parser>.

=item * B<parsestring>(I<s>)

Parse the string I<s> and call event handlers (see I<setHandlers>).

=item * B<parsefile>(I<path>)

Parse the file at I<path> and call event handlers (see I<setHandlers>).
If the path ends with C<.zip>, I<parsefile>() will try to use the first
member of the zip file transparently.
(experimental)

=item * B<parsestringToDom>(I<s>)

Parse the string I<s> and return a reference to an XML::DOM structure.
(experimental)

=item * B<parsefileToDom>(I<f>)

Parse the file I<f> and return a reference to an XML::DOM structure.
(experimental)

=item * B<addAttributeDefault>(I<elementName, attributename, defaultValue>)

(experimental)
Define a default value (similar to what you can do with an ATTLIST declaration
in an SGML or XML DTD), for a particular attribute. With SGML (but not XML,
if I remember right), a default value can be associated with a named
attribute for a separate element type, or shared across the same-named
attributes on multiple element types, such as I<foo> and I<bar> here:

  <!ATTLIST (foo, bar)  class  NAME "secret">

B<However>, with this parser you cannot share a default like that.

=item * B<addTextEntity>(I<name, value>)

Define the entity I<name> to the given (string) value. If I<setHtmlEntities>()
is in effect, defining an entity via I<defineEntity>() overrides any HTML entity
of the same name.
Entities that refer to files, URIs, etc. are not yet supported.

=item * B<setHtmlEntities>(I<flag>)

Determine whether to recognize the standard HTML named character entities.
They are off by default; this call turns them on unless I<flag> is present
and has the value 0 (if I<flag> is entirely absent, they are turned I<on>!)
Individual entities can be overridden via I<defineEntity>().

=item * B<setXmlEntities>(I<flag>)

Determine whether to recognize the five XML built-in entities.
They are on by default; this call turns them on unless I<flag> is present
and has the value 0 (if I<flag> is entirely absent, they are turned I<on>!)
These entities cannot be overridden when on.

=item * B<openElement>(I<self, gi, attrname1, attrvalue1,...>)

This is the internal method called when a start (or empty) tag is parsed.
It stacks the element, xml:lang, and some other information, then issues
the start-element event (and if the I<attrEvents> option is set,
possibly also some number of following attribute events).

=item * B<closeElement>(I<self, gi>)

This is the internal method called when an end tag is parsed.
It pops the information that I<openElement> pushed.

=item * B<getDepth>()

Returns the number of open elements.

=item * B<isOpen>(type)

Returns true if at least one element of the specified I<type> is open.

=item * B<findOutermost>(typelist)

Returns the index into the stack of open elements (0 is the document element),
of the outermost (largest) instance of any of the types listed
in the string I<typelist> (types must be separated by whitespace).
This is used internally to implement the I<setNoNest> feature, since everything
out to this index has to be closed on finding a conflict.

=item * B<getFQGI>()

Returns the list of all open element types, separated by "/".

=item * B<getCurrentGI>()

Returns the innermost/current open element type.

=item * B<getCurrentLang>()

Returns the innermost/current/inherited value of I<xml:lang>.

=item * B<getCurrentNsList>()

Returns a reference to a hash that maps all currently-defined namespace
prefixes to URIs (this includes all namespaces defined on the current element
or any of its ancestors).

=item * B<getDefaultNameSpace>()

Return the current default NameSpace URI. If none, return "".

=item * B<getNameSpacePrefixFromUri>(I<uri>)

Return the XML namespace prefix currently in effect,
that maps to the given I<uri>. If there are several, it is undefined which
one will be returned; if none, return "".

=item * B<getNameSpaceUriFromPrefix>(I<prefix>)

Return a string containing the URI currently assigned to the
given XML namespace I<prefix>; if none, return "".

=back


=head1 SAX event types

These are the same as for Perl's C<XML::Parser> package (see CPAN),
plus I<ERROR> and the optional I<Attr> and I<AttrFin>.
The names shown below are the keys to use when passing a hash to
I<setHandlers>(). Each key's data should be a reference to the Perl procedure
to be called when the event occurs; it will be passed the parameters shown.

=over

=item B<Init>(I<Expat>)

The very start of parsing.

=item B<Final>(I<Expat>)

The very end of parsing.

=item B<Start>(I<Expat, Element [, Attr, Val [,...]]>)

The start of an element. However, if the I<attrEvents> option is set,
then I<Attr, Val...> arguments will not be provided.
Instead, the attributes will be provided via separate I<Attr> events,
and after all the I<Attr> events (whether or not there are any),
an I<AttrFin> event is provided.

=item B<Attr>(I<Expat, Name, Val, nsPrefix, nsURI>)

If you set the I<attrEvents> option, then instead of attributes being
packed into the I<Start> event, they will follow it, one event per
attribute, in alphabetical order.
An I<AttrFin> event will then follow to mark the end of attributes
(that event always happens, even if there are no attributes on a
particular start-tag).
See also the I<attrsInPIs> option,
and the I<ProcAttr> and I<ProcAttrFin> events.

=item B<AttrFin>(I<Expat>)

Marks the end of the attributes for a given start-tag,
if the I<attrEvents> option is set.
See I<ProcAttrFin> for the analogous event when I<attrsInPIs> is set.

=item B<End>(I<Expat, Element>)

The end of an element.

=item B<Char>(I<Expat, String>)

Text content. B<Note>: There is no guarantee that a I<Char> event will not
be followed immediately by another I<Char> event.

=item B<Comment>(I<Expat, Data>)

=item B<ERROR>(I<Expat, Message>)

This event occurs if a significant error is found in parsing. The intent
is that all XML and YML Well-Formedness (WF) errors will be reported,
but that is not yet complete. It is generally returned I<before>
any event that occurs despite the error. For example, if an end-tag is
found for an element that is not the innermost open element, an ERROR
event is returned, followed by as many End events as it takes to close
all the elements out to the one named in the end-tag that was actually found.
I<Message>: Printable text for the message.

=item B<CdataStart>(I<Expat>)

Indicates the start of a CDATA marked section. Content will be returned
via following I<Char> events.

=item B<CdataEnd>(I<Expat>)

Indicates the end of a CDATA marked section.

=item B<Proc>(I<Expat, Target, Data>)

A processing instruction.
However, if the I<attrEvents> and I<attrsInPIs> options are set,
then I<Data> will not be returned. Instead, the I<Data> content will be
parsed as if it were a list of attributes, and those attributes will be
provided via I<ProcAttr> events (and finally a I<ProcAttrFin> event).

=item B<ProcAttr>(I<Expat, Name, Val>)

If you set the I<attrEvents> and I<attrsInPIs> options,
then a I<ProcAttr> event will be returned for each pseudo-attribute
found within the content of a processing instruction.
A I<ProcAttrFin> event will then follow.

=item B<ProcAttrFin>(I<Expat>)

Marks the end of the pseudo-attributes for a given PI,
when the I<attrsInPIs> option is set.

=item B<XMLDecl>(I<Expat, Version, Encoding, Standalone>)

The XML declaration.

=item B<Doctype>(I<Expat, Name, Sysid, Pubid, Internal>)

The DOCTYPE declaration in the DTD.

=item B<DoctypeFin>(I<Parser>)

The end of the DOCTYPE declaration in the DTD.

=back


=head2 The following events are not yet fully supported

=over

=item B<Default>(I<Expat, String>)

An event for which no handler has been installed.

=item B<Unparsed>(I<Expat, Entity, Base, Sysid, Pubid, Notation>)

A reference to an unparsed external entity (such as an image).

=item B<ExternEnt>(I<Expat, Base, Sysid, Pubid>)

An (unexpanded?) external entity reference.

=item B<ExternEntFin>(I<Expat>)

=item (all events representing markup declarations)

  B<Element>(I<Expat, Name, Model>)
  B<Attlist>(I<Expat, Elname, Attname, Type, Default, Fixed>)
  B<Entity>(I<Expat, Name, Val, Sysid, Pubid, Ndata, IsParam>)
  B<Notation>(I<...>)

=back


=head1 Known bugs and limitations

This is intentionally B<not> a conforming XML parser. It is intended for
dealing with broken data and helping you fix it, somewhat like the
extraordinarily useful C<tidy>; and for encoding highly-repetitive structures
more compactly while remaining true to the ideas of XML.

It may be better for I<ymlQuad> to require that the parent and aunt be of the
same type (for example, successive table rows), thus allowing even the first
cell in each row to be started with I<ymlQuad>. ???

The semantics of default attributes may not be completely correct.
When using ymlQuad and ymlReopen,
attributes are copied only from the I<based-on> element (including any
attributes which that element defaulted from the DTD).
However, there is no way to "reset to default"; should there be?

Parsing DTDs and external entities is unfinished
(therefore, so is attribute-defaulting).

Does not die on all WF errors, even some where perhaps it should.

Gets unhappy with very long comments, PIs, etc. This is an outright bug
(not hard to fix, I just haven't gotten around to it yet).

Does not coalesce entities that produce text, or CDATA marked sections,
into the surrounding text. So you can get extra text nodes.

I<-canonical> is imperfect (see above).


=head1 Related commands

C<tidy> and C<xmltidy> do similar repairs, but don't (so far as I know)
expose a regular SAX interface, and don't have the YML constructs.

C<iconv> converts between countless character sets, so is excellent for
getting input ready for I<YMLParser.pm>.

C<xmlOutput.pm> provides an API similar to several functions implemented here;
the ones here are specialized because to support YML (and specifically ymlQuad)
we must keep track of extra state information, such as
the list of previous sibling types and attributes for each still-open element.


=head1 Ownership

This work by Steven J. DeRose is licensed under a Creative Commons
Attribution-Share Alike 3.0 Unported License, with one additional restriction:

B<This software may not be integrated into an end-user Web browser or
HTML display agent. This is in order to comply with the XML WG's
intent that XML browsers not accept non-well-formed XML (since, among
other things, that tends to breed more ill-formed XML).> If a user agent
is to incorporate this software, it must require the user to opt-in,
not opt-out; that is, the user agent software must not use this software
to "correct" non-WF XML, or to parse YML, unless the user has explicitly
consented first.

For further information on
the CCLI license, see L<http://creativecommons.org/licenses/by-sa/3.0/>.

The author's present email is sderose at acm.org.

For the most recent version, see L<http://www.derose.net/steve/utilities/>.

=cut