#!/usr/bin/perl -w # # domExtensions -- useful methods on top of XML::DOM. # # Written 2010-04-01~23 by Steven J. DeRose, sderose@acm.org. # 2010-10-08 sjd: Convert to a real package, normalize naming. # # Integrate into: # xxx2html # use strict; use XML::Parser; use XML::DOM; #use XML::XPath; package DOMextensions; my $version = "2010-10-08"; sub new { my ($class) = @_; my $self = { source => "", theDOM => undef, }; bless $self, $class; return($self); } sub getVersion { return($version); } ############################################################################### # Methods to walk the XPath axes, selecting by gi, attr, value, and n. # # XPath axis DOM properties and methods() # ancestor # attribute attributes, getAttribute(), getAttributeNode() # child childNodes, firstChild, lastChild, # descendant # following # following-sibling nextSibling # namespace # parent parentNode # preceding # preceding-sibling previousSibling # self # # Additional DOM Node methods: # ownerDocument, textContent, tagName # nodeName, nodeType, nodeValue, localName, prefix, namespaceURI # appendChild(), insertBefore(), removeChild(), replaceChild() # removeAttribute(), removeAttributeNode(), # setAttribute(), setAttributeNode() # hasAttribute(), hasAttributes(), hasChldNodes() # compareDocumentPosition(), isEqualNode, isSameNode() # getElementsByTagName() # normalize() # sub selectAncestor { my ($self,$n,$type,$aname,$avalue) = @_; if (!defined $type) { $type = ""; } if (!defined $n) { $n = 1; } while($self=getParentNode($self)) { if (nodeMatches($self,$type,$aname,$avalue)) { $n--; if ($n<=0) { return($self); } } } return(undef); } sub selectChild { my ($self,$n,$type,$aname,$avalue) = @_; if (!defined $type) { $type = ""; } if (!defined $n) { $n = 1; } $self = $self->getFirstChild(); while(defined $self) { if (nodeMatches($self,$type,$aname,$avalue)) { $n--; if ($n<=0) { return($self); } } $self = $self->getNextSibling(); } return(undef); } sub selectDescendant { my ($self,$n,$type,$aname,$avalue) = @_; setN((defined $n) ? $n:1); return(selectDescendantR($self,$n,$type,$aname,$avalue)); } sub selectDescendantR { # XXX FIX ??? my ($self,$n,$type,$aname,$avalue) = @_; if (nodeMatches($self,$type,$aname,$avalue)) { $n--; if ($n<=0) { return($self); } } for my $ch ($self->getChildNodes()) { my $node = getDescendantByAttributeR($ch,$n,$type,$aname,$avalue); if (defined $node) { $n--; if ($n<=0) { return($node); } } } return(undef); } sub selectPreceding { my ($self,$n,$type,$aname,$avalue) = @_; if (!defined $n) { $n = 1; } if (!defined $type) { $type = ""; } while($self=$self->getPreceding()) { if (nodeMatches($self,$type,$aname,$avalue)) { $n--; if ($n<=0) { return($self); } } } return(undef); } sub getPreceding { my ($self) = @_; my $node = $self->getPreviousSibling(); if (defined $node) { while(my $next = $node->getFirstChild()) { $node = $next; } return($node); } return($self->getParentNode()); } sub selectFollowing { my ($self,$n,$type,$aname,$avalue) = @_; if (!defined $n) { $n = 1; } if (!defined $type) { $type = ""; } while($self=getFollowing($self)) { if (nodeMatches($self,$type,$aname,$avalue)) { $n--; if ($n<=0) { return($self); } } } return(undef); } sub getFollowing { my $self = $_[0]; my $node = $self->getFirstChild(); if (defined $node) { return($node); } $node = $self->getNextSibling(); if (defined $node) { return($node); } while ($self = $self->getParentNode()) { if ($self->getNextSibling()) { return($self->getNextSibling()); } } return(undef); } sub selectPrecedingSibling { my ($self,$n,$type,$aname,$avalue) = @_; if (!defined $type) { $type = ""; } if (!defined $n) { $n = 1; } while($self=getPreviousSibling($self)) { if (nodeMatches($self,$type,$aname,$avalue)) { $n--; if ($n<=0) { return($self); } } } return(undef); } sub selectFollowingSibling { my ($self,$n,$type,$aname,$avalue) = @_; if (!defined $type) { $type = ""; } if (!defined $n) { $n = 1; } while($self=getNextSibling($self)) { if (nodeMatches($self,$type,$aname,$avalue)) { $n--; if ($n<=0) { return($self); } } } return(undef); } ############################################################################### # sub getLeftBranch { my ($self) = @_; (defined $self) || return(undef); while(my $fc = $self->getFirstChild()) { $self = $fc; } return($self); } sub getRightBranch { my ($self) = @_; (defined $self) || return(undef); while(my $fc = $self->getLastChild()) { $self = $fc; } return($self); } sub getDepth { my ($self) = @_; my $d = 0; while(defined $self) { $d++; $self = $self->getParentNode(); } return($d); } sub getFQGI { my ($self) = @_; my $f = ""; while(defined $self) { $f = "/" . $self->getNodeName() . $f; $self = $self->getParentNode(); } return($f); } sub getXPointer { my ($self) = @_; my $f = ""; while(defined $self) { $f = "/" . $self->getParentNode()->getChildIndex($self) . $f; $self = $self->getParentNode(); } return($f); } sub xPointerCompare { my ($self,$xp1,$xp2) = @_; my @t1 = split(/\//,$xp1); my @t2 = split(/\//,$xp2); for (my $i=0; $i $t2[$i]) { return( 1); } } # At least one of them ran out... return (scalar @t1 <=> scalar @t2); } sub nodeCompare { my ($self,$other) = @_; return($self->xPointerCompare( $self->getXPointer(),$other->getXPointer())); } sub XPointerInterpret { my ($self, $xp) = @_; my $document = $self->ownerDocument(); my $node = $document->getDocumentElement(); my @t = split(/\//,$xp); for (my $i=0; $igetChildAtIndex($t[$i]); (defined $node) || return(undef); } return($node); } # Check if a node matches the supported selection constraints: # If type is non-nil, "*" matches any *element*, else name must be right. # If aname is non-nil, attribute must exist # If avalue is non-nill, attribute aname must = avalue. # sub nodeMatches { my ($self,$type,$aname,$avalue) = @_; if (defined $type && $type ne "") { # check type constraint if ($type eq "*") { if ($self->getNodeType!=1) { return(0); } } else { if ($self->getNodeName() ne $type) { return(0); } } } if (!defined $aname) { # No attribute constraint return(1); } my $thisvalue = $self->getAttribute($aname); if (!defined $thisvalue) { # attr specified, absent return(0); } if (!defined $avalue || $thisvalue eq $avalue) { # attr matches return(1); } return(0); } # Search upward to find an assignment to an attribute -- like xml:lang. # sub getInheritedAttribute { my ($self,$aname) = @_; do { my $avalue = $self->getAttribute($aname); if (defined $avalue) { return($avalue); } $self = $self->getParentNode(); } while($self); return(undef); } # Assemble the entire attribute list, escaped as needed to write out as XML. # sub getEscapedAttributeList { my ($self) = @_; my $buf = ""; (defined $self) || return(undef); my $alist = $self->getAttributes(); (defined $alist) || return($buf); for (my $i=0; $i<$alist->getLength(); $i++) { my $anode = $alist->item($i); my $aname = $anode->getName(); my $avalue = escapeAttribute($anode->getValue()); $buf .= " $aname=\"$avalue\""; } return($buf); } sub deleteWhiteSpaceNodes { my ($self) = @_; # print "running deleteWhiteSpaceNodesCB\n"; forEachNode($self,\&deleteWhiteSpaceNodesCB,undef); } sub deleteWhiteSpaceNodesCB { my ($self) = @_; ($self->getNodeName eq "#text") || return; my $t = $self->getData(); ($t =~ m/[^\s]/) && return; $self->getParentNode()->removeChild($self); } ############################################################################### # sub normalizeSpace { my ($self) = @_; forEachNode($self,\&normalizeSpaceCB,undef); } sub normalizeSpaceCB { my ($self) = @_; ($self->getNodeName eq "#text") || return; $self->setData(normalizeSpace($self->getData())) } sub normalizeDashChars { my ($self) = @_; forEachNode($self,\&normalizeDashCharsCB,undef); } sub normalizeDashCharsCB { my ($self) = @_; ($self->getNodeName eq "#text") || return; $self->setData(normalize_DashChars($self->getData())) } sub normalizeDQuoteChars { my ($self) = @_; forEachNode($self,\&normalizeDQuoteCharsCB,undef); } sub normalizeDQuoteCharsCB { my ($self) = @_; ($self->getNodeName eq "#text") || return; $self->setData(normalize_DQuoteChars($self->getData())) } sub normalizeSQuoteChars { my ($self) = @_; forEachNode($self,\&normalizeSQuoteCharsCB,undef); } sub normalizeSQuoteCharsCB { my ($self) = @_; ($self->getNodeName eq "#text") || return; $self->setData(normalize_SQuoteChars($self->getData())) } ############################################################################### # Take a block of contiguous siblings and enclose them in a new intermediate # parent node (which in inserted at the level they *used* to be at). # sub groupSiblings { my ($self, $first, $last, $newParentType) = @_; (defined $first) || return; my $oldParent = $first->getParentNode(); my $newParent = $first->getOwnerDocument()->createElement($newParentType); $oldParent->insertChildBefore($newParent,$first); my $next; for (my $cur = $first; defined $cur; $cur=$next) { $next = $cur->getNextSibling(); my $moving = $oldParent->removeChild($cur); $newParent->insertBefore($moving,undef); } } # groupSiblings ############################################################################### # Remove the given node, promoting all its children. # sub promoteChildren { my ($self) = @_; my $parent = $self->getParentNode(); my $next; for (my $cur = $self->getFirstChild(); defined $cur; $cur=$next) { $next = $cur->getNextSibling(); my $moving = $self->removeChild($cur); $parent->insertBefore($moving,$parent); } $parent->deleteChild($self); } ############################################################################### # Traverse a subtree given its root, and call separate callbacks before # and after traversing each subtree. Callbacks might, for example, # respectively generate the start- and end-tags for the element. Callbacks # are allowed to be undef if not needed. # # If a callback returns true, we stop traversing. # sub forEachNode { my ($self,$callbackA,$callbackB,$depth) = @_; if (!defined $depth) { $depth = 1; } (defined $self) || return(0); my $name = $self->getNodeName(); if (defined $callbackA) { if ($callbackA->($self,$name,$depth)) { return(1); } } if ($self->hasChildNodes()) { #print "Recursing for child nodes at level " . ($depth+1) . "\n"; for my $ch ($self->getChildNodes()) { my $rc = forEachNode($ch,$callbackA,$callbackB,$depth+1); if ($rc) { return(1); } } } if (defined $callbackB) { if ($callbackB->($self,$name,$depth)) { return(1); } } return(0); # succeed } # forEachNode ############################################################################### # Cat together all descendant text nodes, with spaces between. # (at the moment, it leaves a stray $delim at the start) # sub collectAllText { my ($self,$delim) = @_; (defined $self) || return(""); if (!defined $delim) { $delim = " "; } my $name = $self->getNodeName(); my $textBuf = ""; if ($self->getNodeName() eq "#text") { $textBuf = $delim . $self->getData(); } elsif ($self->hasChildNodes()) { for my $ch ($self->getChildNodes()) { $textBuf .= collectAllText($ch,$delim); } } return($textBuf); } # collectAllText ############################################################################### # Collect a subtree as WF XML, with appropriate escaping. Can also # put a delimiter before each start-tag; if it contains '\n', then the # XML will also be hierarchically indented. -color applies. # sub collectAllXML { my ($self,$delim,$depth,$ascii) = @_; if (!defined $delim) { $delim = " "; } if (!defined $depth) { $depth = 1; } if (!defined $ascii) { $ascii = 0; } my $indent = ($delim =~ m/\n/) ? (" " x $depth) : ""; my $name = $self->getNodeName(); my $textBuf = ""; if ($self->getNodeName() eq "#text") { if ($ascii) { $textBuf = escapeXML($self->getData()); } else { $textBuf = escapeASCII($self->getData()); } } else { $textBuf = "$delim$indent<$name" . getEscapedAttributeList($self) . ">"; if ($self->hasChildNodes()) { for my $ch ($self->getChildNodes()) { $textBuf .= collectAllXML($ch,$delim,$depth+1); } } $textBuf .= ""; } return($textBuf); } # collectAllXML ############################################################################### ############################################################################### # Index should be a separate object. # package Dom::Index; sub new { my ($class, $document, $aname) = @_; my %index = (); my $node = $document->getDocumentelement(); do { if ($node->getNodeType==1) { # elements my $key = $node->getAttribute($aname); if (defined $key) { $index{$key} = $node; } } $node = $node->getFolowing(); } while($node); my $self = { document => $document, attrName => $aname, theIndex => \%index, }; bless $self, $class; return(\%index); } sub find { my ($self, $avalue) = @_; my $indexRef = $self->{theIndex}; my %index = %$indexRef; return($index{$avalue}); } ############################################################################### ############################################################################### # package CharStuff; sub new { my ($class) = @_; my $self = { spaceChars => setupSpaces(), dashChars => setupDashes(), sQuoteChars => setupSQuotes(), dQuoteChars => setupDQuotes(), }; bless $self, $class; return($self); } sub escapeAttribute { my ($self,$s) = @_; $s =~ s//]]>/g; $s =~ s/&/&/g; return($s); } sub escapeASCII { my ($self,$s) = @_; $s =~ s/(\P{IsASCII})/ { sprintf("&#x%04x;",ord($1)); }/ge; $s = xmlEscape($s); return($s); } sub unescapeXML { my ($self,$s) = @_; $s =~ s/<//g; $s =~ s/&quo;/"/g; $s =~ s/'/'/g; $s =~ s/&#x([0-9a-f]+);/{ chr(hex($1)); }/gie; $s =~ s/&#([0-9]+);/{ chr($1); }/gie; $s =~ s/&/&/g; return($s); } sub normalize_Space { my ($self,$s) = @_; $s =~ s/\s\s+/ /g; $s =~ s/^\s+//g; $s =~ s/\s+$//g; return($s); } sub normalize_DashChars { my ($self,$s) = @_; $s =~ s/[$self->{dashChars}]/-/g; return($s); } sub normalize_DQuoteChars { my ($self,$s) = @_; $s =~ s/[$self->{dQuoteChars}]/"/g; return($s); } sub normalize_SQuoteChars { my ($self,$s) = @_; $s =~ s/[$self->{sQuoteChars}]/'/g; return($s); } sub normalize_SpaceChars { my ($self,$s) = @_; $s =~ s/[$self->{spaceChars}]/ /g; return($s); } sub setupDashes { my ($self,$s) = @_; my $rc = # Leave out regular hyphen so doesn't mess up regex, and since it's # what we normalize *to*. # chr(0x000ad) . # soft hyphen chr(0x0058a) . # armenian hyphen chr(0x01806) . # mongolian todo soft hyphen chr(0x01b60) . # balinese pameneng (line-breaking hyphen) chr(0x02010) . # 008208) . hyphen) . '-', chr(0x02011) . # non-breaking hyphen, chr(0x02012) . # figure dash chr(0x02013) . # 008211) . ndash, '-', chr(0x02014) . # 008212, mdash, '--', chr(0x02015) . # 008213, horbar, '--', chr(0x02027) . # hyphenation point chr(0x02043) . # hyphen bullet chr(0x02053) . # swung dash #chr(0x21E0) . # LEFTWARDS DASHED ARROW #chr(0x21E1) . # UPWARDS DASHED ARROW #chr(0x21E2) . # RIGHTWARDS DASHED ARROW #chr(0x21E3) . # DOWNWARDS DASHED ARROW chr(0x0229d) . # circled dash chr(0x02448) . # ocr dash # Box-drawing dashes (maybe unlikely, but...): # #chr(0x2504) . # ... LIGHT TRIPLE DASH HORIZONTAL #chr(0x2505) . # ... HEAVY TRIPLE DASH HORIZONTAL #chr(0x2508) . # ... LIGHT QUADRUPLE DASH HORIZONTAL #chr(0x2509) . # ... HEAVY QUADRUPLE DASH HORIZONTAL #chr(0x254C) . # ... LIGHT DOUBLE DASH HORIZONTAL #chr(0x254D) . # ... HEAVY DOUBLE DASH HORIZONTAL chr(0x02e17) . # double oblique hyphen chr(0x02E1A) . # HYPHEN WITH DIAERESIS chr(0x0301c) . # wave dash chr(0x03030) . # wavy dash chr(0x030a0) . # katakana-hiragana double hyphen chr(0x0FE49) . # DASHED OVERLINE chr(0x0FE4D) . # DASHED LOW LINE chr(0x0FE58) . # small em dash chr(0x0fe63) . # small hyphen-minus chr(0x0ff0d) . # fullwidth hyphen-minus ""; return $rc; } sub setupDQuotes { my ($self,$s) = @_; my $dQuotes = chr(0x00AB) . # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK * chr(0x00BB) . # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK * chr(0x201C) . # LEFT DOUBLE QUOTATION MARK chr(0x201D) . # RIGHT DOUBLE QUOTATION MARK chr(0x201E) . # DOUBLE LOW-9 QUOTATION MARK chr(0x201F) . # DOUBLE HIGH-REVERSED-9 QUOTATION MARK chr(0x2358) . # APL FUNCTIONAL SYMBOL QUOTE UNDERBAR chr(0x235E) . # APL FUNCTIONAL SYMBOL QUOTE QUAD chr(0x275D) . # HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT chr(0x275E) . # HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT chr(0x301D) . # REVERSED DOUBLE PRIME QUOTATION MARK chr(0x301E) . # DOUBLE PRIME QUOTATION MARK chr(0x301F) . # LOW DOUBLE PRIME QUOTATION MARK ""; return($dQuotes); } sub setupSQuotes { my ($self,$s) = @_; my $sQuotes = chr(0x2018) . # LEFT SINGLE QUOTATION MARK chr(0x2019) . # RIGHT SINGLE QUOTATION MARK chr(0x201A) . # SINGLE LOW-9 QUOTATION MARK chr(0x201B) . # SINGLE HIGH-REVERSED-9 QUOTATION MARK chr(0x2039) . # SINGLE LEFT-POINTING ANGLE QUOTATION MARK chr(0x203A) . # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK chr(0x275B) . # HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT chr(0x275C) . # HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT chr(0x276E) . # HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT chr(0x276F) . # HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT ""; return($sQuotes); } sub setupSpaces { my ($self,$s) = @_; my $spaces = chr(0x0009) . # TAB chr(0x000A) . # LINE FEED chr(0x000B) . # VERTICAL TAB chr(0x000C) . # FORM FEED chr(0x000D) . # CARRIAGE RETURN chr(0x0020) . # SPACE chr(0x00A0) . # NO-BREAK SPACE chr(0x1680) . # OGHAM SPACE MARK chr(0x2002) . # EN SPACE chr(0x2003) . # EM SPACE chr(0x2004) . # THREE-PER-EM SPACE chr(0x2005) . # FOUR-PER-EM SPACE chr(0x2006) . # SIX-PER-EM SPACE chr(0x2007) . # FIGURE SPACE chr(0x2008) . # PUNCTUATION SPACE chr(0x2009) . # THIN SPACE chr(0x200A) . # HAIR SPACE chr(0x200B) . # ZERO WIDTH SPACE chr(0x202F) . # NARROW NO-BREAK SPACE chr(0x205F) . # MEDIUM MATHEMATICAL SPACE chr(0x2409) . # SYMBOL FOR HORIZONTAL TABULATION chr(0x240B) . # SYMBOL FOR VERTICAL TABULATION chr(0x2420) . # SYMBOL FOR SPACE chr(0x3000) . # IDEOGRAPHIC SPACE chr(0x303F) . # IDEOGRAPHIC HALF FILL SPACE chr(0xFEFF) . # ZERO WIDTH NO-BREAK SPACE (= byte-order mark) ""; return($spaces); } ############################################################################### # sub showUsage { print " =head1 Notes An XML-manipulation package that sits on top of XML::DOM and provides higher-level, XPath-like methods. It's very often useful to walk around the XPath 'axes' in a DOM tree. When doing so, it's often useful to consider only with element nodes, or only #text nodes, or only element nodes of a certain element type, or with a certain attribute or attribute-value pair. Thus, for the relevant XPath axes, there are calls such as these (illustrated for the Child axis, but you can substitute Descendant, PrecedingSibling, FollowingSibling, Preceding, Following, or Ancestor): getChildByAttribute(node,n,type,attributeName,attributeValue) This will return the n-th child of node, which is of the given element type, and has the given attributeName=attributeValue pair. =over =item * If type is undefined or '', any node type is allowed; if it is '*', any *element* is allowed (in both cases, attribute constraints may still apply). =item * If attributeName is undef or '', no attribute is required. =item * If attributeValue is undef or '', the attribute named by attributeName must be present, but may have any value (including ''). =back =head3 Other methods =over =item getInheritedAttribute(name) =item getLeftBranch() -- returns the leftmost descendant of node. This is not the first descendant, which would be the first child, since nodes are ordered before their descendants. =item getRightBranch() -- returns the rightmost (last) descendant of node. =item getDepth() -- returns how deeply nested the node is. =item getFQGI() -- returns the list of element types of the node's ancestors, from the root down, separated by '/'. =item getXPointer() -- returns the XPointer child sequence to node. That is, the list of child-numbers for all the ancestors of the node, from the root down, separated by '/'. This is a fine unique name for the node's location in the document. =item XPointerCompare(x1,x2) -- takes two XPointer child-sequences as just described, and compares them for document order, returning -1, 0, or 1. This does not require actually looking at the tree at all. =item nodeCompare(n1,n2) -- takes two XML::DOM nodes, and compares them for document order, returning -1, 0, or 1. =item XPointerInterpret(document,x) -- Interprets the XPointer child sequence x, in the context of the given document, and returns the node it identifies, or undef if there is no such node. =item getEscapedAttributeList() -- returns the entire attribute list for node, escaped as needed within an XML start-tag. =item deleteWhiteSpaceNodes() -- deletes all of them. =item normalizeAllSpace() -- does the equivalent of XSLT normalize-space() on all text nodes. =item mergeWithFollowingSibling() -- =item groupSiblings(typeForNewParent) -- =item promoteChildren() -- =item forEachNode(preCallback,postCallback) -- traverse the subtree headed at node, calling the callbacks before and after traversing each node's subtree. =item collectAllText(delimiter) -- concatenates together the content of all the text nodes in a subtree, putting the delimiter in between. =item collectAllXML() -- generates the XML representation for the subtree headed at node. If the delimiter contains a newline, the subtree will include newlines and indentation. =back =head2 Index package =over =item buildIndex(attributeName) -- Returns a hash table in which each entry has the value of the specified attribute as key, and the element on which the attribute occurred as value. This is similar to the XSLT 'key' feature. =item find(value) =back =head2 Character stuff package =over =item escapeAttribute(string) -- escapes the string as needed for it to fit in an attribute value. =item escapeXML(string) -- escapes the string as needed for it to fit in XML text content. =item escapeASCII(string) -- escapes the string as needed for it to fit in XML text content, *and* recodes and non-ASCII characters as XML numeric characters references. =item unescapeXML(string) -- Changes XML numeric characters references, as well as references to the 5 pre-defined XML named entities, into the corresponding literal characters. =back =head1 Ownership This work by Steven J. DeRose is licensed under a Creative Commons Attribution-Share Alike 3.0 Unported License. For further information on this license, see http://creativecommons.org/licenses/by-sa/3.0/. The author's present email is sderose at acm.org. For the most recent version, see http://www.derose.net/steve/utilities/. =cut "; }