#!/usr/bin/perl -w # # Make a graphvix graph of element containment in a DTD. # # Written by Steven J. DeRose, mid-2006, sderose@acm.org. # # 2007-01-11 sjd: Add -v, -help-prune, -nopcdata. Cleaner dcl of %phrase. # Add stuff for handling actual document. # 2007-12-12 sjd: strict. fix some small bugs. # 2010-09-12 sjd: Clean up. # # To do: # Allow to map only what occurs in the document, not the whole DTD. # Option to do only descendants of certain element(s). (in progress) # Be able to hilite sub-graph used by a specific document # Add repetition indicator on arc (tricky: (a|b)*) # indicate OR vs. SEQ models in node # Color mixed-content elements? # Emphasize the document element type # Prune namespaces or specific typenames # Distinguish attr, root, pcdata by color/shape # option to not coalesce attributes, or coalesce per element # be able to write out as xtm, rdf, or prolog as well. # use strict; my $version = "2010-11-10"; my $default_catalog = "$ENV{XML_CATALOG}"; my $catalog = $default_catalog; my $attrs = 0; my $attrCollapse = 0; my $directed = 1; my $docarcs = 0; my $docElement = ""; my %NStoIgnore = ( "mml" => 1 ); my $outFormat = "graphviz"; my $mml = 0; my $pcdata = 1; my $prune = 0; my $quiet = 0; my $verbose = 0; my %elementOccurrences = (); # How often each element occurred my %documentArcs = (); # How often each parent/child pair # Set up a list of commom phrase-level element types, that are omitted # if the '-prune' option is set. # my %phrase = (); foreach my $e # UPDATE THE PERLDOC IF YOU CHANGE THIS ( qw / i it ital italic b bd bold bi boldital bolditalic em emph emphasis font span sc scap smallcap mono tt monospace rm rom roman strike strikethru sub sup super inf infsup supinf overline overline-start overline-end u ul us underline underline-start underline-end /) { $phrase{$e} = 1; } # Process arguments # while ($ARGV[0]) { if (index($ARGV[0],"--")==0) { $ARGV[0] = substr($ARGV[0],1); } if ($ARGV[0] eq "-q") { $quiet = 1; } elsif ($ARGV[0] eq "-a") { $attrs = 1; } elsif ($ARGV[0] eq "-ac") { $attrCollapse = 1; } elsif ($ARGV[0] eq "-c") { shift; $catalog = $ARGV[0]; (-f $catalog) || die "Cannot find catalog file '$catalog'.\n"; } elsif ($ARGV[0] eq "-docarcs") { $docarcs = 1; } elsif ($ARGV[0] eq "-help-prune") { die "Elements ignored with -prune:\n " . join(", ",sort keys %phrase) . ".\n"; } elsif ($ARGV[0] eq "-mml") { delete $NStoIgnore{"mml"}; } elsif ($ARGV[0] eq "-nons") { shift; $NStoIgnore{$ARGV[0]} = 1; } elsif ($ARGV[0] eq "-nopcdata") { $pcdata = 0; } elsif ($ARGV[0] eq "-out") { shift; $outFormat = $ARGV[0]; die "-out is not yet supported, sorry.\n"; } elsif ($ARGV[0] eq "-prune") { $prune = 1; } elsif ($ARGV[0] eq "-nodir") { $directed = 0; } elsif ($ARGV[0] eq "-v") { $verbose = 1; } elsif (substr($ARGV[0],0,1) eq "-") { ($ARGV[0] eq '-h' or $ARGV[0] eq '-help') || print "Unknown option '$ARGV[0]'.\n"; system "perldoc makeDtdGraph"; exit; } else { last; } shift; } # options my $file = $ARGV[0]; (defined $file && -f $ARGV[0]) || die "Cannot find DTD file '$file'.\n"; ############################################################################### # Info about each element, indexed by element type name # my %children = (); # value = space-separated children my %operators = (); my %repOperators = (); my @attrTable = (); # Info about attr, by attr name or elem/atrr name my @tagStack = (); # Collect the element dcls # parseDocument(); if ($verbose) { warn "Just after parse, list has:\n"; foreach my $x (sort keys %children) { warn rpad($x,20) . " " . $children{$x} . "\n"; } } if ($prune) { prune(); } # Write out the Graphviz file. # print "// Graphviz dot file, written by makedtdgraph, " . "version of $version by sderose\@acm.org.\n"; ($directed) && print "di"; print "graph DTD { size=\"8.5,11\"; label = \"DTD parent/child chart, drawn by 'makedtdgraph'.\n\"; fontsize=12; "; my $arrow = ($directed) ? "->":"--"; my $arcCount = 0; foreach my $x (sort keys %children) { my @kids = split(" *",$children{$x}); foreach my $k (@kids) { if (!$k) { next; } if ($k eq "EMPTY") { last; } (my $kns = $k) =~ s/:.*$//; if ($kns ne "" and defined $NStoIgnore{$kns}) { next; } if ($prune and $phrase{lc($k)}) { next; } if ($outFormat eq "graphviz") { print rpad("\"$x\"",25) . " $arrow \"$k\""; # Could add [label="...",color=...] arrowhead, arrowtail, # labelfontcolor, labeldistance, tooltip, weight, etc. print ";\n"; } elsif ($outFormat eq "prolog") { print "??? $x $k\n"; } elsif ($outFormat eq "owl") { print "$x:\n Facts: Child $k\n"; } elsif ($outFormat eq "xtm") { print "??? $x $k\n"; } elsif ($outFormat eq "csv") { print "$x, $k\n"; } $arcCount++; } } # Possibly these have to be listed first, not last? print "\n\n// Now hilite nodes that occurred in the document.\n"; my %occurred; foreach my $n (sort keys %documentArcs) { (my $p = $n) =~ s/\/.*$//; (my $c = $n) =~ s/^.*\///; $occurred{$p}++; $occurred{$c}++; } # Show attributes if requested if ($attrs || $attrCollapse) { print "\n\n// Attributes:\n"; print "{ node [style=filled, color=green, fontsize=9]\n"; while (my $a = shift @attrTable) { print "$a;\n"; } print "}\n"; } if (scalar keys %occurred) { print "\nnode [style=filled,color=lightgrey]\n"; foreach my $e (sort keys %occurred) { print "$e; "; } print "\n\n"; } print "}\n"; if ($docarcs) { print "\n\n// =====================================================\n\n"; print "// Occurrence counts of parent/child pairs in the document:\n"; print "//\n"; foreach my $pair (sort keys %documentArcs) { print "//\t$pair\t $documentArcs{$pair}\n"; } print "\n\n"; } my $nodeCount = scalar keys %children; ($quiet) || warn "Written: $nodeCount element nodes, $arcCount child arcs.\n"; exit; ############################################################################### # Remove phrase-level elements # sub prune { for my $k (keys %children) { if ($phrase{lc($k)}) { delete $children{$k}; } } } sub pruneToDescendantsOf { my $root = $_[0]; ($children{$root} ne "") || die "Element '$root' not found.\n"; my %descendants = (); $descendants{$root}++; for my $ch (split(/\s+/,$children{$root})) { if ($descendants{$ch} <= 0) { $descendants{$ch}++; # recurse } } for my $c (sort keys %children) { if (!$descendants{$c}) { delete $children{$c}; delete $operators{$c}; delete $repOperators{$c}; } } } # pruneToDescendantsOf sub rpad { my $rc = $_[0]; my $need = $_[1] - length($rc); if ($need > 0) { $rc .= " " x $need; } return ($rc); } ############################################################################### # Regarding the XML::Parser module: # API doc is at http://search.cpan.org/~msergeant/XML-Parser/Parser.pm # sub parseDocument { # Make a trivial XML document that references the DTD. # Must get root element type right or parser gets unhappy. my $theDoc = "/tmp/doc.xml"; my $path = "$ENV{PWD}/$file"; my $rootname = "art"; open ADOC, ">$theDoc" || die "Can't open '$theDoc'\n"; print ADOC "\n"; print ADOC "\n"; print ADOC "<$rootname>\n"; close ADOC; use XML::Parser; #use XML::Catalog; #my $catalog=XML::Catalog->new($catalog); my $parser = new XML::Parser(ErrorContext => 2, ParseParamEnt => 1); $parser->setHandlers(Element => \&elementDclHandler, Doctype => \&doctypeHandler); if ($attrs || $attrCollapse) { $parser->setHandlers(Attlist => \&attlistDclHandler); } $parser->parsefile($theDoc); } # sub parseDocument sub doctypeHandler { my ($parser, $docel, $docsys, $docpub, $internal) = @_; #warn "Got doctype, sysid is '$docsys'\n"; ($docsys eq "") || (-f $docsys) || warn "No file at system id loc '$docsys'.\n"; } sub elementDclHandler { my ($parser, $name, $rest) = @_; if ($mml == 0 and substr($name,0,4) eq "mml:") { return; } (my $tokens = $rest) =~ s/[()|,?*+&]/ /g; # reduce to just tokens (my $ops = $rest) =~ s/[^|,&]//g; # what operators are used? (my $reps = $rest) =~ s/[^+?*]//g; # what repetition indicators? if (!$pcdata) { $tokens =~ s/\#PCDATA\s*//; } $children{$name} = $tokens; $operators{$name} = $ops; $repOperators{$name} = $reps; } # sub elementDclHandler sub attlistDclHandler { my ($p, $elname, $attname, $type, $dft, $fixed) = @_; if ($mml == 0 and substr($elname,0,4) eq "mml:") { return; } if ($attrCollapse) { push @attrTable, (rpad($elname,25) . " -> $attname"); } else { push @attrTable, (rpad($elname,25) . " -> $elname/$attname"); } } # Counts are kept in a hashtable keyd by 'parent/child'. # sub startHandler { my $parser = shift; my $gi = shift; if (!$docElement) { $docElement = $gi; } my $depth = scalar @tagStack; my $parent = ($depth > 0) ? $tagStack[$depth - 1] : "#ROOT"; push(@tagStack,$gi); $documentArcs{"$parent/$gi"}++; $elementOccurrences{$gi}++; } sub endHandler { pop(@tagStack); } sub charHandler { my $depth = scalar @tagStack; my $parent = ($depth > 0) ? $tagStack[$depth - 1] : "#ROOT"; my $text = $_[1]; (my $nonwhite = $text) =~ s/\s//g; if ($nonwhite) { $documentArcs{"$parent/#PCDATA"}++; } } ############################################################################### # sub showUsage { print " =head1 Usage makedtdgraph dtdfile [>resultfile] Extracts the list of element declarations from a DTD, and makes a graphviz chart of their containment relationships (see www.graphviz.org), or a some other representation of those relationships. Can also count what parent/child instances occur in a given document. =head1 Options =over =item * B<-a> Include attributes, too (not recommended for large DTDs, because the graph gets really ugly). =item * B<-ac> Include attributes, but only once for each distinct attribute name, not once for each element+attribute. =item * B<-c> I Use specified XML catalog (not yet supported). =item * B<-docArcs> Dump a list of parent/child pairs, and how many times each was actually seen in the document (assuming there was a document...). This may be different from the set of pairs the DTD permits. =item * B<-help-prune> Show the list of tags the I<-prune> will cause to be ignored. =item * B<-mml> Include mathml (mml:) tags (otherwise omitted). =item * B<-nons> I Omit tags with namespace prefix 'ns'. =item * B<-out> I
Write out as Graphviz (default), Prolog, Xtm, OWL (Manchester), or CSV (only Graphviz is finished). =item * B<-prune> Omit a variety of familiar phrase-level (inline) element types. This is useful when such elements have a very wide range of permitted contexts in the DTD, and thus would clutter up the graph. The built-in list of element types is: i it ital italic b bd bold bi boldital bolditalic em emph emphasis font span sc scap smallcap mono tt monospace rm rom roman strike strikethru sub sup super inf infsup supinf overline overline-start overline-end u ul us underline underline-start underline-end =item * B<-nopcdata> Omit mention of #PCDATA. =item * B<-q> Suppress most messages. =item * B<-undir> Make an undirected graph, not directed. =item * B<-v> Supply even more messages. =item * B<-version> Display version info and exit. =back =head1 Known bugs and limitations Does not indicate required/repeatable status, or ordering of children. No way to change the list of elements for I<-prune> to ignore. =head1 Related commands makeDtdTable -- makes a chart comparing two or more DTDs. =head1 Ownership This work by Steven J. DeRose is licensed under a Creative Commons Attribution-Share Alike 3.0 Unported License. For further information on this license, see http://creativecommons.org/licenses/by-sa/3.0/. The author's present email is sderose at acm.org. For the most recent version, see http://www.derose.net/steve/utilities/. =cut "; }