#!/usr/bin/perl -w
#
# Make a graphvix graph of element containment in a DTD.
#
# Written by Steven J. DeRose, mid-2006, sderose@acm.org.
#
# 2007-01-11 sjd: Add -v, -help-prune, -nopcdata. Cleaner dcl of %phrase.
#    Add stuff for handling actual document.
# 2007-12-12 sjd: strict. fix some small bugs.
# 2010-09-12 sjd: Clean up.
#
# To do:
#    Allow to map only what occurs in the document, not the whole DTD.
#    Option to do only descendants of certain element(s). (in progress)
#    Be able to hilite sub-graph used by a specific document
#    Add repetition indicator on arc (tricky: (a|b)*)
#    indicate OR vs. SEQ models in node
#    Color mixed-content elements?
#    Emphasize the document element type
#    Prune namespaces or specific typenames
#    Distinguish attr, root, pcdata by color/shape
#    option to not coalesce attributes, or coalesce per element
#    be able to write out as xtm, rdf, or prolog as well.
#

use strict;

my $version = "2010-11-10";

my $default_catalog = "$ENV{XML_CATALOG}";
my $catalog = $default_catalog;

my $attrs        = 0;
my $attrCollapse = 0;
my $directed     = 1;
my $docarcs      = 0;
my $docElement   = "";
my %NStoIgnore   = ( "mml" => 1 );
my $outFormat    = "graphviz";
my $mml          = 0;
my $pcdata       = 1;
my $prune        = 0;
my $quiet        = 0;
my $verbose      = 0;


my %elementOccurrences = ();      # How often each element occurred
my %documentArcs = ();            # How often each parent/child pair


# Set up a list of commom phrase-level element types, that are omitted
# if the '-prune' option is set.
#
my %phrase = ();
foreach my $e                     # UPDATE THE PERLDOC IF YOU CHANGE THIS
    ( qw /
      i it ital italic
      b bd bold
      bi boldital bolditalic
      em emph emphasis font span 
      sc scap smallcap
      mono tt monospace
      rm rom roman
      strike strikethru
      sub sup super inf infsup supinf
      overline overline-start overline-end
      u ul us underline underline-start underline-end /)
{
    $phrase{$e} = 1;
}

# Process arguments
#
while ($ARGV[0]) {
    if (index($ARGV[0],"--")==0) { $ARGV[0] = substr($ARGV[0],1); }
    if ($ARGV[0] eq "-q")              { $quiet = 1; }
    elsif ($ARGV[0] eq "-a")           { $attrs = 1; }
    elsif ($ARGV[0] eq "-ac")          { $attrCollapse = 1; }
    elsif ($ARGV[0] eq "-c")           { shift; $catalog = $ARGV[0];
        (-f $catalog) || die "Cannot find catalog file '$catalog'.\n";
    }
    elsif ($ARGV[0] eq "-docarcs")     { $docarcs = 1; }
    elsif ($ARGV[0] eq "-help-prune")  { 
        die "Elements ignored with -prune:\n    " . 
            join(", ",sort keys %phrase) . ".\n";
    }
    elsif ($ARGV[0] eq "-mml")         { delete $NStoIgnore{"mml"}; }
    elsif ($ARGV[0] eq "-nons")        { shift; $NStoIgnore{$ARGV[0]} = 1; }
    elsif ($ARGV[0] eq "-nopcdata")    { $pcdata = 0; }
    elsif ($ARGV[0] eq "-out")         { shift; $outFormat = $ARGV[0];
        die "-out is not yet supported, sorry.\n";
    }
    elsif ($ARGV[0] eq "-prune")       { $prune = 1; }
    elsif ($ARGV[0] eq "-nodir")       { $directed = 0; }
    elsif ($ARGV[0] eq "-v")           { $verbose = 1; }
    elsif (substr($ARGV[0],0,1) eq "-") {
       ($ARGV[0] eq '-h' or $ARGV[0] eq '-help')
          || print "Unknown option '$ARGV[0]'.\n";
       system "perldoc makeDtdGraph";
       exit;
    }
    else { last; }
    shift;
} # options

my $file = $ARGV[0];
(defined $file && -f $ARGV[0]) || die
    "Cannot find DTD file '$file'.\n";


###############################################################################
# Info about each element, indexed by element type name
#
my %children     = ();   # value = space-separated children
my %operators    = ();
my %repOperators = ();
my @attrTable    = ();   # Info about attr, by attr name or elem/atrr name
my @tagStack     = ();


# Collect the element dcls
#
parseDocument();
if ($verbose) {
    warn "Just after parse, list has:\n";
    foreach my $x (sort keys %children) { warn rpad($x,20) . " " . 
                                           $children{$x} . "\n"; }
}

if ($prune) { prune(); }

# Write out the Graphviz file.
#
print "// Graphviz dot file, written by makedtdgraph, "
    . "version of $version by sderose\@acm.org.\n";
($directed) && print "di";
print "graph DTD {
    size=\"8.5,11\";

    label = \"DTD parent/child chart, drawn by 'makedtdgraph'.\n\";
    fontsize=12;

";

my $arrow = ($directed) ? "->":"--";
my $arcCount = 0;

foreach my $x (sort keys %children) {
     my @kids = split("  *",$children{$x});
     foreach my $k (@kids) {
         if (!$k) { next; }
         if ($k eq "EMPTY") { last; }
         (my $kns = $k) =~ s/:.*$//;
         if ($kns ne "" and defined $NStoIgnore{$kns}) { next; }
         if ($prune and $phrase{lc($k)}) { next; }
         if ($outFormat eq "graphviz") {
             print rpad("\"$x\"",25) . " $arrow \"$k\"";
             # Could add [label="...",color=...] arrowhead, arrowtail,
             # labelfontcolor, labeldistance, tooltip, weight, etc.
             print ";\n";
         }
         elsif ($outFormat eq "prolog") {
             print "??? $x $k\n";
         }
         elsif ($outFormat eq "owl") {
             print "$x:\n    Facts: Child $k\n";
         }
         elsif ($outFormat eq "xtm") {
             print "??? $x $k\n";
         }
         elsif ($outFormat eq "csv") {
             print "$x, $k\n";
         }

         $arcCount++;
     }
}

# Possibly these have to be listed first, not last?
print "\n\n// Now hilite nodes that occurred in the document.\n";
my %occurred;
foreach my $n (sort keys %documentArcs) {
    (my $p = $n) =~ s/\/.*$//;
    (my $c = $n) =~ s/^.*\///;
    $occurred{$p}++;
    $occurred{$c}++;
}

# Show attributes if requested
if ($attrs || $attrCollapse) {
    print "\n\n// Attributes:\n";
    print "{ node [style=filled, color=green, fontsize=9]\n";
    while (my $a = shift @attrTable) {
        print "$a;\n";
    }
    print "}\n";
}

if (scalar keys %occurred) {
    print "\nnode [style=filled,color=lightgrey]\n";
    foreach my $e (sort keys %occurred) {
        print "$e; ";
    }
    print "\n\n";
}

print "}\n";


if ($docarcs) {
    print "\n\n// =====================================================\n\n";
    print "// Occurrence counts of parent/child pairs in the document:\n";
    print "//\n";
    foreach my $pair (sort keys %documentArcs) {
        print "//\t$pair\t $documentArcs{$pair}\n";
    }
    print "\n\n";
}

my $nodeCount = scalar keys %children;
($quiet) || warn "Written: $nodeCount element nodes, $arcCount child arcs.\n";

exit;


###############################################################################
# Remove phrase-level elements
#
sub prune {
    for my $k (keys %children) {
        if ($phrase{lc($k)}) {
            delete $children{$k};
        }
    }
}

sub pruneToDescendantsOf {
	my $root = $_[0];
	($children{$root} ne "") || die "Element '$root' not found.\n";
	my %descendants = ();
	$descendants{$root}++;
    for my $ch (split(/\s+/,$children{$root})) {
		if ($descendants{$ch} <= 0) {
			$descendants{$ch}++;
			# recurse
		}
	}
	for my $c (sort keys %children) {
		if (!$descendants{$c}) {
			delete $children{$c};
			delete $operators{$c};
			delete $repOperators{$c};
		}
	}
} # pruneToDescendantsOf

sub rpad {
   my $rc = $_[0];
   my $need = $_[1] - length($rc);
   if ($need > 0) {
      $rc .= " " x $need;
   }
   return ($rc);
}


###############################################################################
# Regarding the XML::Parser module:
# API doc is at http://search.cpan.org/~msergeant/XML-Parser/Parser.pm
#
sub parseDocument {
    # Make a trivial XML document that references the DTD.
    # Must get root element type right or parser gets unhappy.
    my $theDoc = "/tmp/doc.xml";
    my $path = "$ENV{PWD}/$file";
    my $rootname = "art";
    open ADOC, ">$theDoc" || die "Can't open '$theDoc'\n";
    print ADOC "<?xml version='1.0'?>\n";
    print ADOC "<!DOCTYPE $rootname SYSTEM '$path'>\n";
    print ADOC "<$rootname></$rootname>\n";
    close ADOC;

    use XML::Parser;
    #use XML::Catalog;
    #my $catalog=XML::Catalog->new($catalog);

    my $parser = new XML::Parser(ErrorContext => 2, ParseParamEnt => 1);
    $parser->setHandlers(Element => \&elementDclHandler,
                         Doctype => \&doctypeHandler);
    if ($attrs || $attrCollapse) {
        $parser->setHandlers(Attlist => \&attlistDclHandler);
    }
    $parser->parsefile($theDoc);
} # sub parseDocument

sub doctypeHandler {
    my ($parser, $docel, $docsys, $docpub, $internal) = @_;
    #warn "Got doctype, sysid is '$docsys'\n";
    ($docsys eq "") || (-f $docsys) ||
        warn "No file at system id loc '$docsys'.\n";
}

sub elementDclHandler {
    my ($parser, $name, $rest) = @_;
    if ($mml == 0 and substr($name,0,4) eq "mml:") { return; }
    (my $tokens = $rest) =~ s/[()|,?*+&]/ /g; # reduce to just tokens
    (my $ops = $rest)    =~ s/[^|,&]//g;      # what operators are used?
    (my $reps = $rest)   =~ s/[^+?*]//g;      # what repetition indicators?
    if (!$pcdata) {
        $tokens =~ s/\#PCDATA\s*//;
    }
    $children{$name} = $tokens;
    $operators{$name} = $ops;
    $repOperators{$name} = $reps;
} # sub elementDclHandler


sub attlistDclHandler {
    my ($p, $elname, $attname, $type, $dft, $fixed) = @_;
    if ($mml == 0 and substr($elname,0,4) eq "mml:") { return; }
    
    if ($attrCollapse) {
        push @attrTable, (rpad($elname,25) . " -> $attname");
    }
    else {
        push @attrTable, (rpad($elname,25) . " -> $elname/$attname");
    }       
}


# Counts are kept in a hashtable keyd by 'parent/child'.
#
sub startHandler {
    my $parser = shift;
    my $gi = shift;
    if (!$docElement) { $docElement = $gi; }
    my $depth = scalar @tagStack;
    my $parent = ($depth > 0) ? $tagStack[$depth - 1] : "#ROOT";
    push(@tagStack,$gi);
    $documentArcs{"$parent/$gi"}++;
    $elementOccurrences{$gi}++;
}

sub endHandler {
    pop(@tagStack);
}

sub charHandler {
    my $depth = scalar @tagStack;
    my $parent = ($depth > 0) ? $tagStack[$depth - 1] : "#ROOT";
    my $text = $_[1];
    (my $nonwhite = $text) =~ s/\s//g;
    if ($nonwhite) {
        $documentArcs{"$parent/#PCDATA"}++;
    }
}


###############################################################################
#
sub showUsage {
    print "

=head1 Usage

makedtdgraph dtdfile [>resultfile]

Extracts the list of element declarations from a DTD, and makes a graphviz
chart of their containment relationships (see www.graphviz.org), or
a some other representation of those relationships.

Can also count what parent/child instances occur in a given document.


=head1 Options

=over

=item * B<-a>         Include attributes, too (not recommended for large DTDs,
because the graph gets really ugly).

=item * B<-ac>        Include attributes, but only once for each distinct
attribute name, not once for each element+attribute.

=item * B<-c> I<file>    Use specified XML catalog (not yet supported).

=item * B<-docArcs>   Dump a list of parent/child pairs, 
and how many times each was actually
seen in the document (assuming there was a document...).
This may be different from the set of pairs the DTD permits.

=item * B<-help-prune> Show the list of tags the I<-prune> will cause
to be ignored.

=item * B<-mml>       Include mathml (mml:) tags (otherwise omitted).

=item * B<-nons> I<ns>   Omit tags with namespace prefix 'ns'.

=item * B<-out> I<form>  Write out as Graphviz (default), 
Prolog, Xtm, OWL (Manchester), or CSV
(only Graphviz is finished).

=item * B<-prune>     Omit a
variety of familiar phrase-level (inline) element types. This is useful 
when such elements have a very wide range of permitted contexts in the DTD,
and thus would clutter up the graph. The built-in list of element types is:

    i it ital italic
    b bd bold
    bi boldital bolditalic
    em emph emphasis font span 
    sc scap smallcap
    mono tt monospace
    rm rom roman
    strike strikethru
    sub sup super inf infsup supinf
    overline overline-start overline-end
    u ul us underline underline-start underline-end

=item * B<-nopcdata>  Omit mention of #PCDATA.

=item * B<-q>         Suppress most messages.

=item * B<-undir>     Make an undirected graph, not directed.

=item * B<-v>         Supply even more messages.

=item * B<-version>   Display version info and exit.

=back


=head1 Known bugs and limitations

Does not indicate required/repeatable status, or ordering of children.

No way to change the list of elements for I<-prune> to ignore.


=head1 Related commands

makeDtdTable -- makes a chart comparing two or more DTDs.


=head1 Ownership

This work by Steven J. DeRose is licensed under a Creative Commons 
Attribution-Share Alike 3.0 Unported License. For further information on
this license, see http://creativecommons.org/licenses/by-sa/3.0/.

The author's present email is sderose at acm.org.

For the most recent version, see http://www.derose.net/steve/utilities/.

=cut
";
}