#!/usr/bin/perl -w # # Extract the DOCTYPE information from an XML file. # Written to clean up doctype handling in regression testing system. # # Written 2006-08 by Steven J. DeRose. # # Todo: # Switch to use actual parser and grab the doctype/first element. # That way it won't be confused by really strange stuff (although it's # pretty good already). # use strict; my $version = "2010-09-12"; my $quiet = 0; my $fieldSep = "\t"; my $ret = ""; # default # Process arguments while ($ARGV[0]) { if (index($ARGV[0],"--")==0) { $ARGV[0] = substr($ARGV[0],1); } if ($ARGV[0] eq "-q") { $quiet = 1; } elsif ($ARGV[0] eq "-root") { $ret = "root"; } elsif ($ARGV[0] eq "-public") { $ret = "public"; } elsif ($ARGV[0] eq "-system") { $ret = "system"; } elsif ($ARGV[0] eq "-filename") { $ret = "+filename+"; } elsif ($ARGV[0] eq "-d") { shift; $fieldSep = $ARGV[0]; } elsif ($ARGV[0] eq "-version") { warn "Version of $version, by Steven J. DeRose/\n"; exit; } elsif (substr($ARGV[0],0,1) eq "-") { ($ARGV[0] eq '-h' or $ARGV[0] eq '-help') || warn "Unknown option '$ARGV[0]'. Usage:\n"; showUsage(); exit; } else { last; } shift; } (-e $ARGV[0]) || die "Couldn't find file '$ARGV[0]'.\n"; my $file; while ($file = shift) { if (!-f $file) { next; } my $gotIt = 0; # Did we find something yet? my $root = ""; my $pub = ""; my $sys = ""; my $type = ""; open IN, "<$file"; while (my $line = ) { # Just in case there's a DOCTYPE inside a commend or PI: $line =~ s///g; $line =~ s/<\?.*\?>//g; # If we see start of DOCTYPE, load til we get the end then parse if (index($line,"= 0) { while (index($line,">") < 0 && index($line,"[") < 0) { # load up as needed... my $nextline; if (!($nextline = )) { die "Hit EOF before '>'\n"; } $nextline =~ s/]>//g; # Nuke comments and dcls. $nextline =~ s/<\?[^>]\?>//g; # Nuke PIs if (index($nextline,"<") >= 0) { die "Hit '<' inside of DOCTYPE in $file.\n"; } $line .= $nextline; } # Now we should have the whole DOCTYPE, so parse it up $line =~ s/[\r\n\t]/ /g; # Don't get confused by PC/Mac files $line =~ s/.*\n].*|\1|; chomp($root); $gotIt = 1; # Later DOCTYPE wouldn't count anyway last; } } # while IN close IN; # Report results in the form requested # Write a line even if we failed, in case doing multiple files. if ($ret eq "root") { print "$root"; } elsif ($ret eq "public") { print "$pub"; } elsif ($ret eq "system") { print "$sys"; } elsif ($ret eq "+filename+") { print "$file$fieldSep$root$fieldSep$pub$fieldSep$sys\n"; } else { print "$root$fieldSep$pub$fieldSep$sys\n"; } ($gotIt) || ($quiet) || warn "Unable to find DOCTYPE or root element in '$file'.\n"; } # while ARGV exit; # Extract a single or double quoted string from first arg, without the # quotes, and return it. Remove it from the source string as well. sub getQuoted { my $rc; $_[0] =~ s/^\s*//; if (substr($_[0],0,1) eq "'") { $_[0] =~ s/^\'([^\']*)\'//; $rc = $1; } elsif (substr($_[0],0,1) eq "\"") { $_[0] =~ s/^\"([^\"]*)\"//; $rc = $1; } else { warn "Bad quote type at '$_[0]' in $file.\n"; $rc = ""; } return($rc); } ############################################################################### sub showUsage() { print " =head1 Usage finddoctype [options] xmlfile1 xmlfile2... Extracts the DOCTYPE information from the XML file(s). By default, returns root-element-type\\tpublic-id\\tsystem-id =head1 Options =over -root Return only the root element type -public Return only the public identifier -system Return only the system identifier -filename Include original filename before all default items -d char Use char to separate fields (default tab) -q Suppress most messages. -version Report author and version number (sjd) Notes: Returned identifiers are not quoted. Works with multi-line DOCTYPEs, but could miss a DOCTYPE inside a multi-line comment or PI. That doesn't seem likely enough to merit fixing. =head1 Related commands Useful with converter regression testing, to replace grepping and consequent hand-checking. "; }