#!/usr/bin/perl -w
#
# makeXmlTestFile
#
# Writes, on STDOUT, a file intended to stress-test an XML system on various
# quantities (nesting depth, fanout, name length, character ranges, ...).
#
# 2010-02-05: Written by Steven J. DeRose..
#
# To do:
#     Extreme cases of: nnamespaces, synonymous namespace prefixes,
#         colon and underscore as namestart.
#     xml dcls with varying quotes, encodings, versions, whitespace.
#     attr quoting, '"' "'"
#     linelength
#     DTD constructs, entity and notation dcls, p ents,
#         paths for external entities,....
#     nondeterministic content models
#     byte-order marks
#     Display Unicode char names.
#
# NOTE(review): most literal output strings below contain only text and
# newlines, with no element markup. It looks as if the markup was stripped
# from this copy of the file at some point -- confirm against an upstream
# copy before relying on the output being XML. The strings are reproduced
# here exactly as found.
#
use strict;
use warnings;
use Getopt::Long;
use charnames ':full';

my $version        = "2010-09-12";
my $dft_size       = 257;      # Size applied to every length test by -all.

# What to test (0 means "skip this test").
my $all            = 0;
my $attrLength     = 0;
my $cdata          = 0;
my $charRefLength  = 0;
my $depth          = 0;
my $entNameLength  = 0;
my $entRange       = 0;
my $fanout         = 0;
my $minChar        = 0;
my $maxChar        = 0;
my $nameChars      = 0;
my $nameLength     = 0;
my $numAttrs       = 0;
my $textNodeLength = 0;
my $tokenLength    = 0;

# General options. $quiet, $tickInterval, $unicode and $verbose are accepted
# on the command line but are not read anywhere else in this file.
my $indent         = 0;
my $istring        = " ";
my $olineends      = "U";
my $quiet          = 0;
my $randomize      = 0;
my $tickInterval   = 10000;
my $unicode        = 0;
my $verbose        = 0;


###############################################################################
#
Getopt::Long::Configure("ignore_case");
my $result = GetOptions(
    "attrLength=n"        => \$attrLength,
    "cdata!"              => \$cdata,
    "charRefLength=n"     => \$charRefLength,
    "depth=n"             => \$depth,
    "entNameLength=n"     => \$entNameLength,
    "entRange=n"          => \$entRange,
    "fanout=n"            => \$fanout,
    # "minCharmy" was the (misspelled) name this option used to have; it is
    # kept as an alias so existing command lines keep working.
    "minChar|minCharmy=n" => \$minChar,
    "maxChar=n"           => \$maxChar,
    "nameChars=n"         => \$nameChars,
    "nameLength=n"        => \$nameLength,
    "numAttrs=n"          => \$numAttrs,
    "textNodeLength=n"    => \$textNodeLength,
    "tokenLength=n"       => \$tokenLength,

    "all!"                => \$all,
    # List-form system() avoids the shell; $0 is this script, whose POD
    # lives inside showUsage() below.
    "h|help"              => sub { system("perldoc", $0); exit; },
    "indent!"             => \$indent,
    "istring=s"           => \$istring,
    "olineends=s"         => \$olineends,
    "q!"                  => \$quiet,
    "randomize!"          => \$randomize,
    "tick=n"              => \$tickInterval,
    "unicode!"            => \$unicode,
    "v+"                  => \$verbose,
    "version"             => sub {
        die "Version of $version, by Steven J. DeRose.\n";
    },
);

($result) || die "Bad options.\n";

if ($all) {
    $attrLength     = $dft_size;
    $cdata          = 1;
    $charRefLength  = $dft_size;
    $depth          = $dft_size;
    $entNameLength  = $dft_size;
    $fanout         = $dft_size;
    $entRange       = $dft_size;
    $minChar        = 128;
    $maxChar        = $dft_size;
    $nameChars      = 1;
    $nameLength     = $dft_size;
    $numAttrs       = $dft_size;
    $textNodeLength = $dft_size;
    $tokenLength    = $dft_size;
}


###############################################################################
# Set implied options, validate option values...
#
# Only the first letter of -olineends matters: M(ac) or D(os); anything else
# leaves Perl's output record separator alone.
$olineends = uc(substr($olineends . "U", 0, 1));
if ($olineends eq "M") {
    $\ = chr(13);
}
elsif ($olineends eq "D") {
    $\ = chr(13) . chr(10);
}


###############################################################################
# Set up some basic character-set and text data for later.
#
my $base = "";
my $ideo = "";
my $comb = "";
my $dig  = "";
setupCharClasses();

# NOTE(review): these prints presumably emitted the XML declaration, the
# DOCTYPE and the document element start tag -- confirm.
print "";
binmode STDOUT, ":utf8";
print "\n";
print "\n";
if ($entNameLength > 0) {
    # NOTE(review): $token is not used by the text that survives here.
    my $token = "x" x $entNameLength;
    print "\n" . "]>\n\n";
}
print "\n";

if ($nameChars) {
    # Every section is closed before the next one opens, as in all the
    # other tests below.
    startSec("Test all base namestart chars");
    for my $c (split(//, $base)) {
        showTagName($c);
    }
    endSec();

    startSec("Test all ideographic namestart chars");
    for my $c (split(//, $ideo)) {
        showTagName($c);
    }
    endSec();

    startSec("Test all combining namestart chars, on 'A'");
    for my $c (split(//, $comb)) {
        showTagName($c);
    }
    endSec();
}

if ($depth > 0) {
    startSec("Test nesting depth $depth");
    for my $i (1 .. $depth) {
        print $istring x $i if $indent;
        print "\n\n";
    }
    print $istring x ($depth + 1) if $indent;
    print "\n\nHere we are!\n\n\n";
    for my $i (reverse 1 .. $depth) {
        print $istring x $i if $indent;
        print "\n\n";
    }
    endSec();
}

if ($fanout > 0) {
    startSec("Test fanout $fanout");
    for my $i (1 .. $fanout) {
        print "\n";
    }
    endSec();
}

if ($numAttrs > 0) {
    startSec("Test numAttrs $numAttrs");
    print "\n";
    endSec();
}

if ($nameLength > 0) {
    my $token = ("X" x $nameLength);
    startSec("Test nameLength $nameLength");
    print "<$token>A long element type name\n";
    print "\n";
    endSec();
}

if ($attrLength > 0) {
    # NOTE(review): $token is not used by the text that survives here.
    my $token = ("X" x $attrLength);
    startSec("Test attrLength $attrLength");
    print "\n";
    endSec();
}

if ($charRefLength > 2) {
    # Leading zeros pad the numeric character references to the wanted size.
    my $token = "0" x $charRefLength;
    startSec("Test charRefLength $charRefLength");
    print "\n\nA decimal character reference: &#$token" . "65; (ok?)\n\n/>\n";
    print "\n\nA hex character reference: &#x$token" . "77; (ok?)\n\n/>\n";
    endSec();
}

if ($entNameLength > 0) {
    my $token = "x" x $entNameLength;
    startSec("Test entNameLength $entNameLength");
    print "\n\nA long entity name: &$token; (ok?)\n\n/>\n";
    endSec();
}

if ($cdata > 0) {
    startSec("Test cdata");
    print "\n\n? ]]>\n\n\n";
    endSec();
}

if ($textNodeLength > 0) {
    my $text = "";
    while (length($text) < $textNodeLength) {
        $text .= " " . getRandomWord();
    }
    startSec("Test textNodeLength $textNodeLength");
    print "\n\n$text\n\n\n";
    endSec();
}

if ($tokenLength > 0) {
    # Unlike the text-node test, words are joined with no spaces between.
    my $text = "";
    while (length($text) < $tokenLength) {
        $text .= getRandomWord();
    }
    startSec("Test tokenLength $tokenLength");
    print "\n\n$text\n\n\n";
    endSec();
}

if ($entRange > 0) {
    startSec("Test entRange for character at code point $entRange");
    print sprintf(
        "\n\nA decimal reference to code point %d: &#%d; (ok?)\n\n/>\n",
        $entRange, $entRange);
    print sprintf(
        "\n\nA hex reference to code point x%x: &#x%x; (ok?)\n\n/>\n",
        $entRange, $entRange);
    endSec();
}

if ($minChar > 0 || $maxChar > 0) {
    startSec("Test a range of code points from $minChar to $maxChar");
    for my $i ($minChar .. $maxChar) {
        print sprintf(
            "\n\nCode point d%d {&#%d;}, or x%x: {&#x%x;} (ok?)\n\n/>\n",
            $i, $i, $i, $i);
    }
    endSec();
}

startSec("End of tests.");
endSec();

print "\n\n\n";

exit;


###############################################################################
# startSec(title): print the opening of one test section, with its title.
#
sub startSec {
    print "\n\n\n\n\n\n$_[0]\n\n\n";
}

# endSec(): print the close of the section opened by startSec().
#
sub endSec {
    print "\n\n";
}


###############################################################################
# The lists here are pasted straight from the XML standard; each is then parsed
# and converted into the equivalent string of actual characters.
#
# Fills the file-level $base, $ideo, $comb and $dig, and warns when a class
# does not come out at its expected size.
#
sub setupCharClasses {
    my $baseChar = "[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x00FF]
        | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] | [#x014A-#x017E] | [#x0180-#x01C3]
        | [#x01CD-#x01F0] | [#x01F4-#x01F5] | [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1]
        | #x0386 | [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE]
        | [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0
        | [#x03E2-#x03F3] | [#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481]
        | [#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] | [#x04EE-#x04F5]
        | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 | [#x0561-#x0586] | [#x05D0-#x05EA]
        | [#x05F0-#x05F2] | [#x0621-#x063A] | [#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE]
        | [#x06C0-#x06CE] | [#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939]
        | #x093D | [#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8]
        | [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] | [#x09DF-#x09E1]
        | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] | [#x0A13-#x0A28] | [#x0A2A-#x0A30]
        | [#x0A32-#x0A33] | [#x0A35-#x0A36] | [#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E
        | [#x0A72-#x0A74] | [#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8]
        | [#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0
        | [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] | [#x0B32-#x0B33]
        | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] | [#x0B5F-#x0B61] | [#x0B85-#x0B8A]
        | [#x0B8E-#x0B90] | [#x0B92-#x0B95] | [#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F]
        | [#x0BA3-#x0BA4] | [#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C]
        | [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] | [#x0C60-#x0C61]
        | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] | [#x0CAA-#x0CB3] | [#x0CB5-#x0CB9]
        | #x0CDE | [#x0CE0-#x0CE1] | [#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28]
        | [#x0D2A-#x0D39] | [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33]
        | [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A
        | #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5
        | #x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3]
        | #x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] | [#x10A0-#x10C5]
        | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] | [#x1105-#x1107] | #x1109
        | [#x110B-#x110C] | [#x110E-#x1112] | #x113C | #x113E | #x1140
        | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159
        | [#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169
        | [#x116D-#x116E] | [#x1172-#x1173] | #x1175 | #x119E | #x11A8
        | #x11AB | [#x11AE-#x11AF] | [#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2]
        | #x11EB | #x11F0 | #x11F9 | [#x1E00-#x1E9B] | [#x1EA0-#x1EF9]
        | [#x1F00-#x1F15] | [#x1F18-#x1F1D] | [#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57]
        | #x1F59 | #x1F5B | #x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4]
        | [#x1FB6-#x1FBC] | #x1FBE | [#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3]
        | [#x1FD6-#x1FDB] | [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126
        | [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] | [#x30A1-#x30FA]
        | [#x3105-#x312C] | [#xAC00-#xD7A3]";

    my $Ideographic = "[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]";

    my $CombiningChar = "[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] | [#x05A3-#x05B9]
        | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 | [#x064B-#x0652]
        | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] | [#x06E0-#x06E4] | [#x06E7-#x06E8]
        | [#x06EA-#x06ED] | [#x0901-#x0903] | #x093C | [#x093E-#x094C] | #x094D
        | [#x0951-#x0954] | [#x0962-#x0963] | [#x0981-#x0983] | #x09BC | #x09BE
        | #x09BF | [#x09C0-#x09C4] | [#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7
        | [#x09E2-#x09E3] | #x0A02 | #x0A3C | #x0A3E | #x0A3F
        | [#x0A40-#x0A42] | [#x0A47-#x0A48] | [#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83]
        | #x0ABC | [#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03]
        | #x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] | [#x0B56-#x0B57]
        | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] | [#x0BCA-#x0BCD] | #x0BD7
        | [#x0C01-#x0C03] | [#x0C3E-#x0C44] | [#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56]
        | [#x0C82-#x0C83] | [#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6]
        | [#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] | #x0D57
        | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 | [#x0EB4-#x0EB9]
        | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] | #x0F35 | #x0F37
        | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] | [#x0F86-#x0F8B]
        | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] | [#x0FB1-#x0FB7] | #x0FB9
        | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] | #x3099 | #x309A";

    my $Digit = "[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] | [#x09E6-#x09EF]
        | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] | [#x0BE7-#x0BEF] | [#x0C66-#x0C6F]
        | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] | [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]";

    # Not converted or used by any test yet; kept with the other classes.
    my $Extender = "#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640
        | #x0E46 | #x0EC6 | #x3005 | [#x3031-#x3035] | [#x309D-#x309E]
        | [#x30FC-#x30FE]";

    $base = convertSpec($baseChar);
    if (length($base) != 13602) {
        warn "Wrong number of baseChar chars: " . length($base) . ".\n";
    }

    $ideo = convertSpec($Ideographic);
    if (length($ideo) != 20912) {
        warn "Wrong number of Ideographic chars: " . length($ideo) . ".\n";
    }

    $comb = convertSpec($CombiningChar);
    if (length($comb) != 437) {
        warn "Wrong number of CombiningChar chars: " . length($comb) . ".\n";
    }

    $dig = convertSpec($Digit);
    if (length($dig) != 149) {
        warn "Wrong number of Digit chars: " . length($dig) . ".\n";
    }
}

# convertSpec(spec): expand a '|'-separated list of '#xHHHH' code points and
# '[#xHHHH-#xHHHH]' ranges into a string holding every character it names,
# in list order. Dies on an item in any other format.
#
sub convertSpec {
    my ($spec) = @_;

    # Whitespace (including line breaks) around the list and around each
    # '|' is not significant.
    $spec =~ s/\A\s+//;
    $spec =~ s/\s+\z//;

    my $buf = "";
    for my $r (split(/\s*\|\s*/, $spec)) {
        if ($r =~ m/\A#x([0-9A-F]{4})\z/) {
            $buf .= chr(hex($1));
        }
        elsif ($r =~ m/\A\[#x([0-9A-F]{4})-#x([0-9A-F]{4})\]\z/) {
            my $start = hex($1);
            my $end   = hex($2);
            $buf .= join("", map { chr($_) } $start .. $end);
        }
        else {
            die "Bad format in list: '$r'\n";
        }
    }
    return $buf;
}


###############################################################################
# The word list is set up in a BEGIN block because the main code above calls
# getRandomWord() and then exits, so ordinary file-level initializers placed
# down here would never run.
#
BEGIN {
    # See http://www.lipsum.com/
    my @words = (
        qw/
            Lorem ipsum dolor sit amet consectetur adipisicing elit sed do
            eiusmod tempor incididunt ut labore et dolore magna aliqua Ut
            enim ad minim veniam quis nostrud exercitation ullamco laboris
            nisi ut aliquip ex ea commodo consequat Duis aute irure dolor in
            reprehenderit in voluptate velit esse cillum dolore eu fugiat
            nulla pariatur Excepteur sint occaecat cupidatat non proident
            sunt in culpa qui officia deserunt mollit anim id est laborum
        /);
    my $lastWord = 0;
    my $nWords   = scalar @words;

    # getRandomWord(): return the current word, then move on to a random
    # word (with -randomize) or to the next one, wrapping at the end.
    sub getRandomWord {
        my $w = $words[$lastWord];
        $lastWord = ($randomize)
            ? int(rand() * $nWords)
            : (($lastWord + 1) % $nWords);
        return $w;
    }
} # END

# showTagName(char): print one line of text reporting the character's code
# point (decimal and hex) and its Unicode name. A "New block" marker is
# printed first whenever the code point is a multiple of 256.
#
sub showTagName {
    my $c = $_[0];
    my $n = ord($c);
    ($n % 256 == 0) && print "\n\n\nNew block\n\n\n";

    # The arguments match the three placeholders that exist in the format.
    # NOTE(review): the sub name suggests the character itself was once used
    # as an element name around this text -- confirm when restoring markup.
    # print(sprintf) rather than printf, so that $\ is still appended.
    print sprintf("\n\nChar %d (x%x, '%s').\n\n\n",
        $n, $n, getCharName($n));
}

# getCharName(codePoint): return the Unicode name of a code point, "???" for
# anything at or above U+AC00 (never looked up), or "(name not found)".
#
sub getCharName {
    my ($codePoint) = @_;
    if ($codePoint >= 0xAC00) {
        return "???";
    }
    my $name = charnames::viacode($codePoint);
    if (!$name) {
        $name = "(name not found)";
    }
    return $name;
}


###############################################################################
# showUsage(): write the documentation below to STDERR. The same text is the
# POD that 'perldoc' displays for -h, which is why its paragraphs must stay
# separated by blank lines and its commands must start in column 1.
#
sub showUsage {
    warn "

=head1 Usage

makeXmlTestFile [options] > file

Stress test an XML system on various quantities.
Writes out an XML file with the requested special cases.
Note that -nameChars produces a Long file, since every permissible
name char will be tested.

You run that through your XML system and see what happens.

=head1 Options for what to test:

=over

=item * B<-attrLength> n

=item * B<-cdata>

=item * B<-charRefLength n>

=item * B<-depth> n

=item * B<-entNameLength n>

=item * B<-fanout> n

=item * B<-entRange> n

=item * B<-minChar> n

=item * B<-maxChar> n

=item * B<-nameChars>

=item * B<-nameLength> n

=item * B<-numAttrs> n

=item * B<-textNodeLength n>

=item * B<-tokenLength> n

=back

=head1 Other options

(prefix 'no' to negate where applicable)

=over

=item * B<-all>

Turn everything on (length tests default to $dft_size).

=item * B<-h>

Show this documentation and exit.

=item * B<-indent>

Indent the nesting-depth test, one copy of the -istring value per level.

=item * B<-istring s>

String to repeat for -indent (default: one space).

=item * B<-olineends t>

Write Unix, Dos, or Mac line-breaks for output.

=item * B<-q>

Suppress most messages.

=item * B<-randomize>

Generate text content more randomly.

=item * B<-tick n>

Accepted, but not currently used.

=item * B<-unicode>

Accepted, but not currently used.

=item * B<-v>

Add more messages (repeatable).

=item * B<-version>

Show version/license info and exit.

=back

=head1 Known bugs and limitations

Does not test every combining diacritic on all applicable characters.

No way to test all chars in *content*.

Cannot generate *incorrect* cases to test if parser catches them.
May add that, but it would have to generate many individual files since
an XML parser must stop at the first well-formedness error.

=head1 Related commands

=head1 Ownership

This work by Steven J. DeRose is licensed under a Creative Commons
Attribution-Share Alike 3.0 Unported License. For further information on
this license, see http://creativecommons.org/licenses/by-sa/3.0/.

The author's present email is sderose at acm.org.

For the most recent version, see http://www.derose.net/steve/utilities/.

=cut
";
}