#!/usr/bin/perl -w
#
# Extract all entity references from specified file(s), and
# ranklist them.
#
# Written 2006-03-23 by Steven J. DeRose.
# 2007-12-12 sjd: strict, Getopt.
# 2011-01-20 sjd: Do in real Perl, not shell calls. Add normHTML.
#
# To do:
#     Add -tickInterval, -iencoding, -ilineends.
#     Add ability to normalize from names to numbers.
#
use strict;
use warnings;
use Getopt::Long;

my $version = "2011-01-20";

my $count    = 1;    # Print the per-entity table (turn off with -nocount).
my $norm     = 0;    # Normalize numeric references to "#xNNNNN".
my $normHTML = 0;    # Replace numeric references by HTML names when known.
my $quiet    = 0;    # Suppress warnings and the closing "Done" message.
my $verbose  = 0;    # Accepted for compatibility; not consulted below.

# Process options
Getopt::Long::Configure ("ignore_case");
my $result = GetOptions(
    # Negatable, so that the documented -nocount is accepted.
    "count!"     => \$count,
    # Show this file's own POD regardless of the name it is installed under.
    "h|help|?"   => sub { system "perldoc", $0; exit; },
    "norm"       => \$norm,
    "normHTML"   => \$normHTML,
    "q|quiet!"   => \$quiet,
    "v|verbose+" => \$verbose,
    "version"    => sub {
        die "Version of $version, by Steven J. DeRose.\n";
    }
);

($result) || die "Bad options.\n";

($norm && $normHTML) &&
    die "Can't specify both -norm and -normHTML.\n";


###############################################################################
# Main: scan each file named on the command line, record by record.
#
my %entCounts = ();    # reference (possibly normalized) => occurrences
my @htmlMap   = ();    # code point => HTML entity name (filled by setupHTML)
if ($normHTML) { setupHTML(); }

my $totalRecs = 0;
my ($nHex, $nDec, $nNamed) = (0, 0, 0);

# Iterate over @ARGV directly: a "while (shift)" loop would stop early at a
# file whose name is false (such as "0").
for my $file (@ARGV) {
    my $in;
    if (!open($in, '<', $file)) {
        warn "Cannot open file '$file'.\n" unless ($quiet);
        next;
    }
    my $recNum = 0;
    while (my $rec = <$in>) {
        $recNum++;
        chomp $rec;

        # Discard constructs in which "&" does not start a reference. This
        # only works when the construct opens and closes on the same record.
        # NOTE(review): the first two patterns were missing from the copy
        # this was restored from; they are reconstructed from the documented
        # behavior ("discard comments, PIs, and marked sections").
        $rec =~ s/<!--.*?-->//g;         # comments
        $rec =~ s/<!\[.*?\]\]>//g;       # marked sections
        $rec =~ s/<\?.*?\?>//g;          # processing instructions

        # Visit every "&". The optional group captures the text between "&"
        # and ";" when the reference is well-formed. Because //g always
        # advances past the "&" it matched, a stray ampersand (as in "AT&T")
        # cannot stall the loop.
        while ($rec =~ m/&(?:(#?[-_.:\w]+);)?/g) {
            my $ent = $1;
            next if (defined $ent && handleEntity($ent));
            warn "Invalid syntax in record $recNum of '$file': $rec\n"
                unless ($quiet);
        } # per entity ref
    } # per record
    close($in);
    $totalRecs += $recNum;
} # per file

if ($count) {
    print "Entity \t Count\n";
    for my $e (sort keys %entCounts) {
        printf("%-12s\t%8d\n", $e, $entCounts{$e});
    }
    print "\n";
}

printf("Hex references: %8d\n", $nHex);
printf("Decimal references: %8d\n", $nDec);
printf("Named references: %8d\n", $nNamed);
printf("Total references: %8d\n", $nHex+$nDec+$nNamed);

warn "Done, $totalRecs records processed.\n" unless ($quiet);

exit;


###############################################################################
# Classify and count one reference.
#
# Pass in everything *between* the & and the ; -- not including either.
#
# Updates $nHex, $nDec or $nNamed and, when -count is on, %entCounts (keyed by
# the reference after any -norm / -normHTML normalization).
#
# Returns 1 if the reference was counted, or 0 if it starts with "#" but is
# not a valid decimal or hexadecimal character reference (nothing is counted
# in that case, so the caller can report it).
#
sub handleEntity {
    my ($e) = @_;
    if ($e =~ m/^#/) {
        my $codePoint = 0;
        if ($e =~ m/^#x([0-9a-f]+)$/i) {
            $codePoint = hex($1);
            $nHex++;
        } elsif ($e =~ m/^#([0-9]+)$/) {
            $codePoint = 0 + $1;
            $nDec++;
        } else {
            return(0);
        }

        if ($normHTML) {
            $e = fixHTMLentityNames($e);
        } elsif ($norm) {
            $e = sprintf("#x%05x", $codePoint);
        }
    } else {
        $nNamed++;
    }
    if ($count) { $entCounts{$e}++; }
    return(1);
} # handleEntity


###############################################################################
# Map a numeric character reference to its HTML entity name.
#
# Accepts a bare decimal code point ("160") or the inside of a numeric
# reference ("#160", "#xA0"). Returns the name from @htmlMap when one is
# known; otherwise returns the argument unchanged. @htmlMap is only
# populated once setupHTML() has run.
#
sub fixHTMLentityNames {
    my ($e) = @_;
    my $codePoint;
    if ($e =~ m/^#x([0-9a-f]+)$/i) {
        $codePoint = hex($1);
    } elsif ($e =~ m/^#?([0-9]+)$/) {
        $codePoint = 0 + $1;
    } else {
        return($e);
    }
    # The bounds check keeps absurdly large values from being used as an
    # array index.
    if ($codePoint <= $#htmlMap && defined $htmlMap[$codePoint]) {
        return($htmlMap[$codePoint]);
    }
    return($e);
}

###############################################################################
# Fill @htmlMap so that $htmlMap[code point] is the entity name.
#
sub setupHTML {
    my %nameOf = (
        # Built-in
        38 => "amp", 34 => "quot", 39 => "apos", 60 => "lt", 62 => "gt",

        # Latin-1
        160 => "nbsp",   161 => "iexcl",  162 => "cent",   163 => "pound",
        164 => "curren", 165 => "yen",    166 => "brvbar", 167 => "sect",
        168 => "uml",    169 => "copy",   170 => "ordf",   171 => "laquo",
        172 => "not",    173 => "shy",    174 => "reg",    175 => "macr",
        176 => "deg",    177 => "plusmn", 178 => "sup2",   179 => "sup3",
        180 => "acute",  181 => "micro",  182 => "para",   183 => "middot",
        184 => "cedil",  185 => "sup1",   186 => "ordm",   187 => "raquo",
        188 => "frac14", 189 => "frac12", 190 => "frac34", 191 => "iquest",
        192 => "Agrave", 193 => "Aacute", 194 => "Acirc",  195 => "Atilde",
        196 => "Auml",   197 => "Aring",  198 => "AElig",  199 => "Ccedil",
        200 => "Egrave", 201 => "Eacute", 202 => "Ecirc",  203 => "Euml",
        204 => "Igrave", 205 => "Iacute", 206 => "Icirc",  207 => "Iuml",
        208 => "ETH",    209 => "Ntilde", 210 => "Ograve", 211 => "Oacute",
        212 => "Ocirc",  213 => "Otilde", 214 => "Ouml",   215 => "times",
        216 => "Oslash", 217 => "Ugrave", 218 => "Uacute", 219 => "Ucirc",
        220 => "Uuml",   221 => "Yacute", 222 => "THORN",  223 => "szlig",
        224 => "agrave", 225 => "aacute", 226 => "acirc",  227 => "atilde",
        228 => "auml",   229 => "aring",  230 => "aelig",  231 => "ccedil",
        232 => "egrave", 233 => "eacute", 234 => "ecirc",  235 => "euml",
        236 => "igrave", 237 => "iacute", 238 => "icirc",  239 => "iuml",
        240 => "eth",    241 => "ntilde", 242 => "ograve", 243 => "oacute",
        244 => "ocirc",  245 => "otilde", 246 => "ouml",   247 => "divide",
        248 => "oslash", 249 => "ugrave", 250 => "uacute", 251 => "ucirc",
        252 => "uuml",   253 => "yacute", 254 => "thorn",  255 => "yuml",

        # Other
        402 => "fnof",
        913 => "Alpha",   914 => "Beta",    915 => "Gamma",   916 => "Delta",
        917 => "Epsilon", 918 => "Zeta",    919 => "Eta",     920 => "Theta",
        921 => "Iota",    922 => "Kappa",   923 => "Lambda",  924 => "Mu",
        925 => "Nu",      926 => "Xi",      927 => "Omicron", 928 => "Pi",
        929 => "Rho",     931 => "Sigma",   932 => "Tau",     933 => "Upsilon",
        934 => "Phi",     935 => "Chi",     936 => "Psi",     937 => "Omega",
        945 => "alpha",   946 => "beta",    947 => "gamma",   948 => "delta",
        949 => "epsilon", 950 => "zeta",    951 => "eta",     952 => "theta",
        953 => "iota",    954 => "kappa",   955 => "lambda",  956 => "mu",
        957 => "nu",      958 => "xi",      959 => "omicron", 960 => "pi",
        961 => "rho",     962 => "sigmaf",  963 => "sigma",   964 => "tau",
        965 => "upsilon", 966 => "phi",     967 => "chi",     968 => "psi",
        969 => "omega",
        977 => "thetasym",    # was misspelled "thetasy"
        978 => "upsih",   982 => "piv",

        8226 => "bull",   8230 => "hellip", 8242 => "prime",  8243 => "Prime",
        8254 => "oline",  8260 => "frasl",  8472 => "weierp", 8465 => "image",
        8476 => "real",   8482 => "trade",  8501 => "alefsym",
        8592 => "larr",   8593 => "uarr",   8594 => "rarr",   8595 => "darr",
        8596 => "harr",   8629 => "crarr",
        8656 => "lArr",   8657 => "uArr",   8658 => "rArr",   8659 => "dArr",
        8660 => "hArr",
        8704 => "forall", 8706 => "part",   8707 => "exist",  8709 => "empty",
        8711 => "nabla",  8712 => "isin",   8713 => "notin",  8715 => "ni",
        8719 => "prod",   8721 => "sum",    8722 => "minus",  8727 => "lowast",
        8730 => "radic",  8733 => "prop",   8734 => "infin",  8736 => "ang",
        8743 => "and",    8744 => "or",     8745 => "cap",    8746 => "cup",
        8747 => "int",    8756 => "there4", 8764 => "sim",    8773 => "cong",
        8776 => "asymp",  8800 => "ne",     8801 => "equiv",  8804 => "le",
        8805 => "ge",     8834 => "sub",    8835 => "sup",    8836 => "nsub",
        8838 => "sube",   8839 => "supe",   8853 => "oplus",  8855 => "otimes",
        8869 => "perp",   8901 => "sdot",
        8968 => "lceil",  8969 => "rceil",  8970 => "lfloor", 8971 => "rfloor",
        9001 => "lang",   9002 => "rang",   9674 => "loz",
        9824 => "spades", 9827 => "clubs",  9829 => "hearts", 9830 => "diams",
    );
    # keys() and values() walk an unmodified hash in the same order, so the
    # slice assignment pairs each code point with its own name.
    @htmlMap[keys %nameOf] = values %nameOf;
    return;
}


###############################################################################
#

=pod

=head1 Usage

countEntities [options] [files]

Finds and optionally counts all distinct entity references in HTML/XML files.

=head1 Options

=over

=item * B<-count>

Provide a rank-list of the occurrences of each entity (whether numeric
and named references to the same character, or hex vs. decimal references,
or numeric references with varying numbers of leading zeros, count as
different, all depend on how you set the I<-norm> and I<-normHTML> options).
I<-count> is the default. Setting I<-nocount> means that only the grand
total number of entity and character references will be printed, and how
many were hex, decimal, and named.

=item * B<-norm>

Normalize numeric character references to 5-digit hexadecimal.

=item * B<-normHTML>

Turn numeric entity reference into HTML named entities (when possible).

=item * B<-q>

Suppress most messages.

=back

=head1 Known bugs and Limitations

Does I<not> use a full-fledged XML parser.
This allows it to handle non-well-formed data, but also means it will
fail on some cases. It does discard comments, PIs, and marked sections so
long as they're entirely on one line.

XML general entities will not be expanded, so any entity references buried
within such entities will not be noticed.

There is no way to normalize named entities to numerics (yet).

=head1 Ownership

This work by Steven J. DeRose is licensed under a Creative Commons
Attribution-Share Alike 3.0 Unported License. For further information on
this license, see http://creativecommons.org/licenses/by-sa/3.0/.

The author's present email is sderose at acm.org.

For the most recent version, see http://www.derose.net/steve/utilities/.

=cut