#!/usr/bin/perl -w # # SimplifyUnicode # # 2010-11-19ff: Written by Steven J. DeRose, sderose@acm.org. # Mostly pulled from domExtensions, via normalizeUnicode. # 2010-12-01 sjd: Make into Perl Module. Add uriEscapes. Clean up. # 2012-01-19 sjd: Debug. Fix setOption() to sync w/ Python version. # 2012-05-22 sjd: Drop entities and uriEscapes in favor of sjdUtils. # Simplify option handling. # 2012-08-31 sjd: Add 'compatibility', start changing to use UCD, etc. # Use [\p{Initial_Punctuation}\p{Final_Punctuation}'"`] # 2013-09-09: Allow setting options via hash on constructor. # # To do: # Redo ligatureDomain as ligatureAll or something. # Precompile changes for ligatures, so one fancy change not many. # What about generating ligatures, dashes, quotes, spaces? # Pull in code from Tokenizer.pm. # *soft* hyphens special, emdash to double hyphens # Sync Unicode-name-checking with Python version # Option to fix CP1252 stuff, or at least warn??? # # Integrate into: # findKeyWords, normalizeXML, vocab, tokenizer, etc. # use strict; use Getopt::Long; use Unicode::Normalize; use Unicode::Normalize 'decompose'; our $VERSION = "0.6"; package SimplifyUnicode; ############################################################################### # sub new { my ($class, $optionHash) = @_; my $self = { version => "2013-09-09", options => { compatibility => "0", accent => "keep", dash => 0, ligatureDomain => "all", ligature => "keep", math => 0, number => 0, qBack => 0, qInitial => 0, qFinal => 0, quote => 0, space => 0, }, # internal data ligatureChars => setupLigatures(), mathStarts => setupMaths(), # HashRefs lig2seqBasic => undef, seq2ligBasic => undef, lig2seq => undef, seq2lig => undef, }; bless $self, $class; if (ref($optionHash) eq "HASH") { for my $o (keys(%{$optionHash})) { $self->setOption($o, $optionHash->{$o}); } } return($self); } ############################################################################### # Apply all the selected simplifications # sub simplify { my ($self, $rec) = @_; #if ($self->{entities}) { # $rec = $self->handle_Entities($rec); #} #if ($self->{uriEscapes}) { # $rec = $self->handle_UriEscapes($rec); #} if ($self->{options}->{compatibility} ne "keep") { $rec = decompose($rec,1); } if ($self->{options}->{accent} ne "keep") { $rec = $self->handle_Diacritics($rec); } if ($self->{options}->{ligature} ne "keep") { $rec = $self->handle_Ligatures($rec); } if ($self->{options}->{math} ne "keep") { $rec = $self->handle_Maths($rec); } if ($self->{options}->{dash}) { $rec =~ s/\p{Pd}/-/g; } if ($self->{options}->{quote}) { $rec =~ s/\p{Pi}/`/g; $rec =~ s/\p{Pf}/'/g; $rec =~ s/`"/'/g; } else { if ($self->{options}->{qInitial}) { $rec =~ s/\p{Pi}/`/g; } if ($self->{options}->{qFinal}) { $rec =~ s/\p{Pf}/'/g; } if ($self->{options}->{qBack}) { $rec =~ s/`/'/g; } } if ($self->{options}->{space}) { $rec =~ s/\pZ/ /g; } if ($self->{options}->{number}) { die "numbers not yet supported.\n"; } return($rec); } ############################################################################### # sub setOption { my ($self,$oname,$value) = @_; # Check whether option name is actually valid. if (!defined $self->{options}->{$oname}) { warn("simplifyUicode.setOption: Unknown optoin '$oname'.\n"); return(undef); } if (!$oname || !$value) { warn("Option value not passed to simplifyUicode.setOption.\n"); return(undef); } $self->{options}->{$oname} = $value; # Check value for the non-Boolean ones ($self->{options}->{accent} =~ m/^(composed|keep|decomposed|strip|space|delete)$/) || warn "Bad value for simplifyUicode.setOption($oname,$value).\n"; ($self->{options}->{ligatures} =~ m/^(composed|keep|decomposed|strip|space|delete)$/) || warn "Bad value for simplifyUicode.setOption($oname,$value).\n"; } # setOption sub getOption { my ($self, $oname) = @_; return($self->{options}->{$oname}); } ############################################################################### # sub normalize_Space { # Normalize as for XML whitespace my ($self, $s) = @_; $s =~ s/\s\s+/ /g; $s =~ s/^\s+//g; $s =~ s/\s+$//g; return($s); } # Implement changes that have multiple options. # sub handle_Diacritics { my ($self, $rec) = @_; my $aopt = $self->{options}->{accent}; if ($aopt eq "composed") { warn "unsupported -accent handling '$aopt'\n"; } elsif ($aopt eq "decomposed") { $rec = Unicode::NFKD($rec); } elsif ($aopt eq "strip" || $aopt eq "translit") { $rec = Unicode::NFKD($rec); $rec =~ s/\pM+//g; } elsif ($aopt eq "space") { warn "unsupported -accent handling '$aopt'\n"; } elsif ($aopt eq "delete") { warn "unsupported -accent handling '$aopt'\n"; } else { # keep } return($rec); } # NOTE: Unicode compatibility decomposition does break up some ligs (ae,...) # sub handle_Ligatures { my ($self, $rec) = @_; my $s = ""; my $s2lRef = $self->{seq2lig}; my %s2l = %$s2lRef; my $lopt = $self->{options}->{ligatures}; if ($lopt eq "space") { for my $lig (keys %s2l) { $rec =~ s/$lig/ /g; } } elsif ($lopt eq "delete") { for my $lig (keys %s2l) { $rec =~ s/$lig//g; } } elsif ($lopt eq "composed") { for my $lig (keys %s2l) { $rec =~ s/$lig/$s2l{$lig}/g; } warn "unsupported -ligatures handling '$lopt'\n"; } elsif ($lopt eq "decomposed") { for my $lig (keys %s2l) { $rec =~ s/$lig/$s2l{$lig}/g; } warn "unsupported -ligatures handling '$lopt'\n"; } else { # keep } return($rec); } # Return basic ASCII letter if we got a math letter; else undef. # sub handle_Maths { my ($self,$rec) = @_; my $buf = ""; for (my $i=0; $i{mathStarts}; my %ms = %$mathStartsRef; for my $mathRange (keys %ms) { my $diff = $n - $mathRange; next unless ($n>=0 && $n<26); my $type = $ms{$mathRange}; if ($type eq "UPPER") { return(substr("ABCDEFGHIJKLMNOPQRSTUVWXYZ",$diff,1)); } elsif ($type eq "LOWER") { return(substr("abcdefghijklmnopqrstuvwxyz",$diff,1)); } else { warn "Bad math alphabet type '$type'.\n"; } } return(undef); } ############################################################################### # Non-Latin ligatures are not here yet, because there are *lots* of them. # sub setupLigatures { my %seq2ligBasic = ( "ff" => chr(0xFB00), "ff" => chr(0xFB01), "fl" => chr(0xFB02), "ffi" => chr(0xFB03), "ffl" => chr(0xFB04), ); my %seq2lig = ( "AE" => chr(0x00C6), # = latin capital ligature ae (1.0) "ae" => chr(0x00E6), # = latin small ligature ae (1.0) "IJ" => chr(0x0132), # LATIN CAPITAL LIGATURE IJ "ij" => chr(0x0133), # LATIN SMALL LIGATURE IJ "OE" => chr(0x0152), # LATIN CAPITAL LIGATURE OE "oe" => chr(0x0153), # LATIN SMALL LIGATURE OE "st" => chr(0xFB06), # LATIN SMALL LIGATURE ST # (chr(0x017F)."t") => chr(0xFB05), # long-s t # (chr(0x017F)."s") => chr(0x00DF), # in origin ligature of long s, s # there's also a combining long s at 1de5 and some accented one. ); my %lig2seqBasic = (); my %lig2seq = (); # Add the basic to the main map for my $seq (keys %seq2ligBasic) { $seq2lig{$seq} = $seq2ligBasic{$seq}; } # Make both reverse maps for my $seq (keys %seq2lig) { my $lig = $seq2lig{$seq}; $lig2seq{$lig} = $seq; if (defined $seq2ligBasic{$seq}) { $lig2seqBasic{$lig} = $seq; } } my $foo = qq { 0587 ARMENIAN SMALL LIGATURE ECH YIWN FB13 ARMENIAN SMALL LIGATURE MEN NOW FB14 ARMENIAN SMALL LIGATURE MEN ECH FB15 ARMENIAN SMALL LIGATURE MEN INI FB16 ARMENIAN SMALL LIGATURE VEW NOW FB17 ARMENIAN SMALL LIGATURE MEN XEH 04A4 CYRILLIC CAPITAL LIGATURE EN GHE 04B4 CYRILLIC CAPITAL LIGATURE TE TSE (Abkhasian) 04D4 CYRILLIC CAPITAL LIGATURE A IE FB1F HEBREW LIGATURE YIDDISH YOD YOD PATAH FB4F HEBREW LIGATURE ALEF LAMED 05F0 HEBREW LIGATURE YIDDISH DOUBLE VAV 05F1 HEBREW LIGATURE YIDDISH VAV YOD 05F2 HEBREW LIGATURE YIDDISH DOUBLE YOD }; } # Maths are handled differently, because they generally include an entire # Latin alphabet (lower and or upper case) for each variant; no use in # listing 26x as much data.... # sub setupMaths { my %mathAlphabetStarts = ( 0x1d400 => "UPPER", # mathematical bold 0x1d41A => "LOWER", 0x1d434 => "UPPER", # mathematical italic 0x1d434 => "LOWER", 0x1d468 => "UPPER", # mathematical bold italic 0x1d482 => "LOWER", 0x1d49C => "UPPER", # mathematical script 0x1d4B6 => "LOWER", 0x1d4D0 => "UPPER", # mathematical bold script 0x1d4EA => "LOWER", 0x1d504 => "UPPER", # mathematical fraktur 0x1d51E => "LOWER", 0x1d538 => "UPPER", # mathematical double-struck 0x1d552 => "LOWER", 0x1d56C => "UPPER", # mathematical bold fraktur 0x1d586 => "LOWER", 0x1d5A0 => "UPPER", # mathematical sans-serif 0x1d58A => "LOWER", 0x1d5D4 => "UPPER", # mathematical sans-serif bold 0x1d5EE => "LOWER", 0x1d608 => "UPPER", # mathematical sans-serif italic 0x1d622 => "LOWER", 0x1d63C => "UPPER", # mathematical sans-serif bold italic 0x1d656 => "LOWER", 0x0249c => "LOWER", # parenthesized lower (no upper!) 0x024b6 => "UPPER", # circled upper 0x024d0 => "LOWER", # circled lower 0x1d670 => "UPPER", # mathematical monospace 0x1d68a => "LOWER", ); # (couple extras at 1d6a4: dotless i, j) # greek upper+lower: 1d6a8, 1d6e2, etc. return(\%mathAlphabetStarts); } # End of SimplifyUnicodePackage ############################################################################### ############################################################################### ############################################################################### # =pod =head1 Usage Maps many classes of Unicode characters to more basic ones. Example: use SimplifyUnicode; my $su = new SimplifyUnicode(); $su->setOptions("dashes",1); $myString = $su->simplify($myString); =head1 Methods =head2 new(name, optionsRef?) Create a new simplifier. You can set exactly what gets simplified using the I method described below, or by passing a reference to a hash of options to set as I. =head2 setOption(name, value) Use this method to configure just what kinds of characters will be normalized: =over =item * B Do Unicode compatibility decomposition. =item * B Turns accents and other diacritics to some form: decomposed, composed, deleted, strip, space, keep. =item * B Turns em dash, en dash, hyphen, etc. to hyphen. =item * B Whether I does only the basics, or all. =item * B Turn ligatures to some form: decomposed, composed, deleted, space, keep. =item * B Normalize alternate math Latin alphabets to ASCII. =item * B Normalize alternate forms of numbers to ASCII. =item * B Normalizes backquote to apostrophe. =item * B Turns many kinds of open quotes to backquote. =item * B Turns many kinds of close quotes to apostrophe. =item * B Turns many kinds of quotes to apostrophe. =item * B Normalize various whitespace chars to ASCII space. =back =head2 string = simplify(string) Simplifies the Unicode string according to the current configuration options. =head1 Related commands C C -- Has many similar normalization featurs, integrated into a fairly generic tokenizer. Might be nice to have a script that checked the scripts in use, and added '' around block of text in various languages. Or at least one that screams when it finds characters in the wrong xml:lang context. =head1 Known bugs and limitations Can't map emdash to double hyphen and/or soft-hyphen to nil. Doesn't know anything about various parentheses, brackets, etc. =head1 Ownership This work by Steven J. DeRose is licensed under a Creative Commons Attribution-Share Alike 3.0 Unported License. For further information on this license, see L. The author's present email is sderose at acm.org. For the most recent version, see L. =cut 1;