#!/usr/bin/perl -w # # Datatypes.pm: Package to help with XSD datatype checking. # # 2012-04-24: Written by Steven J. DeRose, sderose@acm.org. # 2012-04-30 sjd: Handle 0x in datatype specs. Actually check ENUM and # new STRING. Check REGEX. Add 'scream' parameter to check, # and issue specific error messages. Add types for some sets of chars. # 2012-05-23 sjd: Drop XmlTuples to avoid circular dependency. # 2012-05-31 sjd: Fix REGEX. # 2012-06-08ff sjd: Change [arg] or \targ to (arg). Improve parsing of it. # Allow /[\s\|]+/ between ENUM tokens. # normalizeData(type, raw)? At least string, bool, baseint. # 2012-06-14 sjd: Add 'rep arg to checkValueForType(). # 2012-11-26 sjd: Clean up. # # To do: # Perhaps add File, Directory, WFile, Rfile # addType()? # conversions to/from ctime? # methods to define/delete types? # use strict; our $VERSION = "0.83"; package Datatypes; ############################################################################### # Define the datatypes known (from XML Schema, mostly). # # Each gets a regex used to check it ('^' and '$' will be added by checker). # For integers, bounds are added after the regex: \s*### min ### max) # A few local (non-XML-Schema) types are defined, all starting with "". # Of those, ENUM, STRING, and REGEX are validated specially. # ENUM and STRING take an (argument) after the type-name. # # ==> This could use XSV except for circular package dependency <== # my $NAME = "[_.\\w][-_.:\\w\\d]*"; # XML Identifiers my $NCNAME = "[_.\\w][-_.\\w\\d]*"; # No namespace colon my $NMTOKEN = "[-_.:\\w\\d]+"; # No special first char my %dtInfoSource = ( # SPECIALS (ENUM and STRING are parameterized) 'ENUM' => { NormWS=>"1", Expr=>'.*' }, 'STRING' => { NormWS=>"1", Expr=>'.*' }, 'REGEX' => { Expr=>'.*' }, 'ASCII' => { Expr=>'\\p{isASCII}*' }, 'BASEINT' => { Num=>1, Expr=>'([1-9]\\d*|0[0-7]+|0x\\s+)$' }, # 'UChar' => { Expr=>'.' }, # 'XmlNameChar' => { Expr=>'[-_.:\\w]' }, # 'XmlNameChar' => { Expr=>'[-_.:\\w]' }, # 'XmlNameStart' => { Expr=>'\\w' }, # 'LatinLetter' => { Expr=>'[A-Za-z]' }, # 'LatinLower' => { Expr=>'[a-z]' }, # 'LatinUpper' => { Expr=>'[A-Z]' }, # 'Digit' => { Num=>1, Expr=>'\\d' }, # Truth values 'boolean' => { Expr=>'(true|1|false|0)' }, # Real numbers 'decimal' => { Num=>1, Expr=>'[-+]?\\d+(\\.\\d+)' }, 'double' => { Num=>1, Expr=>'([-+]?\\d+(\\.\\d+)([eE][-+]?\\d+)?)|INF|-INF|NaN' }, 'float' => { Num=>1, Expr=>'([-+]?\\d+(\\.\\d+)([eE][-+]?\\d+)?)|INF|-INF|NaN' }, # Various integers 'byte' => { Num=>1, Expr=>'[-+]?\\d+', Min=>'-0x80', Max=>'0x7F' }, 'int' => { Num=>1, Expr=>'[-+]?\\d+', Min=>'-0x8000', Max=>'0x7FFF' }, 'short' => { Num=>1, Expr=>'[-+]?\\d+', Min=>'-0x8000', Max=>'0x7FFF' }, 'integer' => { Num=>1, Expr=>'[-+]?\\d+', Min=>'-0x80000000', Max=>'0x7FFFFFFF' }, 'long' => { Num=>1, Expr=>'[-+]?\\d+' },#Min=>$min64, Max=>$max64 }, 'nonPositiveInteger' => { Num=>1, Expr=>'(-\\d+)|0+', Max=>'0' },#Min=>$min64 }, 'negativeInteger' => { Num=>1, Expr=>'-\\d+', Max=>'-1'},#Min=>'$min64' }, 'nonNegativeInteger' => { Num=>1, Expr=>'\\+?\\d+', Min=>'0' },#Max=>$max64 }, 'positiveInteger' => { Num=>1, Expr=>'\\+?\\d+', Min=>'1' },#Max=>$max64 }, 'unsignedByte' => { Num=>1, Expr=>'\\+?\\d+', Min=>'0', Max=>'0x7F' }, 'unsignedShort' => { Num=>1, Expr=>'\\+?\\d+', Min=>'0', Max=>'0x7FFF' }, 'unsignedInt' => { Num=>1, Expr=>'\\+?\\d+', Min=>'0', Max=>'0x7FFFFFFF' }, 'unsignedLong' => { Num=>1, Expr=>'\\+?\\d+', Min=>'0' },#Max=>$max64 }, # Dates and times (imperfect...) 'date' => { Expr=>'-?\d{4,}-\d\d-\d\d([-+]\d\d:\d\d|Z)?' }, 'dateTime' => { Expr=>'-?\d{4,}-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?([-+]\d\d:\d\d|Z)?' }, 'time' => { Expr=>'\d\d:\d\d:\d\d(\.\d+)?([-+]\d\d:\d\d|Z)?' }, 'duration' => { Expr=> '--\\d+([-+]\d\d:\d\d|Z)?' }, 'gDay' => { Expr=>'---\\d+([-+]\d\d:\d\d|Z)?' }, 'gMonth' => { Expr=>'\\d+', Min=>'0', Max=>'12' }, 'gMonthDay' => { Expr=>'--\d\d-\d\d+([-+]\d\d:\d\d|Z)?' }, 'gYear' => { Expr=>'-?\d{4,}' }, 'gYearMonth' => { Expr=>'-?\d{4,}-\d\d' }, # Strings 'language' => { Expr=>'.+' }, 'normalizedString' => { NormWS=>"1", Expr=>'[^\\r\\n\\t]*' }, 'string' => { NormWS=>"1", Expr=>'.*' }, 'token' => { NormWS=>"1", Expr=>'\\S( ?\\S+)*' }, # XML constructs (note caps) "NMTOKEN" => { NormWS=>"1", Expr=>"$NMTOKEN" }, "NMTOKENS" => { NormWS=>"1", Expr=>"$NMTOKEN(\\s+$NMTOKEN)*" }, "Name" => { NormWS=>"1", Expr=>"$NAME" }, "NCName" => { NormWS=>"1", Expr=>"$NCNAME" }, "ID" => { NormWS=>"1", Expr=>"$NCNAME" }, "IDREF" => { NormWS=>"1", Expr=>"$NCNAME" }, "IDREFS" => { NormWS=>"1", Expr=>"$NCNAME(\\s+$NCNAME)*" }, "ENTITY" => { NormWS=>"1", Expr=>"$NCNAME" }, "ENTITIES" => { NormWS=>"1", Expr=>"$NCNAME(\\s+$NCNAME)*" }, "QName" => { NormWS=>"1", Expr=>"$NCNAME:$NCNAME" }, # Net constructs 'anyURI' => { Expr=>'(([a-zA-Z0-9-$_.+!*,();\\/?:\@=&])|(%\\x\\x))+' }, 'base64Binary' => { Expr=>'[\s+\\/=a-zA-Z0-9]+' }, 'hexBinary' => { Expr=>'([0-9a-fA-F][0-9a-fA-F])+' }, ); ############################################################################### # sub new { my ($class) = @_; my $self = { version => "2012-11-26", options => { verbose => 0, scream => 1, }, dtInfo => \%dtInfoSource, }; (0) && warn("Constructing Datatypes object: (" . join(", ", sort keys(%{$self->{dtInfo}})) . ").\n"); bless $self, $class; for my $dtName (sort keys(%{$self->{dtInfo}})) { my $dtSpec = $self->{dtInfo}->{$dtName}; my $min = decimalize($dtSpec->{Min}); my $max = decimalize($dtSpec->{Max}); $dtSpec->{Min} = $min; $dtSpec->{Max} = $max; $self->vMsg(1, "*****Datatypes: $dtName: /$dtSpec->{Expr}/ >" . ($min ? $min:"N/A") . " <" . ($max ? $max:"N/A") . "\n"); } return($self); } sub err { my ($self, $msg) = @_; return unless ($self->{scream}); $self->vMsg(0,"Datatypes: $msg"); } sub vMsg { my ($self, $level, $msg) = @_; return unless ($self->{options}->{verbose} >= $level); warn("$msg\n"); } sub decimalize { my ($string) = @_; ($string) || return(undef); $string =~ s/^(-)//; my $sign = ($1) ? -1:1; # This will complain about the really big values if ($string =~ m/^0/) { $string = oct($string); } $string *= $sign; return($string); } sub getVersion { my ($self) = @_; return($self->{version}); } sub setOption { my ($self, $name, $value) = @_; if (!defined $self->{options}->{$name}) { $self->err("Unknown option name '$name'."); return(undef); } $self->{options}->{$name} = $value; return($value); } sub getOption { my ($self, $name) = @_; if (!defined $self->{options}->{$name}) { $self->err("Unknown option name '$name'."); return(undef); } return($self->{options}->{$name}); } # Return true iff value is ok. # sub checkValueForType { my ($self, $dtName, # Descriptive name of the datatype $rep, # Repetition indicator (just "?" or "!" for now) $value, # Value to be checked $scream, # Whether to report mismatch to STDOUT ) = @_; if ($rep !~ m/(?|!)/) { $self->vMsg(1, "checkValueForType: Bap rep: '$rep'.\n"); $rep = "!"; } if ($value eq "" && ($rep eq "?" || $rep eq "*")) { return(1); } if (defined $scream) { $self->{scream} = $scream; } my $dtSpec = $self->{dtInfo}->{$dtName}; ($dtSpec) || return(0); # Scream about this, too? my $expr = $dtSpec->{Expr} || ""; $self->vMsg(1, "Checking '$dtName': value '$value' vs. /$expr/"); if ($dtName =~ m/[\t\[\{\]\}]/) { # obsolete syntax! $self->err("Bad char in datatype '$dtName'"); } $dtName =~ s/\((.*)//; # Extract (arg) if present my $arg = $1 ? $1:""; $arg =~ s/\).*//; if ($dtName eq "ENUM") { # ENUM my @vals = split(/[\s\|,]+/, $arg); shift @vals; for my $v (@vals) { if ($dtName eq $v) { return(1); } } $self->err("Value '$value' is not in enum (" . join(", ", @vals) . ")."); return(0); } elsif ($dtName eq "STRING") { # STRING my $rc = $value =~ m/$arg/ ? 1:0; (!$rc) && $self->err( "Value '$value' does not match STRING Expr /$arg/"); return($rc); } elsif ($dtName =~ m/^REGEX$/) { # REGEX my $x = "'x' =~ m/$value/;"; #warn "About to eval '$x'\n"; eval($x); # On error, $@ is set to non-empty error message. my $rc = $@ ? 0:1; (!$rc) && $self->err("Bad regex: /$value/.\n"); return($rc); } if ($value !~ m/$expr/) { # Normal expr-based type $self->err("$dtName value '$value' does not match /$expr/.\n"); return(0); } my $min = $dtSpec->{Min}; # Numeric ranges if ($min && $value < $min) { $self->err("$dtName value $value is below min $min.\n"); return(0); } my $max = $dtSpec->{Max}; if ($max && $value > $max) { $self->err("$dtName value $value is above max $max.\n"); return(0); } return(1); } sub isXSD { # Built-in type from XSD? my ($self, $dtName) = @_; my $dtSpec = $self->{dtInfo}->{$dtName}; return(($dtSpec && $dtSpec !~ m/^/) ? 1:0); } sub isNumericDatatype { my ($self, $dtName) = @_; my $dtSpec = $self->{dtInfo}->{$dtName}; return(($dtSpec && $dtSpec->{"Num"}) ? 1:0); } sub isWSNormalizable { my ($self, $dtName) = @_; my $dtSpec = $self->{dtInfo}->{$dtName}; return(($dtSpec && $dtSpec->{"Norm"}) ? 1:0); } sub isKnownDatatype { my ($self, $dtName) = @_; # (following two types have appended args if ($dtName =~ m/^(ENUM|STRING)\(.*\)/) { return(1); } my $dtSpec = $self->{dtInfo}->{$dtName}; return(defined($dtSpec) ? 1:0); } sub getKnownDatatypes { my ($self) = @_; my @foo = keys(%{$self->{dtInfo}}); return(\@foo); } ############################################################################### # sub normalize { my ($self, $typeName, $value) = @_; if ($typeName eq "BASEINT") { # BASEINT $value = oct($value) if ($value =~ m/^0/); } elsif (isNumericDatatype($typeName)) { # Other numeric $value = $value + 0; } elsif (isWSNormalizable($typeName)) { # WS normalizing $value =~ s/\s+/ /g; $value =~ s/^ //g; $value =~ s/ $//g; } elsif ($typeName eq "boolean") { # Boolean $value = (!$value || $value eq "false") ? 0:1; } return($value); } 1; ############################################################################### ############################################################################### ############################################################################### # =pod =head1 Usage Type-checking for the XSD built-in datatypes, plus a few others. use Datatypes; my $dt = new Datatypes(); ... if ($dt->checkValueForType("typename", "!", $value)) { ... } Values are checked for lexical form, and numeric types are also checked for min and max values. The second argument should be "?" if a nil value is acceptable, otherwise "!" ("*" and "+" will likely be added). =for nobody ################################################################### =head1 Supported XSD built-in types =over =item * B: I. =item * B: I, I, I. =item * B: I, I, I, I, I, I, I, I, I, I, I, I, I. =item * B: I, I, I