#!/usr/bin/perl -w
#
# chr: Show char for a given code point number.
#
# 2007-10-29: Written by Steven J. DeRose, sderose@acm.org.
# 2008-01-02 sjd: Add -n. Fix name access for C1 range.
# 2008-09-03 sjd: Move to BSD.
#
# To do:
#     

use strict;
use Getopt::Long;

my $version = "2008-09-03";

# Option settings; GetOptions() below fills these in from the command line.
my $help = 0;       # -h / -help / -?
my $long = 0;       # -l: long names for control characters
my $multiple = 0;   # -m: decode a run of hex-encoded characters
my $nums = 0;       # -n: also show the number in several bases
my $quiet = 0;      # -q: accepted, but not consulted by the code visible here
my $verbose = 0;    # -v: may be repeated

Getopt::Long::Configure ("ignore_case");
my $result = GetOptions(
    "h|help|?"          => \$help,
    "l!"                => \$long,
    "m!"                => \$multiple,
    "n"                 => \$nums,
    "q|quiet!"          => \$quiet,
    "v|verbose+"        => \$verbose,
    "version"           => sub {
        die "Version of $version, by Steven J. DeRose, sderose\@acm.org.\n";
    }
    );

if ($help) { showUsage(); exit; }
($result) || die "Bad options.\n";

# An argument is required. Test definedness first so that a missing
# argument produces the error message rather than an "uninitialized" warning.
my $firstArg = $ARGV[0];
(defined $firstArg && $firstArg ne "") ||
    die "Must have a non-negative numeric argument.\n";

# Check the argument's syntax instead of calling oct() on it: oct() never
# returns a negative value (so it rejects nothing), and it reads a decimal
# such as "89" as octal, which warns under -w. With -m the argument is a long
# hex string that the -m code checks itself, so it is not tested here.
($multiple ||
 $firstArg =~ m/\A(?:0x[0-9a-f]+|0b[01]+|0[0-7]*|[1-9][0-9]*)\z/i) ||
    die "Must have a non-negative numeric argument.\n";

################################################################################

# Mnemonic tables for the C0 and C1 control characters
# (See chart at bottom)
# Each table holds 33 entries: the 32 controls, then one extra entry at
# index 32 (SPACE for code 32, NBS for code 160) that doOneChar() also uses.
my @C0names = qw(
    NUL SOH STX ETX EOT ENQ ACK BEL
    BS  HT  LF  VT  FF  CR  SO  SI
    DLE DC1 DC2 DC3 DC4 NAK SYN ETB
    CAN EM  SUB ESC FS  GS  RS  US
    SPACE
);

# PAD, HOP, and SGCI are listed as "XXX" in Unicode (acc. Wikipedia).
# Entries are indexed by (code point - 128).
my @C1names = qw(
    PAD HOP  BPH NBH IND NEL SSA ESA
    HTS HTJ  VTS PLD PLU RI  SS2 SS3
    DCS PU1  PU2 STS CCH MW  SPA EPA
    SOS SGCI SCI CSI ST  OSC PM  APC
    NBS
);

# Long names for code points 0..32, indexed directly by code point.
# Index 32 is the space character, which doOneChar() also names.
my @C0longNames = (
    # 0x00 - 0x07
    "Null",
    "Start Of Heading",
    "Start Of Text",
    "End Of Text",
    "End Of Transmission",
    "Enquiry",
    "Acknowledge",
    "Bell",

    # 0x08 - 0x0F
    "Backspace",
    "Horizontal Tab",
    "Newline",
    "Vertical Tab",
    "Form Feed",
    "Carriage Return",
    "Shift Out",
    "Shift In",

    # 0x10 - 0x17
    "Data Link Escape",
    "Device Control 1",
    "Device Control 2",
    "Device Control 3",
    "Device Control 4",
    "Negative Acknowledge",
    "Synchronous Idle",
    "End Of Transmission Block",

    # 0x18 - 0x1F
    "Cancel",
    "End Of Medium",
    "Substitute",
    "Escape",
    "File Separator",       # FS (0x1C) is the *File* separator, not "Field"
    "Group Separator",
    "Record Separator",
    "Unit Separator",

    # 0x20
    "Space");

# Long names for code points 128..160, indexed by (code point - 128).
# Index 32 is the non-breaking space (160), which doOneChar() also names.
my @C1longNames = (
    # 0x80 - 0x87
    "Padding Character",
    "High Octet Preset",
    "Break Permitted Here",
    "No Break Here",
    "Index",
    "Next Line",
    "Start of Selected Area",
    "End of Selected Area",

    # 0x88 - 0x8F
    "Horizontal Tab Set",
    "Horizontal Tab Justified",
    "Vertical Tab Set",
    "Partial Line Forward",
    "Partial Line Backward",
    "Reverse Line Feed",
    "Single-Shift 2",
    "Single-Shift 3",

    # 0x90 - 0x97
    "Device Control String",
    "Private Use 1",
    "Private Use 2",
    "Set Transmit State",
    "Cancel character",
    "Message Waiting",
    "Start of Protected Area",
    "End of Protected Area",

    # 0x98 - 0x9F
    "Start of String",
    "Single Graphic Char Intro",
    "Single Char Intro",
    "Control Sequence Introducer",
    "String Terminator",
    "OS Command",
    "Privacy Message",      # PM (0x9E) is "Privacy Message", not "Private"
    "App Program Command",

    # 0xA0
    "Non-breaking Space");


################################################################################

# Main dispatch: decode either a run of hex-encoded characters (-m) or a
# single code point taken from the first remaining command-line argument.
if ($multiple) {
    my $arg = shift;
    if ($arg =~ m/^0x/i) {
        # Drop the "0x" prefix, then walk the digits two at a time by index.
        # Using the remaining string itself as the loop condition would stop
        # early on a lone trailing "0" (a false value in Perl) and would run
        # substr() past the end of the string on odd-length input.
        my $digits = substr($arg, 2);
        my $total  = length($digits);
        for (my $pos = 0; $pos < $total; $pos += 2) {
            my $cur = "0x" . substr($digits, $pos, 2);
            print "$cur:  ";
            doOneChar($cur);
        }
    }
    else {
        print "Sorry, with -m string must start with '0x' for now.\n";
    }
}
else {
    doOneChar(shift);
}

exit;


################################################################################

# doOneChar($n): print the character, or the control-character name, for one
# code point.
#   $n  The code point. A value starting with "0" is converted with oct(),
#       which covers 0x... (hex), 0b... (binary) and 0... (octal); anything
#       else is used as a decimal number.
# Reads the file-level flags $long (long names), $verbose (class diagnostics
# on STDERR) and $nums (also print the number in several bases), and the
# C0/C1 name tables.
sub doOneChar {
    my $n = $_[0];
    $n = oct $n if ($n =~ m/^0/);

    if ($n < 32) {
        ($verbose) && warn "C0 control character.\n";
        if ($long) { print "$C0longNames[$n]\n"; }
        else       { print "$C0names[$n]\n"; }
    }
    elsif ($n == 32) {
        ($verbose) && warn "Space.\n";
        if ($long) { print "$C0longNames[$n]\n"; }
        else       { print "$C0names[$n]\n"; }
    }
    elsif ($n < 128) {
        ($verbose) && warn "G0 graphic character.\n";
        print chr($n) . "\n";
    }
    elsif ($n < 160) {
        ($verbose) && warn "C1 control character.\n";
        if ($long) { print "$C1longNames[$n-128]\n"; }
        else       { print "$C1names[$n-128]\n"; }
    }
    elsif ($n == 160) {
        # 160 is the non-breaking space, not a C1 control. It is still named
        # from the last slot of the C1 tables, just as 32 uses the C0 tables.
        ($verbose) && warn "Non-breaking space.\n";
        if ($long) { print "$C1longNames[$n-128]\n"; }
        else       { print "$C1names[$n-128]\n"; }
    }
    elsif ($n < 256) {
        ($verbose) && warn "G1 graphic character.\n";
        print chr($n) . "\n";
    }
    else {
        ($verbose) && warn "Out of Latin-1 range.\n";
        # Characters above 255 make print emit "Wide character" under -w.
        # Silence only that warning, and only here, so the bytes written
        # stay the same whatever layers STDOUT has.
        no warnings 'utf8';
        print chr($n) . "\n";
    }

    if ($nums) {
        printf("    0x%02x, 0d%d, 0%o, 0b%08b\n", $n, $n, $n, $n);
    }
}


################################################################################

# showUsage: emit the help text through warn(), so it goes to STDERR.
# The file-level $version is interpolated into the -version line.
sub showUsage {
    my $usage = <<"END_USAGE";

Usage: chr [options] [num]
    Displays the character corresponding to the code point [num].
    [num] may be in hexadecimal (0x...), octal (0...), binary (0b...), or decimal.
    Control characters and spaces will be displayed as mnemonics.
    Display of other characters depends on your terminal program.
Options:
    -l          Give long names for control characters, instead of mnemonics.
    -m          Try to decode many characters in a row (e.g., 0x4465526f7365...).
    -n          Also display [num] in multiple bases.
    -q          Suppress most messages.
    -v          Add more detailed messages.
    -version    Display version info and exit ($version, sjd).
WARNING:
    Most terminal programs assume Latin-1, while Perl most readily writes out
    UTF-8 Unicode. Interpreting UTF-8 as Latin-1 is usually wrong.
Related commands:
    ord:       Does the reverse.
    findChars: Locates chars in text by code, XML reference, URI escape, etc.
    nonascii:  Locates characters outside a given range, including XML character
                   references, URI escapes, etc.
END_USAGE
    warn $usage;
}