#!/usr/local/bin/perl
#############################################################################
#
# $RCSfile: jis2uni.pl,v $
# $Date: 1999/03/25 21:56:36 $
# $Source: /home/richard/Xml/RCS/jis2uni.pl,v $
# $Revision: 1.4 $
# $Author: richard $
#
#############################################################################
#
# Converts Unicode-consortium eastasia tables into tables Xmlparse
# can use to convert the east-Asian codes to Unicode.  Developed
# originally for Japanese Shift-JIS encoding, but will also work
# for other Asian encodings.
#
# Usage: jis2uni.pl [-e encoding] [table-file ...]
#
#   -e encoding   Name embedded in the generated C identifiers and
#                 comments (defaults to "jis").
#
# Mapping tables are read from the named files, or from standard input
# when no file is given.  The generated C source is written to standard
# output and consists of:
#
#   max_<encoding>_char / min_<encoding>_char
#       The largest and smallest source code seen in the input.
#   <encoding>_gaps[][3]
#       One row per run of more than $GAP unmapped codes, terminated by
#       a { 0, 0, 0 } row.
#   unicode_<encoding>[]
#       The Unicode values, with short unmapped runs padded in place and
#       long runs (those listed in the gaps table) left out.
#
#############################################################################
#

use strict;
use warnings;

use Getopt::Std;

# Unmapped runs longer than this are recorded in the gaps table rather
# than being padded out in the mapping array.
my $GAP = 183;

# For Debugging purposes
#my $GAP = 2;

MAIN:
{
    my %opts;
    getopts('e:', \%opts);
    my $encoding = $opts{e} || "jis";

    # Source code => [ Unicode value, trailing comment text ]
    my %jistbl;
    my $maxjis = 0;
    my $minjis = 65535;

    while (my $line = <>) {
        next if $line =~ /^\s*\#/;

        if ($line =~ /^\s*0x[0-9A-F]+\s+0x[0-9A-F]+\s+0x[0-9A-F]+/i) {
            # Three-column table, Shift-JIS, 0208, and comment; remove
            # the first (Shift-JIS) column.
            $line =~ s/^\s*0x[0-9A-F]+\s+//i;
        }

        # Two column table, with a comment at the end.  Lines that do not
        # start with two hexadecimal columns are ignored.
        next unless $line =~ /^\s*0x([0-9A-F]+)\s+0x([0-9A-F]+)(.*)$/i;

        my ($jis, $uni, $comment) = (hex($1), hex($2), $3);
        $comment =~ s/^\s*\#\s*//;
        $comment =~ s/\s+$//;

        $jistbl{$jis} = [$uni, $comment];
        $maxjis = $jis if $jis > $maxjis;
        $minjis = $jis if $jis < $minjis;
    }

    # The encoding name and the table comments are always passed to printf
    # as %s arguments, never interpolated into the format string, so a
    # literal '%' in either of them cannot be taken for a conversion.
    print "/* Maximum value for a $encoding character */\n";
    printf "const unsigned int max_%s_char = 0x%04x;\n", $encoding, $maxjis;
    printf "const unsigned int min_%s_char = 0x%04x;\n\n", $encoding, $minjis;

    my @jislist = sort numerically keys %jistbl;

    # Pass 1: count the gaps and the number of entries (real mappings plus
    # padding) that the mapping array will hold, so both arrays can be
    # declared with their exact sizes.
    my $gaps       = 0;
    my $printcount = 0;
    my $lastkey    = -1;
    foreach my $key (@jislist) {
        die ("Geez, last key, $lastkey, is greater than key, $key\n")
            if $lastkey > $key;

        if (($lastkey + $GAP) < $key) {
            # Long run of unmapped codes: it is skipped, not padded.
            $gaps++;
            $lastkey = $key - 1;
        }

        # One entry for $key itself plus one padding entry for every
        # unmapped code between the previous key and this one.
        $printcount += $key - $lastkey;
        $lastkey = $key;
    }

    # Pass 2: emit the gaps table.
    my $count  = 0;
    my $off_by = 0;
    $lastkey = -1;

    print "/* List of gaps of more than $GAP in the sparse unicode_$encoding array.\n";
    print " * Fields 1-2 are the start-end of a gap in the unicode_$encoding array;\n";
    print " * field 3 is how far off we are at character (end + 1).\n";
    print " */\n";
    printf "const unsigned int %s_gaps[%d][3] = {\n", $encoding, $gaps + 1;

    foreach my $key (@jislist) {
        if (($lastkey + $GAP) < $key) {
            my $gap_start = $lastkey + 1;
            my $gap_end   = $key - 1;

            $count++;
            $off_by += ($key - $lastkey) - 1;
            printf " /* Gap %d; decimal %d, %d, %d */\n",
                $count, $gap_start, $gap_end, $off_by;
            printf " { 0x%04x, 0x%04x, 0x%04x },\n",
                $gap_start, $gap_end, $off_by;
        }
        $lastkey = $key;
    }

    print " /* How we tell we're at the end of this array */\n";
    print " { 0, 0, 0 }\n";
    print "};\n\n";

    # Pass 3: emit the mapping array itself.
    printf "/* %s-to-Unicode mappings (starts at 0x%04x) */\n", $encoding, $minjis;
    printf "const unsigned int unicode_%s[%d] = {\n", $encoding, $printcount;

    $count   = 0;
    $off_by  = 0;
    $lastkey = -1;

    foreach my $key (@jislist) {
        if (($lastkey + $GAP) < $key) {
            # Mark this gap in the sequence
            $off_by += ($key - $lastkey) - 1;
            print "\n /* Gap number ", ++$count,
                "; previous lines off by $off_by */\n\n";
            $lastkey = $key - 1;
        }

        # Pad the short run of unmapped codes before $key: codes up to
        # 0xFF are emitted as themselves, anything above as 0xFFFD.
        while (++$lastkey < $key) {
            if ($lastkey <= 0xFF) {
                printf " 0x%04x,\t/* not strictly a %s char */\n",
                    $lastkey, $encoding;
            }
            else {
                print " 0xFFFD,\t/* just padding (no $encoding char here) */\n";
            }
        }

        # Give the unicode translation for $key; the last entry of the
        # array gets no trailing comma.
        my ($uni, $note) = @{ $jistbl{$key} };
        my $comma   = ($key == $maxjis) ? "" : ",";
        my $comment = $note
            ? "\t/* $encoding value, $key; $note */"
            : "";
        printf " 0x%04x%s%s\n", $uni, $comma, $comment;
    }

    print "};\n";
}

# Sort comparator: orders the source codes by ascending numeric value.
sub numerically {
    return $a <=> $b;
}