Created
March 28, 2025 11:32
-
-
Save Gro-Tsen/d650b9da751bfb5d7abbb0b8d0a3565f to your computer and use it in GitHub Desktop.
Convert UTF-8 data to the plaintext numbers and names of Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/local/bin/perl -w | |
use strict; | |
my $unidata = "/usr/share/unicode/UnicodeData.txt" // $ENV{"UNIDATA"}; | |
open F, "<", $unidata or die "Can't open $unidata"; | |
my @unidata; | |
while (<F>) { | |
my @l = split /\;/; | |
my $n = hex($l[0]); | |
$unidata[$n] = $l[1] unless $l[1] =~ /^\<.*\>$/; | |
} | |
close F; | |
sub putunichar { | |
my $c = shift; | |
if ( defined($unidata[$c]) ) { | |
printf "U+%04X %s\n", $c, $unidata[$c]; | |
} elsif ( $c>=0xAC00 && $c<=0xD7AF ) { | |
my $n = $c-0xAC00; | |
my $l = int($n/(21*28)); | |
my $v = (int($n/28))%21; | |
my $t = $n%28; | |
my @partL = ( "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", | |
"SS", "", "J", "JJ", "C", "K", "T", "P", "H" ); | |
my @partV = ( "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", | |
"WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI", | |
"I" ); | |
my @partT = ( "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", | |
"LM", "LB", "LS", "LT", "LP", "LH", "M", "B", "BS", "S", | |
"SS", "NG", "J", "C", "K", "T", "P", "H" ); | |
printf "U+%04X HANGUL SYLLABLE %s%s%s\n", $c, $partL[$l], $partV[$v], $partT[$t]; | |
} elsif ( ( $c>=0x3400 && $c<=0x4DB5 ) | |
|| ( $c>=0x4E00 && $c<=0x9FCB ) | |
|| ( $c>=0x20000 && $c<=0x2A6D6 ) | |
|| ( $c>=0x2A700 && $c<=0x2B734 ) | |
|| ( $c>=0x2B740 && $c<=0x2B81D ) ) { | |
printf "U+%04X CJK UNIFIED IDEOGRAPH-%04X\n", $c, $c; | |
} else { | |
printf "U+%04X\n", $c; | |
} | |
} | |
sub dosinglechar { | |
# Process a single (ASCII) character. | |
my $char = shift; | |
$char = ord($char); | |
putunichar $char; | |
} | |
sub dodoublechar { | |
my $ch1 = shift; | |
my $ch2 = shift; | |
$ch1 = ord($ch1); | |
$ch2 = ord($ch2); | |
putunichar ((($ch1-0300)<<6)|($ch2-0200)); | |
} | |
sub dotriplechar { | |
my $ch1 = shift; | |
my $ch2 = shift; | |
my $ch3 = shift; | |
$ch1 = ord($ch1); | |
$ch2 = ord($ch2); | |
$ch3 = ord($ch3); | |
putunichar ((($ch1-0340)<<12)|(($ch2-0200)<<6)|($ch3-0200)); | |
} | |
sub doquadruplechar { | |
my $ch1 = shift; | |
my $ch2 = shift; | |
my $ch3 = shift; | |
my $ch4 = shift; | |
$ch1 = ord($ch1); | |
$ch2 = ord($ch2); | |
$ch3 = ord($ch3); | |
$ch4 = ord($ch4); | |
putunichar ((($ch1-0360)<<18)|(($ch2-0200)<<12)|(($ch3-0200)<<6)|($ch4-0200)); | |
} | |
sub doquintuplechar { | |
my $ch1 = shift; | |
my $ch2 = shift; | |
my $ch3 = shift; | |
my $ch4 = shift; | |
my $ch5 = shift; | |
$ch1 = ord($ch1); | |
$ch2 = ord($ch2); | |
$ch3 = ord($ch3); | |
$ch4 = ord($ch4); | |
$ch5 = ord($ch5); | |
putunichar ((($ch1-0370)<<24)|(($ch2-0200)<<18)|(($ch3-0200)<<12)|(($ch4-0200)<<6)|($ch5-0200)); | |
} | |
sub dosextuplechar { | |
my $ch1 = shift; | |
my $ch2 = shift; | |
my $ch3 = shift; | |
my $ch4 = shift; | |
my $ch5 = shift; | |
my $ch6 = shift; | |
$ch1 = ord($ch1); | |
$ch2 = ord($ch2); | |
$ch3 = ord($ch3); | |
$ch4 = ord($ch4); | |
$ch5 = ord($ch5); | |
$ch6 = ord($ch6); | |
putunichar ((($ch1-0374)<<30)|(($ch2-0200)<<24)|(($ch3-0200)<<18)|(($ch4-0200)<<12)|(($ch5-0200)<<6)|($ch6-0200)); | |
} | |
while ( <> ) { | |
GOBBLE: | |
while ( 1 ) { | |
if ( s/^([\300-\337])([\200-\277])//s ) { | |
dodoublechar $1, $2; | |
} elsif ( s/^([\340-\357])([\200-\277])([\200-\277])//s ) { | |
dotriplechar $1, $2, $3; | |
} elsif ( s/^([\360-\367])([\200-\277])([\200-\277])([\200-\277])//s ) { | |
doquadruplechar $1, $2, $3, $4; | |
} elsif ( s/^([\370-\373])([\200-\277])([\200-\277])([\200-\277])([\200-\277])//s ) { | |
doquintuplechar $1, $2, $3, $4, $5; | |
} elsif ( s/^([\374-\375])([\200-\277])([\200-\277])([\200-\277])([\200-\277])([\200-\277])//s ) { | |
dosextuplechar $1, $2, $3, $4, $5, $6; | |
} elsif ( s/^(.)//s ) { | |
dosinglechar $1; | |
} else { | |
last GOBBLE; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment