granicus.if.org Git - postgresql/commitdiff
Rewrite the Perl scripts that produce our Unicode conversion tables.
author: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Wed, 30 Nov 2016 12:54:02 +0000 (14:54 +0200)
committer: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Wed, 30 Nov 2016 12:54:52 +0000 (14:54 +0200)
Generate EUC_CN mappings from gb-18030-2000.xml, because GB2312.TXT is no
longer available.

Get UHC from windows-949-2000.xml, because it is more up to date.

Plus tons more small changes. With these changes, the perl scripts
faithfully produce the *.map files we have in the repository, from the
external source files.

In passing, fix the Makefile to also download CP932.TXT and CP950.TXT.

Based on patches by Kyotaro Horiguchi, reviewed by Daniel Gustafsson.

Discussion: https://postgr.es/m/08e7892a-d55c-eefe-76e6-7910bc8dd1f3@iki.fi

33 files changed:
src/backend/utils/mb/Unicode/Makefile
src/backend/utils/mb/Unicode/UCS_to_BIG5.pl
src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl
src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl
src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl
src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl
src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl
src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl [new file with mode: 0755]
src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl
src/backend/utils/mb/Unicode/UCS_to_SJIS.pl
src/backend/utils/mb/Unicode/UCS_to_UHC.pl [new file with mode: 0755]
src/backend/utils/mb/Unicode/UCS_to_most.pl
src/backend/utils/mb/Unicode/convutils.pm [new file with mode: 0644]
src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8.map
src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8_combined.map
src/backend/utils/mb/Unicode/euc_jp_to_utf8.map
src/backend/utils/mb/Unicode/euc_kr_to_utf8.map
src/backend/utils/mb/Unicode/johab_to_utf8.map
src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8.map
src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8_combined.map
src/backend/utils/mb/Unicode/ucs2utf.pl [deleted file]
src/backend/utils/mb/Unicode/uhc_to_utf8.map
src/backend/utils/mb/Unicode/utf8_to_euc_cn.map
src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004.map
src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004_combined.map
src/backend/utils/mb/Unicode/utf8_to_euc_jp.map
src/backend/utils/mb/Unicode/utf8_to_euc_kr.map
src/backend/utils/mb/Unicode/utf8_to_johab.map
src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004.map
src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004_combined.map
src/backend/utils/mb/Unicode/utf8_to_sjis.map
src/backend/utils/mb/Unicode/utf8_to_uhc.map

index 9d2ef5e3d22e7eaa6b4779cfbc958582c0a3e76a..ea21f4a8527084a45ef5557fb201211d04b1fda3 100644 (file)
@@ -39,8 +39,6 @@ WINMAPS = win866_to_utf8.map utf8_to_win866.map \
        win1258_to_utf8.map utf8_to_win1258.map
 
 GENERICMAPS = $(ISO8859MAPS) $(WINMAPS) \
-       johab_to_utf8.map utf8_to_johab.map \
-       uhc_to_utf8.map utf8_to_uhc.map \
        gbk_to_utf8.map utf8_to_gbk.map \
        koi8r_to_utf8.map utf8_to_koi8r.map
 
@@ -51,6 +49,8 @@ SPECIALMAPS = euc_cn_to_utf8.map utf8_to_euc_cn.map \
        sjis_to_utf8.map utf8_to_sjis.map \
        gb18030_to_utf8.map utf8_to_gb18030.map \
        big5_to_utf8.map utf8_to_big5.map \
+       johab_to_utf8.map utf8_to_johab.map \
+       uhc_to_utf8.map utf8_to_uhc.map \
        euc_jis_2004_to_utf8.map euc_jis_2004_to_utf8_combined.map \
        utf8_to_euc_jis_2004.map utf8_to_euc_jis_2004_combined.map \
        shift_jis_2004_to_utf8.map shift_jis_2004_to_utf8_combined.map \
@@ -63,23 +63,29 @@ ISO8859TEXTS = 8859-2.TXT 8859-3.TXT 8859-4.TXT 8859-5.TXT \
        8859-10.TXT 8859-13.TXT 8859-14.TXT 8859-15.TXT \
        8859-16.TXT
 
-WINTEXTS = CP866.TXT CP874.TXT CP936.TXT CP949.TXT \
+WINTEXTS = CP866.TXT CP874.TXT CP936.TXT \
        CP1250.TXT CP1251.TXT \
        CP1252.TXT CP1253.TXT CP1254.TXT CP1255.TXT \
        CP1256.TXT CP1257.TXT CP1258.TXT
 
 GENERICTEXTS = $(ISO8859TEXTS) $(WINTEXTS) \
-       KOI8-R.TXT KOI8-U.TXT JOHAB.TXT
+       KOI8-R.TXT KOI8-U.TXT
 
 all: $(MAPS)
 
 $(GENERICMAPS): UCS_to_most.pl $(GENERICTEXTS)
        $(PERL) $<
 
-euc_jp_to_utf8.map utf8_to_euc_jp.map: UCS_to_EUC_JP.pl JIS0201.TXT JIS0208.TXT JIS0212.TXT
+johab_to_utf8.map utf8_to_johab.map: UCS_to_JOHAB.pl JOHAB.TXT
+       $(PERL) $<
+
+uhc_to_utf8.map utf8_to_uhc.map: UCS_to_UHC.pl windows-949-2000.xml
+       $(PERL) $<
+
+euc_jp_to_utf8.map utf8_to_euc_jp.map: UCS_to_EUC_JP.pl CP932.TXT JIS0212.TXT
        $(PERL) $<
 
-euc_cn_to_utf8.map utf8_to_euc_cn.map: UCS_to_EUC_CN.pl GB2312.TXT
+euc_cn_to_utf8.map utf8_to_euc_cn.map: UCS_to_EUC_CN.pl gb-18030-2000.xml
        $(PERL) $<
 
 euc_kr_to_utf8.map utf8_to_euc_kr.map: UCS_to_EUC_KR.pl KSX1001.TXT
@@ -119,7 +125,7 @@ BIG5.TXT CNS11643.TXT:
 euc-jis-2004-std.txt sjis-0213-2004-std.txt:
        $(DOWNLOAD) http://x0213.org/codetable/$(@F)
 
-gb-18030-2000.xml:
+gb-18030-2000.xml windows-949-2000.xml:
        $(DOWNLOAD) https://ssl.icu-project.org/repos/icu/data/trunk/charset/data/xml/$(@F)
 
 GB2312.TXT:
@@ -137,7 +143,7 @@ KOI8-R.TXT KOI8-U.TXT:
 $(ISO8859TEXTS):
        $(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/ISO8859/$(@F)
 
-$(filter-out CP8%,$(WINTEXTS)):
+$(filter-out CP8%,$(WINTEXTS)) CP932.TXT CP950.TXT:
        $(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/$(@F)
 
 $(filter CP8%,$(WINTEXTS)):
index 127fd157b07db238c913ee606ad6b9395c783a4e..6a1321bab84eed95dd6d2059fbd70b6d09779ded 100755 (executable)
 #               # and Unicode name (not used in this script)
 
 
-require "ucs2utf.pl";
+require "convutils.pm";
 
+# Load BIG5.TXT
+my $all = &read_source("BIG5.TXT");
 
-#
-# first, generate UTF8 --> BIG5 table
-#
-$in_file = "BIG5.TXT";
-
-open(FILE, $in_file) || die("cannot open $in_file");
-
-reset 'array';
+# Load CP950.TXT
+my $cp950txt = &read_source("CP950.TXT");
 
-while (<FILE>)
-{
-       chop;
-       if (/^#/)
-       {
-               next;
-       }
-       ($c, $u, $rest) = split;
-       $ucs  = hex($u);
-       $code = hex($c);
-       if ($code >= 0x80 && $ucs >= 0x0080)
-       {
-               $utf = &ucs2utf($ucs);
-               if ($array{$utf} ne "")
-               {
-                       printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
-                       next;
-               }
-               $count++;
-               $array{$utf} = $code;
-       }
-}
-close(FILE);
-
-$in_file = "CP950.TXT";
-
-open(FILE, $in_file) || die("cannot open $in_file");
-
-while (<FILE>)
-{
-       chop;
-       if (/^#/)
-       {
-               next;
-       }
-       ($c, $u, $rest) = split;
-       $ucs  = hex($u);
-       $code = hex($c);
+foreach my $i (@$cp950txt) {
+       my $code = $i->{code};
+       my $ucs = $i->{ucs};
 
        # Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc
        # from CP950.TXT
@@ -83,126 +44,25 @@ while (<FILE>)
                && $code >= 0xf9d6
                && $code <= 0xf9dc)
        {
-               $utf = &ucs2utf($ucs);
-               if ($array{$utf} ne "")
-               {
-                       printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
-                       next;
-               }
-               $count++;
-               $array{$utf} = $code;
+               push @$all, {code => $code,
+                                        ucs => $ucs,
+                                        comment => $i->{comment},
+                                        direction => "both"};
        }
 }
-close(FILE);
-
-$file = lc("utf8_to_big5.map");
-open(FILE, "> $file") || die("cannot open $file");
-
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_utf_to_local ULmapBIG5[ $count ] = {\n";
-
-for $index (sort { $a <=> $b } keys(%array))
-{
-       $code = $array{$index};
-       $count--;
-       if ($count == 0)
-       {
-               printf FILE "  {0x%04x, 0x%04x}\n", $index, $code;
-       }
-       else
-       {
-               printf FILE "  {0x%04x, 0x%04x},\n", $index, $code;
-       }
-}
-
-print FILE "};\n";
-close(FILE);
-
-#
-# then generate BIG5 --> UTF8 table
-#
-$in_file = "BIG5.TXT";
 
-open(FILE, $in_file) || die("cannot open $in_file");
+foreach my $i (@$all) {
+       my $code = $i->{code};
+       my $ucs = $i->{ucs};
 
-reset 'array';
-
-while (<FILE>)
-{
-       chop;
-       if (/^#/)
-       {
-               next;
-       }
-       ($c, $u, $rest) = split;
-       $ucs  = hex($u);
-       $code = hex($c);
-       if ($code >= 0x80 && $ucs >= 0x0080)
-       {
-               $utf = &ucs2utf($ucs);
-               if ($array{$utf} ne "")
-               {
-                       printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
-                       next;
-               }
-               $count++;
-               $array{$code} = $utf;
-       }
-}
-close(FILE);
-
-$in_file = "CP950.TXT";
-
-open(FILE, $in_file) || die("cannot open $in_file");
-
-while (<FILE>)
-{
-       chop;
-       if (/^#/)
-       {
-               next;
-       }
-       ($c, $u, $rest) = split;
-       $ucs  = hex($u);
-       $code = hex($c);
-
-       # Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc
-       # from CP950.TXT
-       if (   $code >= 0x80
-               && $ucs >= 0x0080
-               && $code >= 0xf9d6
-               && $code <= 0xf9dc)
-       {
-               $utf = &ucs2utf($ucs);
-               if ($array{$utf} ne "")
-               {
-                       printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
-                       next;
-               }
-               $count++;
-               $array{$code} = $utf;
-       }
-}
-close(FILE);
-
-$file = lc("big5_to_utf8.map");
-open(FILE, "> $file") || die("cannot open $file");
-
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_local_to_utf LUmapBIG5[ $count ] = {\n";
-for $index (sort { $a <=> $b } keys(%array))
-{
-       $utf = $array{$index};
-       $count--;
-       if ($count == 0)
-       {
-               printf FILE "  {0x%04x, 0x%04x}\n", $index, $utf;
-       }
-       else
+       # BIG5.TXT maps several BIG5 characters to U+FFFD. The UTF-8 to BIG5 mapping can
+       # contain only one of them. XXX: Doesn't really make sense to include any of them,
+       # but for historical reasons, we map the first one of them.
+       if ($i->{ucs} == 0xFFFD && $i->{code} != 0xA15A)
        {
-               printf FILE "  {0x%04x, 0x%04x},\n", $index, $utf;
+               $i->{direction} = "to_unicode";
        }
 }
 
-print FILE "};\n";
-close(FILE);
+# Output
+print_tables("BIG5", $all);
index 53f44773c938fa63d7bc22d98dd6d834420b2b35..8df23f8be65fae3ff3b90c32328b2dc49c592b32 100755 (executable)
 #! /usr/bin/perl
 #
-# Copyright (c) 2001-2016, PostgreSQL Global Development Group
+# Copyright (c) 2007-2016, PostgreSQL Global Development Group
 #
-# src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl
+# src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl
 #
-# Generate UTF-8 <--> EUC_CN code conversion tables from
-# map files provided by Unicode organization.
-# Unfortunately it is prohibited by the organization
-# to distribute the map files. So if you try to use this script,
-# you have to obtain GB2312.TXT from
-# the organization's ftp site.
+# Generate UTF-8 <--> EUC_CN code conversion tables from
+# "gb-18030-2000.xml", obtained from
+# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/
 #
-# GB2312.TXT format:
-#               GB2312 code in hex
-#               UCS-2 code in hex
-#               # and Unicode name (not used in this script)
+# The lines we care about in the source file look like
+#    <a u="009A" b="81 30 83 36"/>
+# where the "u" field is the Unicode code point in hex,
+# and the "b" field is the hex byte sequence for GB18030
 
-require "ucs2utf.pl";
+require "convutils.pm";
 
-# first generate UTF-8 --> EUC_CN table
+# Read the input
 
-$in_file = "GB2312.TXT";
+$in_file = "gb-18030-2000.xml";
 
 open(FILE, $in_file) || die("cannot open $in_file");
 
+my @mapping;
+
 while (<FILE>)
 {
-       chop;
-       if (/^#/)
-       {
-               next;
-       }
-       ($c, $u, $rest) = split;
+       next if (!m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/);
+       $u = $1;
+       $c = $2;
+       $c =~ s/ //g;
        $ucs  = hex($u);
        $code = hex($c);
-       if ($code >= 0x80 && $ucs >= 0x0080)
-       {
-               $utf = &ucs2utf($ucs);
-               if ($array{$utf} ne "")
-               {
-                       printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
-                       next;
-               }
-               $count++;
-
-               $array{$utf} = ($code | 0x8080);
-       }
-}
-close(FILE);
-
-$file = "utf8_to_euc_cn.map";
-open(FILE, "> $file") || die("cannot open $file");
 
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_utf_to_local ULmapEUC_CN[ $count ] = {\n";
-
-for $index (sort { $a <=> $b } keys(%array))
-{
-       $code = $array{$index};
-       $count--;
-       if ($count == 0)
+       # The GB-18030 character set, which we use as the source, contains
+       # a lot of extra characters on top of the GB2312 character set that
+       # EUC_CN encodes. Filter out those extra characters.
+       next if (($code & 0xFF) < 0xA1);
+       next if (!($code >= 0xA100 && $code <= 0xA9FF ||
+                          $code >= 0xB000 && $code <= 0xF7FF));
+
+       next if ($code >= 0xA2A1 && $code <= 0xA2B0);
+       next if ($code >= 0xA2E3 && $code <= 0xA2E4);
+       next if ($code >= 0xA2EF && $code <= 0xA2F0);
+       next if ($code >= 0xA2FD && $code <= 0xA2FE);
+       next if ($code >= 0xA4F4 && $code <= 0xA4FE);
+       next if ($code >= 0xA5F7 && $code <= 0xA5FE);
+       next if ($code >= 0xA6B9 && $code <= 0xA6C0);
+       next if ($code >= 0xA6D9 && $code <= 0xA6FE);
+       next if ($code >= 0xA7C2 && $code <= 0xA7D0);
+       next if ($code >= 0xA7F2 && $code <= 0xA7FE);
+       next if ($code >= 0xA8BB && $code <= 0xA8C4);
+       next if ($code >= 0xA8EA && $code <= 0xA8FE);
+       next if ($code >= 0xA9A1 && $code <= 0xA9A3);
+       next if ($code >= 0xA9F0 && $code <= 0xA9FE);
+       next if ($code >= 0xD7FA && $code <= 0xD7FE);
+
+       # A couple of characters are mapped differently from GB-2312 or GB-18030
+       if ($code == 0xA1A4)
        {
-               printf FILE "  {0x%04x, 0x%04x}\n", $index, $code;
+               $ucs = 0x30FB;
        }
-       else
-       {
-               printf FILE "  {0x%04x, 0x%04x},\n", $index, $code;
-       }
-}
-
-print FILE "};\n";
-close(FILE);
-
-#
-# then generate EUC_CN --> UTF8 table
-#
-reset 'array';
-
-open(FILE, $in_file) || die("cannot open $in_file");
-
-while (<FILE>)
-{
-       chop;
-       if (/^#/)
+       if ($code == 0xA1AA)
        {
-               next;
+               $ucs = 0x2015;
        }
-       ($c, $u, $rest) = split;
-       $ucs  = hex($u);
-       $code = hex($c);
-       if ($code >= 0x80 && $ucs >= 0x0080)
-       {
-               $utf = &ucs2utf($ucs);
-               if ($array{$code} ne "")
-               {
-                       printf STDERR "Warning: duplicate code: %04x\n", $ucs;
-                       next;
-               }
-               $count++;
 
-               $code |= 0x8080;
-               $array{$code} = $utf;
+       push @mapping, {
+               ucs => $ucs,
+               code => $code,
+               direction => 'both'
        }
 }
 close(FILE);
 
-$file = "euc_cn_to_utf8.map";
-open(FILE, "> $file") || die("cannot open $file");
-
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_local_to_utf LUmapEUC_CN[ $count ] = {\n";
-for $index (sort { $a <=> $b } keys(%array))
-{
-       $utf = $array{$index};
-       $count--;
-       if ($count == 0)
-       {
-               printf FILE "  {0x%04x, 0x%04x}\n", $index, $utf;
-       }
-       else
-       {
-               printf FILE "  {0x%04x, 0x%04x},\n", $index, $utf;
-       }
-}
-
-print FILE "};\n";
-close(FILE);
+print_tables("EUC_CN", \@mapping);
index d2f1b757cb30ac3c1fb86c7b78dacca1682ebf35..b4e140b657c993be291dfafa3ea66d13181341af 100755 (executable)
@@ -7,9 +7,7 @@
 # Generate UTF-8 <--> EUC_JIS_2004 code conversion tables from
 # "euc-jis-2004-std.txt" (http://x0213.org)
 
-require "ucs2utf.pl";
-
-$TEST = 0;
+require "convutils.pm";
 
 # first generate UTF-8 --> EUC_JIS_2004 table
 
@@ -17,10 +15,7 @@ $in_file = "euc-jis-2004-std.txt";
 
 open(FILE, $in_file) || die("cannot open $in_file");
 
-reset 'array';
-reset 'array1';
-reset 'comment';
-reset 'comment1';
+my @all;
 
 while ($line = <FILE>)
 {
@@ -31,14 +26,14 @@ while ($line = <FILE>)
                $u2             = $3;
                $rest           = "U+" . $u1 . "+" . $u2 . $4;
                $code           = hex($c);
-               $ucs            = hex($u1);
-               $utf1           = &ucs2utf($ucs);
-               $ucs            = hex($u2);
-               $utf2           = &ucs2utf($ucs);
-               $str            = sprintf "%08x%08x", $utf1, $utf2;
-               $array1{$str}   = $code;
-               $comment1{$str} = $rest;
-               $count1++;
+               $ucs1           = hex($u1);
+               $ucs2           = hex($u2);
+
+               push @all, { direction => 'both',
+                                        ucs => $ucs1,
+                                        ucs_second => $ucs2,
+                                        code => $code,
+                                        comment => $rest };
                next;
        }
        elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
@@ -54,252 +49,11 @@ while ($line = <FILE>)
 
        $ucs  = hex($u);
        $code = hex($c);
-       $utf  = &ucs2utf($ucs);
-       if ($array{$utf} ne "")
-       {
-               printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
-               next;
-       }
-       $count++;
-
-       $array{$utf}    = $code;
-       $comment{$code} = $rest;
-}
-close(FILE);
-
-$file = "utf8_to_euc_jis_2004.map";
-open(FILE, "> $file") || die("cannot open $file");
-print FILE "/*\n";
-print FILE " * This file was generated by UCS_to_EUC_JIS_2004.pl\n";
-print FILE " */\n";
-print FILE "static const pg_utf_to_local ULmapEUC_JIS_2004[] = {\n";
-
-for $index (sort { $a <=> $b } keys(%array))
-{
-       $code = $array{$index};
-       $count--;
-       if ($count == 0)
-       {
-               printf FILE "  {0x%08x, 0x%06x} /* %s */\n", $index, $code,
-                 $comment{$code};
-       }
-       else
-       {
-               printf FILE "  {0x%08x, 0x%06x},        /* %s */\n", $index, $code,
-                 $comment{$code};
-       }
-}
-
-print FILE "};\n";
-close(FILE);
-
-if ($TEST == 1)
-{
-       $file1 = "utf8.data";
-       $file2 = "euc_jis_2004.data";
-       open(FILE1, "> $file1") || die("cannot open $file1");
-       open(FILE2, "> $file2") || die("cannot open $file2");
-
-       for $index (sort { $a <=> $b } keys(%array))
-       {
-               $code = $array{$index};
-               if (   $code > 0x00
-                       && $code != 0x09
-                       && $code != 0x0a
-                       && $code != 0x0d
-                       && $code != 0x5c
-                       && (   $code < 0x80
-                               || ($code >= 0x8ea1   && $code <= 0x8efe)
-                               || ($code >= 0x8fa1a1 && $code <= 0x8ffefe)
-                               || ($code >= 0xa1a1   && $code <= 0x8fefe)))
-               {
-                       for ($i = 3; $i >= 0; $i--)
-                       {
-                               $s    = $i * 8;
-                               $mask = 0xff << $s;
-                               print FILE1 pack("C", ($index & $mask) >> $s)
-                                 if $index & $mask;
-                               print FILE2 pack("C", ($code & $mask) >> $s) if $code & $mask;
-                       }
-                       print FILE1 "\n";
-                       print FILE2 "\n";
-               }
-       }
-}
 
-$file = "utf8_to_euc_jis_2004_combined.map";
-open(FILE, "> $file") || die("cannot open $file");
-print FILE "/*\n";
-print FILE " * This file was generated by UCS_to_EUC_JIS_2004.pl\n";
-print FILE " */\n";
-print FILE
-  "static const pg_utf_to_local_combined ULmapEUC_JIS_2004_combined[] = {\n";
+       next if ($code < 0x80 && $ucs < 0x80);
 
-for $index (sort { $a cmp $b } keys(%array1))
-{
-       $code = $array1{$index};
-       $count1--;
-       if ($count1 == 0)
-       {
-               printf FILE "  {0x%s, 0x%s, 0x%06x}     /* %s */\n", substr($index, 0, 8),
-                 substr($index, 8, 8), $code, $comment1{$index};
-       }
-       else
-       {
-               printf FILE "  {0x%s, 0x%s, 0x%06x},    /* %s */\n",
-                 substr($index, 0, 8), substr($index, 8, 8), $code,
-                 $comment1{$index};
-       }
+       push @all, { direction => 'both', ucs => $ucs, code => $code, comment => $rest };
 }
-
-print FILE "};\n";
 close(FILE);
 
-if ($TEST == 1)
-{
-       for $index (sort { $a cmp $b } keys(%array1))
-       {
-               $code = $array1{$index};
-               if (   $code > 0x00
-                       && $code != 0x09
-                       && $code != 0x0a
-                       && $code != 0x0d
-                       && $code != 0x5c
-                       && (   $code < 0x80
-                               || ($code >= 0x8ea1   && $code <= 0x8efe)
-                               || ($code >= 0x8fa1a1 && $code <= 0x8ffefe)
-                               || ($code >= 0xa1a1   && $code <= 0x8fefe)))
-               {
-
-                       $v1 = hex(substr($index, 0, 8));
-                       $v2 = hex(substr($index, 8, 8));
-
-                       for ($i = 3; $i >= 0; $i--)
-                       {
-                               $s    = $i * 8;
-                               $mask = 0xff << $s;
-                               print FILE1 pack("C", ($v1 & $mask) >> $s)   if $v1 & $mask;
-                               print FILE2 pack("C", ($code & $mask) >> $s) if $code & $mask;
-                       }
-                       for ($i = 3; $i >= 0; $i--)
-                       {
-                               $s    = $i * 8;
-                               $mask = 0xff << $s;
-                               print FILE1 pack("C", ($v2 & $mask) >> $s) if $v2 & $mask;
-                       }
-                       print FILE1 "\n";
-                       print FILE2 "\n";
-               }
-       }
-       close(FILE1);
-       close(FILE2);
-}
-
-# then generate EUC_JIS_2004 --> UTF-8 table
-
-$in_file = "euc-jis-2004-std.txt";
-
-open(FILE, $in_file) || die("cannot open $in_file");
-
-reset 'array';
-reset 'array1';
-reset 'comment';
-reset 'comment1';
-
-while ($line = <FILE>)
-{
-       if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/)
-       {
-               $c               = $1;
-               $u1              = $2;
-               $u2              = $3;
-               $rest            = "U+" . $u1 . "+" . $u2 . $4;
-               $code            = hex($c);
-               $ucs             = hex($u1);
-               $utf1            = &ucs2utf($ucs);
-               $ucs             = hex($u2);
-               $utf2            = &ucs2utf($ucs);
-               $str             = sprintf "%08x%08x", $utf1, $utf2;
-               $array1{$code}   = $str;
-               $comment1{$code} = $rest;
-               $count1++;
-               next;
-       }
-       elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
-       {
-               $c    = $1;
-               $u    = $2;
-               $rest = "U+" . $u . $3;
-       }
-       else
-       {
-               next;
-       }
-
-       $ucs  = hex($u);
-       $code = hex($c);
-       $utf  = &ucs2utf($ucs);
-       if ($array{$code} ne "")
-       {
-               printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
-               next;
-       }
-       $count++;
-
-       $array{$code}  = $utf;
-       $comment{$utf} = $rest;
-}
-close(FILE);
-
-$file = "euc_jis_2004_to_utf8.map";
-open(FILE, "> $file") || die("cannot open $file");
-print FILE "/*\n";
-print FILE " * This file was generated by UCS_to_EUC_JIS_2004.pl\n";
-print FILE " */\n";
-print FILE "static const pg_local_to_utf LUmapEUC_JIS_2004[] = {\n";
-
-for $index (sort { $a <=> $b } keys(%array))
-{
-       $code = $array{$index};
-       $count--;
-       if ($count == 0)
-       {
-               printf FILE "  {0x%06x, 0x%08x} /* %s */\n", $index, $code,
-                 $comment{$code};
-       }
-       else
-       {
-               printf FILE "  {0x%06x, 0x%08x},        /* %s */\n", $index, $code,
-                 $comment{$code};
-       }
-}
-
-print FILE "};\n";
-close(FILE);
-
-$file = "euc_jis_2004_to_utf8_combined.map";
-open(FILE, "> $file") || die("cannot open $file");
-print FILE "/*\n";
-print FILE " * This file was generated by UCS_to_EUC_JIS_2004.pl\n";
-print FILE " */\n";
-print FILE
-  "static const pg_local_to_utf_combined LUmapEUC_JIS_2004_combined[] = {\n";
-
-for $index (sort { $a <=> $b } keys(%array1))
-{
-       $code = $array1{$index};
-       $count1--;
-       if ($count1 == 0)
-       {
-               printf FILE "  {0x%06x, 0x%s, 0x%s}     /* %s */\n", $index,
-                 substr($code, 0, 8), substr($code, 8, 8), $comment1{$index};
-       }
-       else
-       {
-               printf FILE "  {0x%06x, 0x%s, 0x%s},    /* %s */\n", $index,
-                 substr($code, 0, 8), substr($code, 8, 8), $comment1{$index};
-       }
-}
-
-print FILE "};\n";
-close(FILE);
+print_tables("EUC_JIS_2004", \@all, 1);
index 055fc849bae4ee70933eb1e52ca5ca7bb5ebda25..0e9dd292bff11c58df0000cd563cbc59c5b76229 100755 (executable)
 # map files provided by Unicode organization.
 # Unfortunately it is prohibited by the organization
 # to distribute the map files. So if you try to use this script,
-# you have to obtain JIS0201.TXT, JIS0208.TXT, JIS0212.TXT from
-# the organization's ftp site.
-#
-# JIS0201.TXT format:
-#               JIS0201 code in hex
-#               UCS-2 code in hex
-#               # and Unicode name (not used in this script)
-#
-# JIS0208.TXT format:
-#               JIS0208 shift-JIS code in hex
-#               JIS0208 code in hex
-#               UCS-2 code in hex
-#               # and Unicode name (not used in this script)
-#
-# JIS0212.TXT format:
-#               JIS0212 code in hex
-#               UCS-2 code in hex
-#               # and Unicode name (not used in this script)
-
-require "ucs2utf.pl";
-
-# first generate UTF-8 --> EUC_JP table
+# you have to obtain CP932.TXT and JIS0212.TXT from the
+# organization's ftp site.
 
-#
-# JIS0201
-#
-$in_file = "JIS0201.TXT";
+use strict;
+require "convutils.pm";
 
-open(FILE, $in_file) || die("cannot open $in_file");
+# Load JIS0212.TXT
+my $jis0212 = &read_source("JIS0212.TXT");
 
-reset 'array';
+my @mapping;
 
-while (<FILE>)
-{
-       chop;
-       if (/^#/)
+foreach my $i (@$jis0212) {
+       # We have a different mapping for this in the EUC_JP to UTF-8 direction.
+       if ($i->{code} == 0x2243)
        {
-               next;
+               $i->{direction} = "from_unicode";
        }
-       ($c, $u, $rest) = split;
-       $ucs  = hex($u);
-       $code = hex($c);
-       if ($code >= 0x80 && $ucs >= 0x0080)
-       {
-               $utf = &ucs2utf($ucs);
-               if ($array{$utf} ne "")
-               {
-                       printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
-                       next;
-               }
-               $count++;
 
-               # add single shift 2
-               $array{$utf} = ($code | 0x8e00);
+       if ($i->{code} == 0x2271)
+       {
+               $i->{direction} = "to_unicode";
        }
-}
-close(FILE);
-
-#
-# JIS0208
-#
-$in_file = "JIS0208.TXT";
 
-open(FILE, $in_file) || die("cannot open $in_file");
-
-while (<FILE>)
-{
-       chop;
-       if (/^#/)
+       if ($i->{ucs} >= 0x080)
        {
-               next;
+               $i->{code} = $i->{code} | 0x8f8080;
        }
-       ($s, $c, $u, $rest) = split;
-       $ucs  = hex($u);
-       $code = hex($c);
-       if ($code >= 0x80 && $ucs >= 0x0080)
+       else
        {
-               $utf = &ucs2utf($ucs);
-               if ($array{$utf} ne "")
-               {
-                       printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
-                       next;
-               }
-               $count++;
-
-               $array{$utf} = ($code | 0x8080);
+               next;
        }
+
+       push @mapping, $i;
 }
-close(FILE);
 
-#
-# JIS0212
-#
-$in_file = "JIS0212.TXT";
+# Load CP932.TXT.
+my $ct932 = &read_source("CP932.TXT");
 
-open(FILE, $in_file) || die("cannot open $in_file");
+foreach my $i (@$ct932) {
+       my $sjis = $i->{code};
 
-while (<FILE>)
-{
-       chop;
-       if (/^#/)
+       # We have a different mapping for this in the EUC_JP to UTF-8 direction.
+       if ($sjis == 0xeefa ||
+               $sjis == 0xeefb ||
+               $sjis == 0xeefc)
        {
                next;
        }
-       ($c, $u, $rest) = split;
-       $ucs  = hex($u);
-       $code = hex($c);
-       if ($code >= 0x80 && $ucs >= 0x0080)
-       {
-               $utf = &ucs2utf($ucs);
-               if ($array{$utf} ne "")
-               {
-                       printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
-                       next;
-               }
-               $count++;
 
-               $array{$utf} = ($code | 0x8f8080);
-       }
-}
-close(FILE);
+       if ($sjis >= 0xa1)
+       {
+               my $jis = &sjis2jis($sjis);
 
-$file = "utf8_to_euc_jp.map";
-open(FILE, "> $file") || die("cannot open $file");
+               $i->{code} = $jis | ($jis < 0x100 ? 0x8e00 :
+                                                        ($sjis >= 0xeffd  ? 0x8f8080 : 0x8080));
 
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_utf_to_local ULmapEUC_JP[ $count ] = {\n";
+               # Remember the SJIS code for later.
+               $i->{sjis} = $sjis;
 
-for $index (sort { $a <=> $b } keys(%array))
-{
-       $code = $array{$index};
-       $count--;
-       if ($count == 0)
-       {
-               printf FILE "  {0x%04x, 0x%04x}\n", $index, $code;
-       }
-       else
-       {
-               printf FILE "  {0x%04x, 0x%04x},\n", $index, $code;
+               push @mapping, $i;
        }
 }
 
-print FILE "};\n";
-close(FILE);
-
-#
-# then generate EUC_JP --> UTF8 table
-#
+foreach my $i (@mapping) {
+       my $sjis = $i->{sjis};
 
-#
-# JIS0201
-#
-$in_file = "JIS0201.TXT";
-
-open(FILE, $in_file) || die("cannot open $in_file");
-
-reset 'array';
-
-while (<FILE>)
-{
-       chop;
-       if (/^#/)
+       # These SJIS characters are excluded completely.
+       if ($sjis >= 0xed00 && $sjis <= 0xeef9 ||
+               $sjis >= 0xfa54 && $sjis <= 0xfa56 ||
+               $sjis >= 0xfa58 && $sjis <= 0xfc4b)
        {
+               $i->{direction} = "none";
                next;
        }
-       ($c, $u, $rest) = split;
-       $ucs  = hex($u);
-       $code = hex($c);
-       if ($code >= 0x80 && $ucs >= 0x0080)
-       {
-               $utf = &ucs2utf($ucs);
-               if ($array{$code} ne "")
-               {
-                       printf STDERR "Warning: duplicate code: %04x\n", $ucs;
-                       next;
-               }
-               $count++;
-
-               # add single shift 2
-               $code |= 0x8e00;
-               $array{$code} = $utf;
-       }
-}
-close(FILE);
-
-#
-# JIS0208
-#
-$in_file = "JIS0208.TXT";
-
-open(FILE, $in_file) || die("cannot open $in_file");
 
-while (<FILE>)
-{
-       chop;
-       if (/^#/)
+       # These SJIS characters are only in the UTF-8 to EUC_JP table
+       if ($sjis == 0xeefa || $sjis == 0xeefb || $sjis == 0xeefc)
        {
+               $i->{direction} = "from_unicode";
                next;
        }
-       ($s, $c, $u, $rest) = split;
-       $ucs  = hex($u);
-       $code = hex($c);
-       if ($code >= 0x80 && $ucs >= 0x0080)
-       {
-               $utf = &ucs2utf($ucs);
-               if ($array{$code} ne "")
-               {
-                       printf STDERR "Warning: duplicate code: %04x\n", $ucs;
-                       next;
-               }
-               $count++;
 
-               $code |= 0x8080;
-               $array{$code} = $utf;
+       if ($sjis == 0x8790 || $sjis == 0x8791 || $sjis == 0x8792 ||
+               $sjis == 0x8795 || $sjis == 0x8796 || $sjis == 0x8797 ||
+               $sjis == 0x879a || $sjis == 0x879b || $sjis == 0x879c ||
+               ($sjis >= 0xfa4a && $sjis <= 0xfa53))
+       {
+               $i->{direction} = "to_unicode";
+               next;
        }
 }
-close(FILE);
 
-#
-# JIS0212
-#
-$in_file = "JIS0212.TXT";
+push @mapping, (
+        {direction => 'both', ucs => 0x4efc, code => 0x8ff4af, comment => '# CJK(4EFC)'},
+        {direction => 'both', ucs => 0x50f4, code => 0x8ff4b0, comment => '# CJK(50F4)'},
+        {direction => 'both', ucs => 0x51EC, code => 0x8ff4b1, comment => '# CJK(51EC)'},
+        {direction => 'both', ucs => 0x5307, code => 0x8ff4b2, comment => '# CJK(5307)'},
+        {direction => 'both', ucs => 0x5324, code => 0x8ff4b3, comment => '# CJK(5324)'},
+        {direction => 'both', ucs => 0x548A, code => 0x8ff4b5, comment => '# CJK(548A)'},
+        {direction => 'both', ucs => 0x5759, code => 0x8ff4b6, comment => '# CJK(5759)'},
+        {direction => 'both', ucs => 0x589E, code => 0x8ff4b9, comment => '# CJK(589E)'},
+        {direction => 'both', ucs => 0x5BEC, code => 0x8ff4ba, comment => '# CJK(5BEC)'},
+        {direction => 'both', ucs => 0x5CF5, code => 0x8ff4bb, comment => '# CJK(5CF5)'},
+        {direction => 'both', ucs => 0x5D53, code => 0x8ff4bc, comment => '# CJK(5D53)'},
+        {direction => 'both', ucs => 0x5FB7, code => 0x8ff4be, comment => '# CJK(5FB7)'},
+        {direction => 'both', ucs => 0x6085, code => 0x8ff4bf, comment => '# CJK(6085)'},
+        {direction => 'both', ucs => 0x6120, code => 0x8ff4c0, comment => '# CJK(6120)'},
+        {direction => 'both', ucs => 0x654E, code => 0x8ff4c1, comment => '# CJK(654E)'},
+        {direction => 'both', ucs => 0x663B, code => 0x8ff4c2, comment => '# CJK(663B)'},
+        {direction => 'both', ucs => 0x6665, code => 0x8ff4c3, comment => '# CJK(6665)'},
+        {direction => 'both', ucs => 0x6801, code => 0x8ff4c6, comment => '# CJK(6801)'},
+        {direction => 'both', ucs => 0x6A6B, code => 0x8ff4c9, comment => '# CJK(6A6B)'},
+        {direction => 'both', ucs => 0x6AE2, code => 0x8ff4ca, comment => '# CJK(6AE2)'},
+        {direction => 'both', ucs => 0x6DF2, code => 0x8ff4cc, comment => '# CJK(6DF2)'},
+        {direction => 'both', ucs => 0x6DF8, code => 0x8ff4cb, comment => '# CJK(6DF8)'},
+        {direction => 'both', ucs => 0x7028, code => 0x8ff4cd, comment => '# CJK(7028)'},
+        {direction => 'both', ucs => 0x70BB, code => 0x8ff4ae, comment => '# CJK(70BB)'},
+        {direction => 'both', ucs => 0x7501, code => 0x8ff4d0, comment => '# CJK(7501)'},
+        {direction => 'both', ucs => 0x7682, code => 0x8ff4d1, comment => '# CJK(7682)'},
+        {direction => 'both', ucs => 0x769E, code => 0x8ff4d2, comment => '# CJK(769E)'},
+        {direction => 'both', ucs => 0x7930, code => 0x8ff4d4, comment => '# CJK(7930)'},
+        {direction => 'both', ucs => 0x7AE7, code => 0x8ff4d9, comment => '# CJK(7AE7)'},
+        {direction => 'both', ucs => 0x7DA0, code => 0x8ff4dc, comment => '# CJK(7DA0)'},
+        {direction => 'both', ucs => 0x7DD6, code => 0x8ff4dd, comment => '# CJK(7DD6)'},
+        {direction => 'both', ucs => 0x8362, code => 0x8ff4df, comment => '# CJK(8362)'},
+        {direction => 'both', ucs => 0x85B0, code => 0x8ff4e1, comment => '# CJK(85B0)'},
+        {direction => 'both', ucs => 0x8807, code => 0x8ff4e4, comment => '# CJK(8807)'},
+        {direction => 'both', ucs => 0x8B7F, code => 0x8ff4e6, comment => '# CJK(8B7F)'},
+        {direction => 'both', ucs => 0x8CF4, code => 0x8ff4e7, comment => '# CJK(8CF4)'},
+        {direction => 'both', ucs => 0x8D76, code => 0x8ff4e8, comment => '# CJK(8D76)'},
+        {direction => 'both', ucs => 0x90DE, code => 0x8ff4ec, comment => '# CJK(90DE)'},
+        {direction => 'both', ucs => 0x9115, code => 0x8ff4ee, comment => '# CJK(9115)'},
+        {direction => 'both', ucs => 0x9592, code => 0x8ff4f1, comment => '# CJK(9592)'},
+        {direction => 'both', ucs => 0x973B, code => 0x8ff4f4, comment => '# CJK(973B)'},
+        {direction => 'both', ucs => 0x974D, code => 0x8ff4f5, comment => '# CJK(974D)'},
+        {direction => 'both', ucs => 0x9751, code => 0x8ff4f6, comment => '# CJK(9751)'},
+        {direction => 'both', ucs => 0x999E, code => 0x8ff4fa, comment => '# CJK(999E)'},
+        {direction => 'both', ucs => 0x9AD9, code => 0x8ff4fb, comment => '# CJK(9AD9)'},
+        {direction => 'both', ucs => 0x9B72, code => 0x8ff4fc, comment => '# CJK(9B72)'},
+        {direction => 'both', ucs => 0x9ED1, code => 0x8ff4fe, comment => '# CJK(9ED1)'},
+        {direction => 'both', ucs => 0xF929, code => 0x8ff4c5, comment => '# CJK COMPATIBILITY IDEOGRAPH-F929'},
+        {direction => 'both', ucs => 0xF9DC, code => 0x8ff4f2, comment => '# CJK COMPATIBILITY IDEOGRAPH-F9DC'},
+        {direction => 'both', ucs => 0xFA0E, code => 0x8ff4b4, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA0E'},
+        {direction => 'both', ucs => 0xFA0F, code => 0x8ff4b7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA0F'},
+        {direction => 'both', ucs => 0xFA10, code => 0x8ff4b8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA10'},
+        {direction => 'both', ucs => 0xFA11, code => 0x8ff4bd, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA11'},
+        {direction => 'both', ucs => 0xFA12, code => 0x8ff4c4, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA12'},
+        {direction => 'both', ucs => 0xFA13, code => 0x8ff4c7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA13'},
+        {direction => 'both', ucs => 0xFA14, code => 0x8ff4c8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA14'},
+        {direction => 'both', ucs => 0xFA15, code => 0x8ff4ce, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA15'},
+        {direction => 'both', ucs => 0xFA16, code => 0x8ff4cf, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA16'},
+        {direction => 'both', ucs => 0xFA17, code => 0x8ff4d3, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA17'},
+        {direction => 'both', ucs => 0xFA18, code => 0x8ff4d5, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA18'},
+        {direction => 'both', ucs => 0xFA19, code => 0x8ff4d6, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA19'},
+        {direction => 'both', ucs => 0xFA1A, code => 0x8ff4d7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1A'},
+        {direction => 'both', ucs => 0xFA1B, code => 0x8ff4d8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1B'},
+        {direction => 'both', ucs => 0xFA1C, code => 0x8ff4da, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1C'},
+        {direction => 'both', ucs => 0xFA1D, code => 0x8ff4db, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1D'},
+        {direction => 'both', ucs => 0xFA1E, code => 0x8ff4de, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1E'},
+        {direction => 'both', ucs => 0xFA1F, code => 0x8ff4e0, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1F'},
+        {direction => 'both', ucs => 0xFA20, code => 0x8ff4e2, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA20'},
+        {direction => 'both', ucs => 0xFA21, code => 0x8ff4e3, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA21'},
+        {direction => 'both', ucs => 0xFA22, code => 0x8ff4e5, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA22'},
+        {direction => 'both', ucs => 0xFA23, code => 0x8ff4e9, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA23'},
+        {direction => 'both', ucs => 0xFA24, code => 0x8ff4ea, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA24'},
+        {direction => 'both', ucs => 0xFA25, code => 0x8ff4eb, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA25'},
+        {direction => 'both', ucs => 0xFA26, code => 0x8ff4ed, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA26'},
+        {direction => 'both', ucs => 0xFA27, code => 0x8ff4ef, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA27'},
+        {direction => 'both', ucs => 0xFA28, code => 0x8ff4f0, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA28'},
+        {direction => 'both', ucs => 0xFA29, code => 0x8ff4f3, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA29'},
+        {direction => 'both', ucs => 0xFA2A, code => 0x8ff4f7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2A'},
+        {direction => 'both', ucs => 0xFA2B, code => 0x8ff4f8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2B'},
+        {direction => 'both', ucs => 0xFA2C, code => 0x8ff4f9, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2C'},
+        {direction => 'both', ucs => 0xFA2D, code => 0x8ff4fd, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2D'},
+        {direction => 'both', ucs => 0xFF07, code => 0x8ff4a9, comment => '# FULLWIDTH APOSTROPHE'},
+        {direction => 'both', ucs => 0xFFE4, code => 0x8fa2c3, comment => '# FULLWIDTH BROKEN BAR'},
+
+        # additional conversions for EUC_JP -> UTF-8 conversion
+        {direction => 'to_unicode', ucs => 0x2116, code => 0x8ff4ac, comment => '# NUMERO SIGN'},
+        {direction => 'to_unicode', ucs => 0x2121, code => 0x8ff4ad, comment => '# TELEPHONE SIGN'},
+        {direction => 'to_unicode', ucs => 0x3231, code => 0x8ff4ab, comment => '# PARENTHESIZED IDEOGRAPH STOCK'}
+       );
+
+print_tables("EUC_JP", \@mapping);
+
+#######################################################################
+# sjis2jis ; SJIS => JIS conversion
+sub sjis2jis
+{
+       my ($sjis) = @_;
 
-open(FILE, $in_file) || die("cannot open $in_file");
+       return $sjis if ($sjis <= 0x100);
 
-while (<FILE>)
-{
-       chop;
-       if (/^#/)
-       {
-               next;
-       }
-       ($c, $u, $rest) = split;
-       $ucs  = hex($u);
-       $code = hex($c);
-       if ($code >= 0x80 && $ucs >= 0x0080)
+       my $hi = $sjis >> 8;
+       my $lo = $sjis & 0xff;
+
+       if ($lo >= 0x80) { $lo--; }
+       $lo -= 0x40;
+       if ($hi >= 0xe0) { $hi -= 0x40; }
+       $hi -= 0x81;
+       my $pos = $lo + $hi * 0xbc;
+
+       if ($pos >= 114 * 0x5e && $pos <= 115 * 0x5e + 0x1b)
        {
-               $utf = &ucs2utf($ucs);
-               if ($array{$code} ne "")
-               {
-                       printf STDERR "Warning: duplicate code: %04x\n", $ucs;
-                       next;
-               }
-               $count++;
+               # This region (115-ku) is out of range of JIS code, but for
+               # convenience when generating codes in EUC CODESET 3, move it
+               # to the seemingly duplicate region (83-84-ku).
+               $pos = $pos - ((31 * 0x5e) + 12);
 
-               $code |= 0x8f8080;
-               $array{$code} = $utf;
+               # positions from 85-ku 82-ten onward need to be moved back by 2 codepoints
+               $pos = $pos - 2 if ($pos >= 84 * 0x5c + 82)
        }
-}
-close(FILE);
 
-$file = "euc_jp_to_utf8.map";
-open(FILE, "> $file") || die("cannot open $file");
+       my $hi2 = $pos / 0x5e;
+       my $lo2 = ($pos % 0x5e);
 
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_local_to_utf LUmapEUC_JP[ $count ] = {\n";
-for $index (sort { $a <=> $b } keys(%array))
-{
-       $utf = $array{$index};
-       $count--;
-       if ($count == 0)
-       {
-               printf FILE "  {0x%04x, 0x%04x}\n", $index, $utf;
-       }
-       else
-       {
-               printf FILE "  {0x%04x, 0x%04x},\n", $index, $utf;
-       }
-}
+       my $ret = $lo2 + 0x21 + (($hi2 + 0x21) << 8);
 
-print FILE "};\n";
-close(FILE);
+       return $ret;
+}
index a7c94bca915c91279be0bfafafe8343c2d346e54..a917d067172a25c53fad45a840b2f1f792e68474 100755 (executable)
 #               UCS-2 code in hex
 #               # and Unicode name (not used in this script)
 
-require "ucs2utf.pl";
+require "convutils.pm";
 
-# first generate UTF-8 --> EUC_KR table
+# Load the source file.
 
-$in_file = "KSX1001.TXT";
+my $mapping = &read_source("KSX1001.TXT");
 
-open(FILE, $in_file) || die("cannot open $in_file");
-
-while (<FILE>)
-{
-       chop;
-       if (/^#/)
-       {
-               next;
-       }
-       ($c, $u, $rest) = split;
-       $ucs  = hex($u);
-       $code = hex($c);
-       if ($code >= 0x80 && $ucs >= 0x0080)
-       {
-               $utf = &ucs2utf($ucs);
-               if ($array{$utf} ne "")
-               {
-                       printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
-                       next;
-               }
-               $count++;
-
-               $array{$utf} = ($code | 0x8080);
-       }
-}
-close(FILE);
-
-$file = "utf8_to_euc_kr.map";
-open(FILE, "> $file") || die("cannot open $file");
-
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_utf_to_local ULmapEUC_KR[ $count ] = {\n";
-
-for $index (sort { $a <=> $b } keys(%array))
+foreach my $i (@$mapping)
 {
-       $code = $array{$index};
-       $count--;
-       if ($count == 0)
-       {
-               printf FILE "  {0x%04x, 0x%04x}\n", $index, $code;
-       }
-       else
-       {
-               printf FILE "  {0x%04x, 0x%04x},\n", $index, $code;
-       }
+       $i->{code} = $i->{code} | 0x8080;
 }
 
-print FILE "};\n";
-close(FILE);
-
-#
-# then generate EUC_KR --> UTF8 table
-#
-reset 'array';
-
-open(FILE, $in_file) || die("cannot open $in_file");
-
-while (<FILE>)
-{
-       chop;
-       if (/^#/)
-       {
-               next;
-       }
-       ($c, $u, $rest) = split;
-       $ucs  = hex($u);
-       $code = hex($c);
-       if ($code >= 0x80 && $ucs >= 0x0080)
-       {
-               $utf = &ucs2utf($ucs);
-               if ($array{$code} ne "")
-               {
-                       printf STDERR "Warning: duplicate code: %04x\n", $ucs;
-                       next;
-               }
-               $count++;
-
-               $code |= 0x8080;
-               $array{$code} = $utf;
-       }
-}
-close(FILE);
-
-$file = "euc_kr_to_utf8.map";
-open(FILE, "> $file") || die("cannot open $file");
-
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_local_to_utf LUmapEUC_KR[ $count ] = {\n";
-for $index (sort { $a <=> $b } keys(%array))
-{
-       $utf = $array{$index};
-       $count--;
-       if ($count == 0)
-       {
-               printf FILE "  {0x%04x, 0x%04x}\n", $index, $utf;
-       }
-       else
-       {
-               printf FILE "  {0x%04x, 0x%04x},\n", $index, $utf;
-       }
-}
+# Some extra characters that are not in KSX1001.TXT
+push @$mapping, (
+       {direction => 'both', ucs => 0x20AC, code => 0xa2e6, comment => '# EURO SIGN'},
+       {direction => 'both', ucs => 0x00AE, code => 0xa2e7, comment => '# REGISTERED SIGN'},
+       {direction => 'both', ucs => 0x327E, code => 0xa2e8, comment => '# CIRCLED HANGUL IEUNG U'}
+       );
 
-print FILE "};\n";
-close(FILE);
+print_tables("EUC_KR", $mapping);
index e4fc535b1800031bd42c06357eef1c2d812942f5..aceef5433c28bd59590c164ec58019893957cbc1 100755 (executable)
 #               UCS-2 code in hex
 #               # and Unicode name (not used in this script)
 
-require "ucs2utf.pl";
+require "convutils.pm";
 
-# first generate UTF-8 --> EUC_TW table
+my $mapping = &read_source("CNS11643.TXT");
 
-$in_file = "CNS11643.TXT";
+my @extras;
 
-open(FILE, $in_file) || die("cannot open $in_file");
-
-while (<FILE>)
+foreach my $i (@$mapping)
 {
-       chop;
-       if (/^#/)
+       my $ucs = $i->{ucs};
+       my $code = $i->{code};
+       my $origcode = $i->{code};
+
+       my $plane = ($code & 0x1f0000) >> 16;
+       if ($plane > 16)
        {
+               printf STDERR "Warning: invalid plane No.$plane. ignored\n";
                next;
        }
-       ($c, $u, $rest) = split;
-       $ucs  = hex($u);
-       $code = hex($c);
-       if ($code >= 0x80 && $ucs >= 0x0080)
-       {
-               $utf = &ucs2utf($ucs);
-               if ($array{$utf} ne "")
-               {
-                       printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
-                       next;
-               }
-               $count++;
-
-               $plane = ($code & 0x1f0000) >> 16;
-               if ($plane > 16)
-               {
-                       printf STDERR "Warning: invalid plane No.$plane. ignored\n";
-                       next;
-               }
-
-               if ($plane == 1)
-               {
-                       $array{$utf} = (($code & 0xffff) | 0x8080);
-               }
-               else
-               {
-                       $array{$utf} =
-                         (0x8ea00000 + ($plane << 16)) | (($code & 0xffff) | 0x8080);
-               }
-       }
-}
-close(FILE);
-
-$file = "utf8_to_euc_tw.map";
-open(FILE, "> $file") || die("cannot open $file");
 
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_utf_to_local ULmapEUC_TW[ $count ] = {\n";
-
-for $index (sort { $a <=> $b } keys(%array))
-{
-       $code = $array{$index};
-       $count--;
-       if ($count == 0)
+       if ($plane == 1)
        {
-               printf FILE "  {0x%04x, 0x%04x}\n", $index, $code;
+               $code = ($code & 0xffff) | 0x8080;
        }
        else
        {
-               printf FILE "  {0x%04x, 0x%04x},\n", $index, $code;
+               $code = (0x8ea00000 + ($plane << 16)) | (($code & 0xffff) | 0x8080);
        }
-}
-
-print FILE "};\n";
-close(FILE);
-
-#
-# then generate EUC_TW --> UTF8 table
-#
-reset 'array';
-
-open(FILE, $in_file) || die("cannot open $in_file");
+       $i->{code} = $code;
 
-while (<FILE>)
-{
-       chop;
-       if (/^#/)
+       # Some codes are mapped twice in the EUC_TW to UTF-8 table.
+       if ($origcode >= 0x12121 && $origcode <= 0x20000)
        {
-               next;
-       }
-       ($c, $u, $rest) = split;
-       $ucs  = hex($u);
-       $code = hex($c);
-       if ($code >= 0x80 && $ucs >= 0x0080)
-       {
-               $utf = &ucs2utf($ucs);
-               if ($array{$code} ne "")
-               {
-                       printf STDERR "Warning: duplicate code: %04x\n", $ucs;
-                       next;
-               }
-               $count++;
-
-               $plane = ($code & 0x1f0000) >> 16;
-               if ($plane > 16)
-               {
-                       printf STDERR "Warning: invalid plane No.$plane. ignored\n";
-                       next;
-               }
-
-               if ($plane == 1)
-               {
-                       $c = (($code & 0xffff) | 0x8080);
-                       $array{$c} = $utf;
-                       $count++;
+               push @extras, {
+                       ucs => $i->{ucs},
+                       code => ($i->{code} + 0x8ea10000),
+                       rest => $i->{rest},
+                       direction => 'to_unicode'
                }
-               $c = (0x8ea00000 + ($plane << 16)) | (($code & 0xffff) | 0x8080);
-               $array{$c} = $utf;
        }
 }
-close(FILE);
-
-$file = "euc_tw_to_utf8.map";
-open(FILE, "> $file") || die("cannot open $file");
 
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_local_to_utf LUmapEUC_TW[ $count ] = {\n";
-for $index (sort { $a <=> $b } keys(%array))
-{
-       $utf = $array{$index};
-       $count--;
-       if ($count == 0)
-       {
-               printf FILE "  {0x%04x, 0x%04x}\n", $index, $utf;
-       }
-       else
-       {
-               printf FILE "  {0x%04x, 0x%04x},\n", $index, $utf;
-       }
-}
+push @$mapping, @extras;
 
-print FILE "};\n";
-close(FILE);
+print_tables("EUC_TW", $mapping);
index 043c1c27ec8f6bccc85c4d759a47449784172221..f58361024e4c67e375b25a11cf90728fb36cbdb8 100755 (executable)
@@ -13,8 +13,7 @@
 # where the "u" field is the Unicode code point in hex,
 # and the "b" field is the hex byte sequence for GB18030
 
-require "ucs2utf.pl";
-
+require "convutils.pm";
 
 # Read the input
 
@@ -22,6 +21,8 @@ $in_file = "gb-18030-2000.xml";
 
 open(FILE, $in_file) || die("cannot open $in_file");
 
+my @mapping;
+
 while (<FILE>)
 {
        next if (!m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/);
@@ -32,78 +33,13 @@ while (<FILE>)
        $code = hex($c);
        if ($code >= 0x80 && $ucs >= 0x0080)
        {
-               $utf = &ucs2utf($ucs);
-               if ($arrayu{$utf} ne "")
-               {
-                       printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
-                       next;
+               push @mapping, {
+                       ucs => $ucs,
+                       code => $code,
+                       direction => 'both'
                }
-               if ($arrayc{$code} ne "")
-               {
-                       printf STDERR "Warning: duplicate GB18030: %08x\n", $code;
-                       next;
-               }
-               $arrayu{$utf}  = $code;
-               $arrayc{$code} = $utf;
-               $count++;
-       }
-}
-close(FILE);
-
-
-#
-# first, generate UTF8 --> GB18030 table
-#
-
-$file = "utf8_to_gb18030.map";
-open(FILE, "> $file") || die("cannot open $file");
-
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_utf_to_local ULmapGB18030[ $count ] = {\n";
-
-$cc = $count;
-for $index (sort { $a <=> $b } keys(%arrayu))
-{
-       $code = $arrayu{$index};
-       $cc--;
-       if ($cc == 0)
-       {
-               printf FILE "  {0x%04x, 0x%04x}\n", $index, $code;
-       }
-       else
-       {
-               printf FILE "  {0x%04x, 0x%04x},\n", $index, $code;
        }
 }
-
-print FILE "};\n";
 close(FILE);
 
-
-#
-# then generate GB18030 --> UTF8 table
-#
-
-$file = "gb18030_to_utf8.map";
-open(FILE, "> $file") || die("cannot open $file");
-
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_local_to_utf LUmapGB18030[ $count ] = {\n";
-
-$cc = $count;
-for $index (sort { $a <=> $b } keys(%arrayc))
-{
-       $utf = $arrayc{$index};
-       $cc--;
-       if ($cc == 0)
-       {
-               printf FILE "  {0x%04x, 0x%04x}\n", $index, $utf;
-       }
-       else
-       {
-               printf FILE "  {0x%04x, 0x%04x},\n", $index, $utf;
-       }
-}
-
-print FILE "};\n";
-close(FILE);
+print_tables("GB18030", \@mapping);
diff --git a/src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl b/src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl
new file mode 100755 (executable)
index 0000000..b98f9a7
--- /dev/null
@@ -0,0 +1,31 @@
+#! /usr/bin/perl
+#
+# Copyright (c) 2001-2016, PostgreSQL Global Development Group
+#
+# src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl
+#
+# Generate UTF-8 <--> JOHAB conversion tables from
+# map files provided by Unicode organization.
+# Unfortunately it is prohibited by the organization
+# to distribute the map files. So if you try to use this script,
+# you have to obtain the map files from the organization's ftp site.
+# ftp://www.unicode.org/Public/MAPPINGS/
+# We assume the file includes three tab-separated columns:
+#               JOHAB code in hex
+#               UCS-2 code in hex
+#               # and Unicode name (not used in this script)
+
+require "convutils.pm";
+
+# Load the source file.
+
+my $mapping = &read_source("JOHAB.TXT");
+
+# Some extra characters that are not in JOHAB.TXT
+push @$mapping, (
+       {direction => 'both', ucs => 0x20AC, code => 0xd9e6, comment => '# EURO SIGN'},
+       {direction => 'both', ucs => 0x00AE, code => 0xd9e7, comment => '# REGISTERED SIGN'},
+       {direction => 'both', ucs => 0x327E, code => 0xd9e8, comment => '# CIRCLED HANGUL IEUNG U'}
+       );
+
+print_tables("JOHAB", $mapping);
index 51ffd86b2c96a0047a2ff043e3c8615c1d8373a9..16a53ad1d9fa74d46f80f61de5c178db13efa775 100755 (executable)
@@ -7,7 +7,7 @@
 # Generate UTF-8 <--> SHIFT_JIS_2004 code conversion tables from
 # "sjis-0213-2004-std.txt" (http://x0213.org)
 
-require "ucs2utf.pl";
+require "convutils.pm";
 
 # first generate UTF-8 --> SHIFT_JIS_2004 table
 
@@ -15,10 +15,7 @@ $in_file = "sjis-0213-2004-std.txt";
 
 open(FILE, $in_file) || die("cannot open $in_file");
 
-reset 'array';
-reset 'array1';
-reset 'comment';
-reset 'comment1';
+my @mapping;
 
 while ($line = <FILE>)
 {
@@ -29,14 +26,16 @@ while ($line = <FILE>)
                $u2             = $3;
                $rest           = "U+" . $u1 . "+" . $u2 . $4;
                $code           = hex($c);
-               $ucs            = hex($u1);
-               $utf1           = &ucs2utf($ucs);
-               $ucs            = hex($u2);
-               $utf2           = &ucs2utf($ucs);
-               $str            = sprintf "%08x%08x", $utf1, $utf2;
-               $array1{$str}   = $code;
-               $comment1{$str} = $rest;
-               $count1++;
+               $ucs1           = hex($u1);
+               $ucs2           = hex($u2);
+
+               push @mapping, {
+                       code => $code,
+                       ucs => $ucs1,
+                       ucs_second => $ucs2,
+                       comment => $rest,
+                       direction => 'both'
+               };
                next;
        }
        elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
@@ -52,183 +51,31 @@ while ($line = <FILE>)
 
        $ucs  = hex($u);
        $code = hex($c);
-       $utf  = &ucs2utf($ucs);
-       if ($array{$utf} ne "")
-       {
-               printf STDERR
-                 "Warning: duplicate UTF8: %08x UCS: %04x Shift JIS: %04x\n", $utf,
-                 $ucs, $code;
-               next;
-       }
-       $count++;
 
-       $array{$utf}    = $code;
-       $comment{$code} = $rest;
-}
-close(FILE);
-
-$file = "utf8_to_shift_jis_2004.map";
-open(FILE, "> $file") || die("cannot open $file");
-print FILE "/*\n";
-print FILE " * This file was generated by UCS_to_SHIFT_JIS_2004.pl\n";
-print FILE " */\n";
-print FILE "static const pg_utf_to_local ULmapSHIFT_JIS_2004[] = {\n";
-
-for $index (sort { $a <=> $b } keys(%array))
-{
-       $code = $array{$index};
-       $count--;
-       if ($count == 0)
-       {
-               printf FILE "  {0x%08x, 0x%06x} /* %s */\n", $index, $code,
-                 $comment{$code};
-       }
-       else
+       if ($code < 0x80 && $ucs < 0x80)
        {
-               printf FILE "  {0x%08x, 0x%06x},        /* %s */\n", $index, $code,
-                 $comment{$code};
-       }
-}
-
-print FILE "};\n";
-close(FILE);
-
-$file = "utf8_to_shift_jis_2004_combined.map";
-open(FILE, "> $file") || die("cannot open $file");
-print FILE "/*\n";
-print FILE " * This file was generated by UCS_to_SHIFT_JIS_2004.pl\n";
-print FILE " */\n";
-print FILE
-"static const pg_utf_to_local_combined ULmapSHIFT_JIS_2004_combined[] = {\n";
-
-for $index (sort { $a cmp $b } keys(%array1))
-{
-       $code = $array1{$index};
-       $count1--;
-       if ($count1 == 0)
-       {
-               printf FILE "  {0x%s, 0x%s, 0x%04x}     /* %s */\n", substr($index, 0, 8),
-                 substr($index, 8, 8), $code, $comment1{$index};
-       }
-       else
-       {
-               printf FILE "  {0x%s, 0x%s, 0x%04x},    /* %s */\n",
-                 substr($index, 0, 8), substr($index, 8, 8), $code,
-                 $comment1{$index};
-       }
-}
-
-print FILE "};\n";
-close(FILE);
-
-# then generate SHIFT_JIS_2004 --> UTF-8 table
-
-$in_file = "sjis-0213-2004-std.txt";
-
-open(FILE, $in_file) || die("cannot open $in_file");
-
-reset 'array';
-reset 'array1';
-reset 'comment';
-reset 'comment1';
-
-while ($line = <FILE>)
-{
-       if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/)
-       {
-               $c               = $1;
-               $u1              = $2;
-               $u2              = $3;
-               $rest            = "U+" . $u1 . "+" . $u2 . $4;
-               $code            = hex($c);
-               $ucs             = hex($u1);
-               $utf1            = &ucs2utf($ucs);
-               $ucs             = hex($u2);
-               $utf2            = &ucs2utf($ucs);
-               $str             = sprintf "%08x%08x", $utf1, $utf2;
-               $array1{$code}   = $str;
-               $comment1{$code} = $rest;
-               $count1++;
                next;
        }
-       elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
+       elsif ($code < 0x80)
        {
-               $c    = $1;
-               $u    = $2;
-               $rest = "U+" . $u . $3;
+               $direction = 'from_unicode';
        }
-       else
-       {
-               next;
-       }
-
-       $ucs  = hex($u);
-       $code = hex($c);
-       $utf  = &ucs2utf($ucs);
-       if ($array{$code} ne "")
-       {
-               printf STDERR
-                 "Warning: duplicate UTF8: %08x UCS: %04x Shift JIS: %04x\n", $utf,
-                 $ucs, $code;
-               printf STDERR "Previous value: UTF8: %08x\n", $array{$utf};
-               next;
-       }
-       $count++;
-
-       $array{$code}  = $utf;
-       $comment{$utf} = $rest;
-}
-close(FILE);
-
-$file = "shift_jis_2004_to_utf8.map";
-open(FILE, "> $file") || die("cannot open $file");
-print FILE "/*\n";
-print FILE " * This file was generated by UCS_to_SHIFTJIS_2004.pl\n";
-print FILE " */\n";
-print FILE "static const pg_local_to_utf LUmapSHIFT_JIS_2004[] = {\n";
-
-for $index (sort { $a <=> $b } keys(%array))
-{
-       $code = $array{$index};
-       $count--;
-       if ($count == 0)
+       elsif ($ucs < 0x80)
        {
-               printf FILE "  {0x%04x, 0x%08x} /* %s */\n", $index, $code,
-                 $comment{$code};
+               $direction = 'to_unicode';
        }
        else
        {
-               printf FILE "  {0x%04x, 0x%08x},        /* %s */\n", $index, $code,
-                 $comment{$code};
+               $direction = 'both';
        }
-}
-
-print FILE "};\n";
-close(FILE);
-
-$file = "shift_jis_2004_to_utf8_combined.map";
-open(FILE, "> $file") || die("cannot open $file");
-print FILE "/*\n";
-print FILE " * This file was generated by UCS_to_SHIFT_JIS_2004.pl\n";
-print FILE " */\n";
-print FILE
-"static const pg_local_to_utf_combined LUmapSHIFT_JIS_2004_combined[] = {\n";
 
-for $index (sort { $a <=> $b } keys(%array1))
-{
-       $code = $array1{$index};
-       $count1--;
-       if ($count1 == 0)
-       {
-               printf FILE "  {0x%04x, 0x%s, 0x%s}     /* %s */\n", $index,
-                 substr($code, 0, 8), substr($code, 8, 8), $comment1{$index};
-       }
-       else
-       {
-               printf FILE "  {0x%04x, 0x%s, 0x%s},    /* %s */\n", $index,
-                 substr($code, 0, 8), substr($code, 8, 8), $comment1{$index};
-       }
+       push @mapping, {
+               code => $code,
+               ucs => $ucs,
+               comment => $rest,
+               direction => $direction
+       };
 }
-
-print FILE "};\n";
 close(FILE);
+
+print_tables("SHIFT_JIS_2004", \@mapping, 1);
index 10e54b157d266199cd8b2a9ebfdcdaa712ae3217..c8ff712af8fd279349da855752bb848ee6101a6e 100755 (executable)
 #
 # src/backend/utils/mb/Unicode/UCS_to_SJIS.pl
 #
-# Generate UTF-8 <--> SJIS code conversion tables from
-# map files provided by Unicode organization.
-# Unfortunately it is prohibited by the organization
-# to distribute the map files. So if you try to use this script,
-# you have to obtain SHIFTJIS.TXT from
-# the organization's ftp site.
-#
-# SHIFTJIS.TXT format:
-#               SHIFTJIS code in hex
-#               UCS-2 code in hex
-#               # and Unicode name (not used in this script)
-# Warning: SHIFTJIS.TXT contains only JIS0201 and JIS0208. no JIS0212.
-
-require "ucs2utf.pl";
-
-# first generate UTF-8 --> SJIS table
-
-$in_file = "CP932.TXT";
-$count   = 0;
-
-open(FILE, $in_file) || die("cannot open $in_file");
-
-while (<FILE>)
-{
-       chop;
-       if (/^#/)
-       {
-               next;
-       }
-       ($c, $u, $rest) = split;
-       $ucs  = hex($u);
-       $code = hex($c);
-       if ($code >= 0x80 && $ucs >= 0x0080)
-       {
-               $utf = &ucs2utf($ucs);
-               if ((($code >= 0xed40) && ($code <= 0xeefc))
-                       || (   ($code >= 0x8754)
-                               && ($code <= 0x875d))
-                       || ($code == 0x878a)
-                       || ($code == 0x8782)
-                       || ($code == 0x8784)
-                       || ($code == 0xfa5b)
-                       || ($code == 0xfa54)
-                       || (   ($code >= 0x8790)
-                               && ($code <= 0x8792))
-                       || (   ($code >= 0x8795)
-                               && ($code <= 0x8797))
-                       || (   ($code >= 0x879a)
-                               && ($code <= 0x879c)))
-               {
-                       printf STDERR "Warning: duplicate UTF8: UCS=0x%04x SJIS=0x%04x\n",
-                         $ucs,
-                         $code;
-                       next;
-               }
-               $count++;
-               $array{$utf} = $code;
-       }
-}
+# Generate UTF-8 <=> SJIS code conversion tables from CP932.TXT.
+# Unfortunately it is prohibited by the organization to distribute the
+# map files. So if you try to use this script, you have to obtain
+# CP932.TXT from the organization's ftp site.
 
-close(FILE);
+use strict;
+require "convutils.pm";
 
-$file = "utf8_to_sjis.map";
-open(FILE, "> $file") || die("cannot open $file");
+my $charset = read_source("CP932.TXT");
 
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_utf_to_local ULmapSJIS[ $count ] = {\n";
+# Drop these SJIS codes from the source for UTF8=>SJIS conversion
+my @reject_sjis =(
+       0xed40..0xeefc, 0x8754..0x875d, 0x878a, 0x8782,
+       0x8784, 0xfa5b, 0xfa54, 0x8790..0x8792, 0x8795..0x8797,
+       0x879a..0x879c
+);
 
-for $index (sort { $a <=> $b } keys(%array))
+foreach my $i (@$charset)
 {
-       $code = $array{$index};
-       $count--;
-       if ($count == 0)
-       {
-               printf FILE "  {0x%04x, 0x%04x}\n", $index, $code;
-       }
-       else
-       {
-               printf FILE "  {0x%04x, 0x%04x},\n", $index, $code;
-       }
-}
-
-print FILE "};\n";
-close(FILE);
+       my $code = $i->{code};
+       my $ucs = $i->{ucs};
 
-#
-# then generate SJIS --> UTF8 table
-#
-
-open(FILE, $in_file) || die("cannot open $in_file");
-
-reset 'array';
-$count = 0;
-
-while (<FILE>)
-{
-       chop;
-       if (/^#/)
-       {
-               next;
-       }
-       ($c, $u, $rest) = split;
-       $ucs  = hex($u);
-       $code = hex($c);
-       if ($code >= 0x80 && $ucs >= 0x0080)
-       {
-               $utf = &ucs2utf($ucs);
-               $count++;
-
-               $array{$code} = $utf;
-       }
-}
-close(FILE);
-
-$file = "sjis_to_utf8.map";
-open(FILE, "> $file") || die("cannot open $file");
-
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_local_to_utf LUmapSJIS[ $count ] = {\n";
-for $index (sort { $a <=> $b } keys(%array))
-{
-       $utf = $array{$index};
-       $count--;
-       if ($count == 0)
-       {
-               printf FILE "  {0x%04x, 0x%04x}\n", $index, $utf;
-       }
-       else
+       if (grep {$code == $_} @reject_sjis)
        {
-               printf FILE "  {0x%04x, 0x%04x},\n", $index, $utf;
+               $i->{direction} = "to_unicode";
        }
 }
 
-print FILE "};\n";
-close(FILE);
+# Add these UTF8->SJIS pairs to the table.
+push @$charset, (
+       {direction => "from_unicode", ucs => 0x00a2,   code => 0x8191, comment => '# CENT SIGN'},
+       {direction => "from_unicode", ucs => 0x00a3,   code => 0x8192, comment => '# POUND SIGN'},
+       {direction => "from_unicode", ucs => 0x00a5,   code => 0x5c,   comment => '# YEN SIGN'},
+       {direction => "from_unicode", ucs => 0x00ac,   code => 0x81ca, comment => '# NOT SIGN'},
+       {direction => "from_unicode", ucs => 0x2016, code => 0x8161, comment => '# DOUBLE VERTICAL LINE'},
+       {direction => "from_unicode", ucs => 0x203e, code => 0x7e,   comment => '# OVERLINE'},
+       {direction => "from_unicode", ucs => 0x2212, code => 0x817c, comment => '# MINUS SIGN'},
+       {direction => "from_unicode", ucs => 0x301c, code => 0x8160, comment => '# WAVE DASH'}
+);
+
+print_tables("SJIS", $charset);
diff --git a/src/backend/utils/mb/Unicode/UCS_to_UHC.pl b/src/backend/utils/mb/Unicode/UCS_to_UHC.pl
new file mode 100755 (executable)
index 0000000..b6bf3bd
--- /dev/null
@@ -0,0 +1,51 @@
+#! /usr/bin/perl
+#
+# Copyright (c) 2007-2016, PostgreSQL Global Development Group
+#
+# src/backend/utils/mb/Unicode/UCS_to_UHC.pl
+#
+# Generate UTF-8 <--> UHC code conversion tables from
+# "windows-949-2000.xml", obtained from
+# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/
+#
+# The lines we care about in the source file look like
+#    <a u="009A" b="81 30 83 36"/>
+# where the "u" field is the Unicode code point in hex,
+# and the "b" field is the hex byte sequence for UHC
+
+require "convutils.pm";
+
+# Read the input
+
+my $in_file = "windows-949-2000.xml";
+
+# Lexical handle and three-argument open: the file name can never be
+# misread as an open mode, and the handle does not leak into the package.
+open(my $in, '<', $in_file) || die("cannot open $in_file");
+
+my @mapping;
+
+while (my $line = <$in>)
+{
+       # Only <a u="..." b="..."/> elements carry a mapping; skip all other lines.
+       next if ($line !~ m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/);
+       my ($u, $c) = ($1, $2);
+
+       # The byte sequence is space-separated; join it into one hex number.
+       $c =~ s/ //g;
+       my $ucs  = hex($u);
+       my $code = hex($c);
+
+       # Drop the single-byte codes 0x80 and 0xFF.
+       next if ($code == 0x0080 || $code == 0x00FF);
+
+       # Keep only mappings where both sides are outside the ASCII range.
+       if ($code >= 0x80 && $ucs >= 0x0080)
+       {
+               push @mapping, {
+                       ucs => $ucs,
+                       code => $code,
+                       direction => 'both'
+               }
+       }
+}
+close($in);
+
+# One extra character that's not in the source file.
+push @mapping, { direction => 'both', code => 0xa2e8, ucs => 0x327e, comment => 'CIRCLED HANGUL IEUNG U' };
+
+print_tables("UHC", \@mapping);
index 125378f149ac0ae3c3862be4d58e0318e689f12d..a3cf436eefd56708788d8ac18bb64fef67c85b9e 100755 (executable)
@@ -15,7 +15,7 @@
 #               UCS-2 code in hex
 #               # and Unicode name (not used in this script)
 
-require "ucs2utf.pl";
+require "convutils.pm";
 
 %filename = (
        'WIN866'     => 'CP866.TXT',
@@ -44,121 +44,13 @@ require "ucs2utf.pl";
        'ISO8859_16' => '8859-16.TXT',
        'KOI8R'      => 'KOI8-R.TXT',
        'KOI8U'      => 'KOI8-U.TXT',
-       'GBK'        => 'CP936.TXT',
-       'UHC'        => 'CP949.TXT',
-       'JOHAB'      => 'JOHAB.TXT',);
+       'GBK'        => 'CP936.TXT');
 
 @charsets = keys(%filename);
 @charsets = @ARGV if scalar(@ARGV);
 foreach $charset (@charsets)
 {
+       my $mapping = &read_source($filename{$charset});
 
-       #
-       # first, generate UTF8-> charset table
-       #
-       $in_file = $filename{$charset};
-
-       open(FILE, $in_file) || die("cannot open $in_file");
-
-       reset 'array';
-
-       while (<FILE>)
-       {
-               chop;
-               if (/^#/)
-               {
-                       next;
-               }
-               ($c, $u, $rest) = split;
-               $ucs  = hex($u);
-               $code = hex($c);
-               if ($code >= 0x80 && $ucs >= 0x0080)
-               {
-                       $utf = &ucs2utf($ucs);
-                       if ($array{$utf} ne "")
-                       {
-                               printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
-                               next;
-                       }
-                       $count++;
-                       $array{$utf} = $code;
-               }
-       }
-       close(FILE);
-
-       $file = lc("utf8_to_${charset}.map");
-       open(FILE, "> $file") || die("cannot open $file");
-
-       print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-       print FILE "static const pg_utf_to_local ULmap${charset}[ $count ] = {\n";
-
-       for $index (sort { $a <=> $b } keys(%array))
-       {
-               $code = $array{$index};
-               $count--;
-               if ($count == 0)
-               {
-                       printf FILE "  {0x%04x, 0x%04x}\n", $index, $code;
-               }
-               else
-               {
-                       printf FILE "  {0x%04x, 0x%04x},\n", $index, $code;
-               }
-       }
-
-       print FILE "};\n";
-       close(FILE);
-
-       #
-       # then generate character set code ->UTF8 table
-       #
-       open(FILE, $in_file) || die("cannot open $in_file");
-
-       reset 'array';
-
-       while (<FILE>)
-       {
-               chop;
-               if (/^#/)
-               {
-                       next;
-               }
-               ($c, $u, $rest) = split;
-               $ucs  = hex($u);
-               $code = hex($c);
-               if ($code >= 0x80 && $ucs >= 0x0080)
-               {
-                       $utf = &ucs2utf($ucs);
-                       if ($array{$code} ne "")
-                       {
-                               printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
-                               next;
-                       }
-                       $count++;
-                       $array{$code} = $utf;
-               }
-       }
-       close(FILE);
-
-       $file = lc("${charset}_to_utf8.map");
-       open(FILE, "> $file") || die("cannot open $file");
-
-       print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-       print FILE "static const pg_local_to_utf LUmap${charset}[ $count ] = {\n";
-       for $index (sort { $a <=> $b } keys(%array))
-       {
-               $utf = $array{$index};
-               $count--;
-               if ($count == 0)
-               {
-                       printf FILE "  {0x%04x, 0x%04x}\n", $index, $utf;
-               }
-               else
-               {
-                       printf FILE "  {0x%04x, 0x%04x},\n", $index, $utf;
-               }
-       }
-
-       print FILE "};\n";
-       close(FILE);
+       print_tables($charset, $mapping);
 }
diff --git a/src/backend/utils/mb/Unicode/convutils.pm b/src/backend/utils/mb/Unicode/convutils.pm
new file mode 100644 (file)
index 0000000..d6a13e8
--- /dev/null
@@ -0,0 +1,282 @@
+#
+# Copyright (c) 2001-2016, PostgreSQL Global Development Group
+#
+# src/backend/utils/mb/Unicode/convutils.pm
+
+use strict;
+
+#######################################################################
+# ucs2utf - convert a UCS-4 code point to UTF-8
+#
+# ucs ; Unicode code point, as an integer
+#
+# Returns the UTF-8 encoding packed into a single integer, first byte
+# in the most significant position (e.g. 0x00a2 gives 0xc2a2).  Code
+# points up to 0x7f are returned unchanged.
+sub ucs2utf
+{
+       my ($ucs) = @_;
+
+       # One byte: ASCII is its own encoding.
+       return $ucs if ($ucs <= 0x007f);
+
+       # Every multi-byte form ends with the low six bits in a 10xxxxxx byte.
+       my $last = ($ucs & 0x003f) | 0x80;
+
+       # Two bytes: 110xxxxx 10xxxxxx
+       return ((($ucs >> 6) | 0xc0) << 8) | $last if ($ucs <= 0x07ff);
+
+       # Three- and four-byte forms share the byte holding bits 6..11.
+       my $third = ((($ucs & 0x0fc0) >> 6) | 0x80) << 8;
+
+       # Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
+       return ((($ucs >> 12) | 0xe0) << 16) | $third | $last
+         if ($ucs <= 0xffff);
+
+       # Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+       my $lead   = (($ucs >> 18) | 0xf0) << 24;
+       my $second = ((($ucs & 0x3ffff) >> 12) | 0x80) << 16;
+       return $lead | $second | $third | $last;
+}
+
+#######################################################################
+# read_source - common routine to read source file
+#
+# fname ; input file name
+#
+# Returns a reference to a list of hashes, one per accepted input line,
+# with the fields code, ucs, comment, direction (always "both"), f (the
+# file name) and l (the line number).
+#
+# Comment lines, empty lines, lines that carry a code but no Unicode
+# value, and mappings where either side is below 0x80 are skipped.  Any
+# other line that cannot be parsed is fatal: the script dies, so the
+# caller (and make) sees a non-zero exit status.
+sub read_source
+{
+       my ($fname) = @_;
+       my @r;
+
+       open(my $in, '<', $fname) || die("cannot open $fname");
+
+       while (<$in>)
+       {
+               next if (/^#/);
+
+               # Remove the line terminator only.  chop would also eat the last
+               # character of a final line that has no trailing newline.
+               chomp;
+
+               next if (/^$/); # Ignore empty lines
+
+               # Ignore codes with no Unicode mapping (a code followed directly
+               # by the comment).  Accept both hex cases, like the pattern below.
+               next if (/^0x[0-9A-Fa-f]+\s+#.*$/);
+
+               # Skip the first column for JIS0208.TXT
+               # NOTE(review): with three hex columns, the code below still takes
+               # columns one and two and ignores the third -- confirm that this
+               # is what the comment above intends.
+               if (!/^0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+(?:0x([0-9A-Fa-f]+)\s+)?(#.*)$/)
+               {
+                       # A bare exit would end with status 0 and hide the failure.
+                       die("READ ERROR at line $. in $fname: $_\n");
+               }
+               my $out = {f => $fname, l => $.,
+                                  code => hex($1),
+                                  ucs => hex($2),
+                                  comment => $4,
+                                  direction => "both"
+                               };
+
+               # Ignore pure ASCII mappings. PostgreSQL character conversion code
+               # never even passes these to the conversion code.
+               next if ($out->{code} < 0x80 || $out->{ucs} < 0x80);
+
+               push(@r, $out);
+       }
+       close($in);
+
+       return \@r;
+}
+
+##################################################################
+# print_tables : output mapping tables
+#
+# Arguments:
+#  charset - string name of the character set.
+#  table   - mapping table (see format below)
+#  verbose - if 1, output comment on each line,
+#            if 2, also output source file name and number
+#
+#
+#
+# Mapping table format:
+#
+# Mapping table is a list of hashes. Each hash has the following fields:
+#   direction  - Direction: 'both', 'from_unicode' or 'to_unicode'
+#   ucs        - Unicode code point
+#   ucs_second - Second Unicode code point, if this is a "combined" character.
+#   code       - Byte sequence in the "other" character set, as an integer
+#   comment    - Text representation of the character
+#   f          - Source filename
+#   l          - Line number in source file
+#
+#
+sub print_tables
+{
+       my ($charset, $table, $verbose) = @_;
+
+       # Sort the mappings into four buckets: plain and "combined" (two code
+       # point) entries, each split by conversion direction.  An entry with
+       # direction 'both' lands in the to- and the from-Unicode bucket.
+       my (@to_unicode, @to_unicode_combined);
+       my (@from_unicode, @from_unicode_combined);
+
+       foreach my $map (@$table)
+       {
+               my $dir       = $map->{direction};
+               my $want_to   = ($dir eq "both" || $dir eq "to_unicode");
+               my $want_from = ($dir eq "both" || $dir eq "from_unicode");
+               my $combined  = defined $map->{ucs_second};
+
+               # Output entries carry the UTF-8 form instead of the code point.
+               my $entry = {utf8 => ucs2utf($map->{ucs}),
+                                        code => $map->{code},
+                                        comment => $map->{comment},
+                                        f => $map->{f}, l => $map->{l}};
+
+               if ($combined)
+               {
+                       $entry->{utf8_second} = ucs2utf($map->{ucs_second});
+                       push @to_unicode_combined, $entry if ($want_to);
+                       push @from_unicode_combined, $entry if ($want_from);
+               }
+               else
+               {
+                       push @to_unicode, $entry if ($want_to);
+                       push @from_unicode, $entry if ($want_from);
+               }
+       }
+
+       # The plain maps are always written; the combined ones only when
+       # there is at least one combined entry for that direction.
+       print_to_utf8_map($charset, \@to_unicode, $verbose);
+       print_to_utf8_combined_map($charset, \@to_unicode_combined, $verbose) if (scalar @to_unicode_combined > 0);
+       print_from_utf8_map($charset, \@from_unicode, $verbose);
+       print_from_utf8_combined_map($charset, \@from_unicode_combined, $verbose) if (scalar @from_unicode_combined > 0);
+}
+
+# print_from_utf8_map - write the UTF8 => charset table
+#
+#  charset - name of the character set; the output file is
+#            utf8_to_<charset>.map (lower-cased) in the current directory
+#  table   - list of hashes with utf8, code, comment, f and l fields
+#  verbose - if true, output the comment of each entry;
+#            if >= 2, prefix it with source file name and line number
+#
+# Entries are written in ascending order of their utf8 value.
+sub print_from_utf8_map
+{
+       my ($charset, $table, $verbose) = @_;
+
+       my $last_comment = "";
+
+       my $fname = lc("utf8_to_${charset}.map");
+       print "- Writing UTF8=>${charset} conversion table: $fname\n";
+       open(my $out, '>', $fname) || die "cannot open output file : $fname\n";
+
+       # Names are passed as printf arguments, not spliced into the format,
+       # so a '%' in a name cannot be taken for a conversion.
+       printf($out "/* src/backend/utils/mb/Unicode/%s */\n\n".
+                  "static const pg_utf_to_local ULmap%s[ %d ] = {",
+                  $fname, $charset, scalar(@$table));
+       my $first = 1;
+       foreach my $i (sort {$$a{utf8} <=> $$b{utf8}} @$table)
+       {
+               print($out ",") if (!$first);
+               $first = 0;
+
+               # The comment of the previous entry goes after the separating
+               # comma.  On the first pass it is empty and follows the header.
+               print($out "\t/* $last_comment */") if ($verbose);
+
+               printf($out "\n  {0x%04x, 0x%04x}", $$i{utf8}, $$i{code});
+               if ($verbose >= 2)
+               {
+                       $last_comment = "$$i{f}:$$i{l} $$i{comment}";
+               }
+               else
+               {
+                       $last_comment = $$i{comment};
+               }
+       }
+       print($out "\t/* $last_comment */") if ($verbose);
+       print $out "\n};\n";
+
+       # Buffered write errors (e.g. a full disk) only show up at close.
+       close($out) || die "cannot close output file : $fname\n";
+}
+
+# print_from_utf8_combined_map - write the UTF8 => charset table for
+# "combined" characters (two code points mapping to one code)
+#
+#  charset - name of the character set; the output file is
+#            utf8_to_<charset>_combined.map (lower-cased) in the current
+#            directory
+#  table   - list of hashes with utf8, utf8_second, code and comment fields
+#  verbose - if true, output the comment of each entry
+#
+# Entries are written in ascending order of their utf8 value.
+sub print_from_utf8_combined_map
+{
+       my ($charset, $table, $verbose) = @_;
+
+       my $last_comment = "";
+
+       my $fname = lc("utf8_to_${charset}_combined.map");
+       print "- Writing UTF8=>${charset} conversion table: $fname\n";
+       open(my $out, '>', $fname) || die "cannot open output file : $fname\n";
+
+       # Names are passed as printf arguments, not spliced into the format,
+       # so a '%' in a name cannot be taken for a conversion.
+       printf($out "/* src/backend/utils/mb/Unicode/%s */\n\n".
+                  "static const pg_utf_to_local_combined ULmap%s_combined[ %d ] = {",
+                  $fname, $charset, scalar(@$table));
+       my $first = 1;
+       foreach my $i (sort {$$a{utf8} <=> $$b{utf8}} @$table)
+       {
+               print($out ",") if (!$first);
+               $first = 0;
+
+               # The comment of the previous entry goes after the separating
+               # comma.  On the first pass it is empty and follows the header.
+               print($out "\t/* $last_comment */") if ($verbose);
+
+               printf($out "\n  {0x%08x, 0x%08x, 0x%04x}", $$i{utf8}, $$i{utf8_second}, $$i{code});
+               $last_comment = "$$i{comment}";
+       }
+       print($out "\t/* $last_comment */") if ($verbose);
+       print $out "\n};\n";
+
+       # Buffered write errors (e.g. a full disk) only show up at close.
+       close($out) || die "cannot close output file : $fname\n";
+}
+
+# print_to_utf8_map - write the charset => UTF8 table
+#
+#  charset - name of the character set; the output file is
+#            <charset>_to_utf8.map (lower-cased) in the current directory
+#  table   - list of hashes with utf8, code, comment, f and l fields
+#  verbose - if true, output the comment of each entry;
+#            if >= 2, prefix it with source file name and line number
+#
+# Entries are written in ascending order of their code value.
+sub print_to_utf8_map
+{
+       my ($charset, $table, $verbose) = @_;
+
+       my $last_comment = "";
+
+       my $fname = lc("${charset}_to_utf8.map");
+
+       print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
+       open(my $out, '>', $fname) || die "cannot open output file : $fname\n";
+
+       # Names are passed as printf arguments, not spliced into the format,
+       # so a '%' in a name cannot be taken for a conversion.
+       printf($out "/* src/backend/utils/mb/Unicode/%s */\n\n".
+                  "static const pg_local_to_utf LUmap%s[ %d ] = {",
+                  $fname, $charset, scalar(@$table));
+       my $first = 1;
+       foreach my $i (sort {$$a{code} <=> $$b{code}} @$table)
+       {
+               print($out ",") if (!$first);
+               $first = 0;
+
+               # The comment of the previous entry goes after the separating
+               # comma.  On the first pass it is empty and follows the header.
+               print($out "\t/* $last_comment */") if ($verbose);
+
+               printf($out "\n  {0x%04x, 0x%x}", $$i{code}, $$i{utf8});
+               if ($verbose >= 2)
+               {
+                       $last_comment = "$$i{f}:$$i{l} $$i{comment}";
+               }
+               else
+               {
+                       $last_comment = $$i{comment};
+               }
+       }
+       print($out "\t/* $last_comment */") if ($verbose);
+       print $out "\n};\n";
+
+       # Buffered write errors (e.g. a full disk) only show up at close.
+       close($out) || die "cannot close output file : $fname\n";
+}
+
+# print_to_utf8_combined_map - write the charset => UTF8 table for
+# "combined" characters (one code mapping to two code points)
+#
+#  charset - name of the character set; the output file is
+#            <charset>_to_utf8_combined.map (lower-cased) in the current
+#            directory
+#  table   - list of hashes with utf8, utf8_second, code and comment fields
+#  verbose - if true, output the comment of each entry
+#
+# Entries are written in ascending order of their code value.
+sub print_to_utf8_combined_map
+{
+       my ($charset, $table, $verbose) = @_;
+
+       my $last_comment = "";
+
+       my $fname = lc("${charset}_to_utf8_combined.map");
+
+       print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
+       open(my $out, '>', $fname) || die "cannot open output file : $fname\n";
+
+       # Names are passed as printf arguments, not spliced into the format,
+       # so a '%' in a name cannot be taken for a conversion.
+       printf($out "/* src/backend/utils/mb/Unicode/%s */\n\n".
+                  "static const pg_local_to_utf_combined LUmap%s_combined[ %d ] = {",
+                  $fname, $charset, scalar(@$table));
+       my $first = 1;
+       foreach my $i (sort {$$a{code} <=> $$b{code}} @$table)
+       {
+               print($out ",") if (!$first);
+               $first = 0;
+
+               # The comment of the previous entry goes after the separating
+               # comma.  On the first pass it is empty and follows the header.
+               print($out "\t/* $last_comment */") if ($verbose);
+
+               printf($out "\n  {0x%04x, 0x%08x, 0x%08x}", $$i{code}, $$i{utf8}, $$i{utf8_second});
+               $last_comment = "$$i{comment}";
+       }
+       print($out "\t/* $last_comment */") if ($verbose);
+       print $out "\n};\n";
+
+       # Buffered write errors (e.g. a full disk) only show up at close.
+       close($out) || die "cannot close output file : $fname\n";
+}
+
+1;
index 2c3a607bf86e43cff6d466162df09303c088d8bb..33fd42ac4647390a1ee38fb97e92e57cd6388527 100644 (file)
@@ -1,7 +1,6 @@
-/*
- * This file was generated by UCS_to_EUC_JIS_2004.pl
- */
-static const pg_local_to_utf LUmapEUC_JIS_2004[] = {
+/* src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8.map */
+
+static const pg_local_to_utf LUmapEUC_JIS_2004[ 11303 ] = {    /*  */
   {0x0080, 0xc280},    /* U+0080        <control> */
   {0x0081, 0xc281},    /* U+0081        <control> */
   {0x0082, 0xc282},    /* U+0082        <control> */
@@ -205,7 +204,7 @@ static const pg_local_to_utf LUmapEUC_JIS_2004[] = {
   {0xa2ac, 0xe28691},  /* U+2191        UPWARDS ARROW */
   {0xa2ad, 0xe28693},  /* U+2193        DOWNWARDS ARROW */
   {0xa2ae, 0xe38093},  /* U+3013        GETA MARK */
-  {0xa2af, 0xefbc87},  /* U+FF07        FULLWIDTH APOSTROPHE   [2000] */
+  {0xa2af, 0xefbc87},  /* U+FF07        FULLWIDTH APOSTROPHE */
   {0xa2b0, 0xefbc82},  /* U+FF02        FULLWIDTH QUOTATION MARK       [2000] */
   {0xa2b1, 0xefbc8d},  /* U+FF0D        FULLWIDTH HYPHEN-MINUS [2000] */
   {0xa2b2, 0xefbd9e},  /* U+FF5E        FULLWIDTH TILDE        [2000] */
index 7a7f85b105d3e9936aae7a4727d3e4128d473277..2d8987b990814b72b456ff2b7e8592fb74cdfa62 100644 (file)
@@ -1,7 +1,6 @@
-/*
- * This file was generated by UCS_to_EUC_JIS_2004.pl
- */
-static const pg_local_to_utf_combined LUmapEUC_JIS_2004_combined[] = {
+/* src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8_combined.map */
+
+static const pg_local_to_utf_combined LUmapEUC_JIS_2004_combined[ 25 ] = {     /*  */
   {0xa4f7, 0x00e3818b, 0x00e3829a},    /* U+304B+309A          [2000] */
   {0xa4f8, 0x00e3818d, 0x00e3829a},    /* U+304D+309A          [2000] */
   {0xa4f9, 0x00e3818f, 0x00e3829a},    /* U+304F+309A          [2000] */
index db427cbb24cdc94617ed026b3fe50638e42d8f9e..eb17f9829c54f6d19488e99c22a3d6b0b0271571 100644 (file)
@@ -1,6 +1,6 @@
 /* src/backend/utils/mb/Unicode/euc_jp_to_utf8.map */
 
-static const pg_local_to_utf LUmapEUC_JP[] = {
+static const pg_local_to_utf LUmapEUC_JP[ 13197 ] = {
   {0x8ea1, 0xefbda1},
   {0x8ea2, 0xefbda2},
   {0x8ea3, 0xefbda3},
@@ -13197,5 +13197,5 @@ static const pg_local_to_utf LUmapEUC_JP[] = {
   {0x8ff4fb, 0xe9ab99},
   {0x8ff4fc, 0xe9adb2},
   {0x8ff4fd, 0xefa8ad},
-  {0x8ff4fe, 0xe9bb91},
+  {0x8ff4fe, 0xe9bb91}
 };
index e37152137d6b93bc95466e9135fb5df2d7d71784..701a7a476ffed95008722289708a648c95ee9cf2 100644 (file)
@@ -1,3 +1,5 @@
+/* src/backend/utils/mb/Unicode/euc_kr_to_utf8.map */
+
 static const pg_local_to_utf LUmapEUC_KR[ 8227 ] = {
   {0xa1a1, 0xe38080},
   {0xa1a2, 0xe38081},
index 8110f6e8531c7aeebd298ac368475d272640c76d..e31d24184c15a792a4cbbf21add0cbdc697f3b7b 100644 (file)
@@ -1,3 +1,5 @@
+/* src/backend/utils/mb/Unicode/johab_to_utf8.map */
+
 static const pg_local_to_utf LUmapJOHAB[ 17049 ] = {
   {0x8444, 0xe384b3},
   {0x8446, 0xe384b5},
index 81c898c6be487ade03ba9208a5fa9927f427343d..958dde7b83d6c90a06732de6ec256df3a376d599 100644 (file)
@@ -1,7 +1,6 @@
-/*
- * This file was generated by UCS_to_SHIFTJIS_2004.pl
- */
-static const pg_local_to_utf LUmapSHIFT_JIS_2004[] = {
+/* src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8.map */
+
+static const pg_local_to_utf LUmapSHIFT_JIS_2004[ 11271 ] = {  /*  */
   {0x00a1, 0xefbda1},  /* U+FF61        HALFWIDTH IDEOGRAPHIC FULL STOP */
   {0x00a2, 0xefbda2},  /* U+FF62        HALFWIDTH LEFT CORNER BRACKET */
   {0x00a3, 0xefbda3},  /* U+FF63        HALFWIDTH RIGHT CORNER BRACKET */
@@ -173,7 +172,7 @@ static const pg_local_to_utf LUmapSHIFT_JIS_2004[] = {
   {0x81aa, 0xe28691},  /* U+2191        UPWARDS ARROW */
   {0x81ab, 0xe28693},  /* U+2193        DOWNWARDS ARROW */
   {0x81ac, 0xe38093},  /* U+3013        GETA MARK */
-  {0x81ad, 0xefbc87},  /* U+FF07        FULLWIDTH APOSTROPHE   [2000] */
+  {0x81ad, 0xefbc87},  /* U+FF07        FULLWIDTH APOSTROPHE */
   {0x81ae, 0xefbc82},  /* U+FF02        FULLWIDTH QUOTATION MARK       [2000] */
   {0x81af, 0xefbc8d},  /* U+FF0D        FULLWIDTH HYPHEN-MINUS [2000] */
   {0x81b0, 0x7e},      /* U+007E        TILDE  [2000]  Fullwidth: U+FF5E */
index b1c7bced5fd605816fd02bc2185c7d59f83ddabc..414e59dc404348d3ef10889d7d1b4121f932230e 100644 (file)
@@ -1,7 +1,6 @@
-/*
- * This file was generated by UCS_to_SHIFT_JIS_2004.pl
- */
-static const pg_local_to_utf_combined LUmapSHIFT_JIS_2004_combined[] = {
+/* src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8_combined.map */
+
+static const pg_local_to_utf_combined LUmapSHIFT_JIS_2004_combined[ 25 ] = {   /*  */
   {0x82f5, 0x00e3818b, 0x00e3829a},    /* U+304B+309A          [2000] */
   {0x82f6, 0x00e3818d, 0x00e3829a},    /* U+304D+309A          [2000] */
   {0x82f7, 0x00e3818f, 0x00e3829a},    /* U+304F+309A          [2000] */
diff --git a/src/backend/utils/mb/Unicode/ucs2utf.pl b/src/backend/utils/mb/Unicode/ucs2utf.pl
deleted file mode 100644 (file)
index e0f1fb2..0000000
+++ /dev/null
@@ -1,35 +0,0 @@
-#
-# Copyright (c) 2001-2016, PostgreSQL Global Development Group
-#
-# src/backend/utils/mb/Unicode/ucs2utf.pl
-# convert UCS-4 to UTF-8
-#
-sub ucs2utf
-{
-       local ($ucs) = @_;
-       local $utf;
-
-       if ($ucs <= 0x007f)
-       {
-               $utf = $ucs;
-       }
-       elsif ($ucs > 0x007f && $ucs <= 0x07ff)
-       {
-               $utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8);
-       }
-       elsif ($ucs > 0x07ff && $ucs <= 0xffff)
-       {
-               $utf =
-                 ((($ucs >> 12) | 0xe0) << 16) |
-                 (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
-       }
-       else
-       {
-               $utf =
-                 ((($ucs >> 18) | 0xf0) << 24) |
-                 (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) |
-                 (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
-       }
-       return ($utf);
-}
-1;
index 26a7b18f658672ef404d3894d2e9d64fb316305c..65c7e114a3a9019580609b2b3db14d7ab197c30b 100644 (file)
@@ -1,3 +1,5 @@
+/* src/backend/utils/mb/Unicode/uhc_to_utf8.map */
+
 static const pg_local_to_utf LUmapUHC[ 17237 ] = {
   {0x8141, 0xeab082},
   {0x8142, 0xeab083},
index b28eb9cc0c7ba0cb7f6d2ebc14f5a275c005a09a..3d64cd1a604dc9676b86a39f1db4838c261692b5 100644 (file)
@@ -1,3 +1,5 @@
+/* src/backend/utils/mb/Unicode/utf8_to_euc_cn.map */
+
 static const pg_utf_to_local ULmapEUC_CN[ 7445 ] = {
   {0xc2a4, 0xa1e8},
   {0xc2a7, 0xa1ec},
index 513720121768b7adeafce100f802eac3facfa05f..b50e232b6ce1ce6630b7c1895186cd2a91a8c9ce 100644 (file)
@@ -1,7 +1,6 @@
-/*
- * This file was generated by UCS_to_EUC_JIS_2004.pl
- */
-static const pg_utf_to_local ULmapEUC_JIS_2004[] = {
+/* src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004.map */
+
+static const pg_utf_to_local ULmapEUC_JIS_2004[ 11303 ] = {    /*  */
   {0xc280, 0x0080},    /* U+0080        <control> */
   {0xc281, 0x0081},    /* U+0081        <control> */
   {0xc282, 0x0082},    /* U+0082        <control> */
@@ -10849,7 +10848,7 @@ static const pg_utf_to_local ULmapEUC_JIS_2004[] = {
   {0xefbc84, 0xa1f0},  /* U+FF04        FULLWIDTH DOLLAR SIGN */
   {0xefbc85, 0xa1f3},  /* U+FF05        FULLWIDTH PERCENT SIGN */
   {0xefbc86, 0xa1f5},  /* U+FF06        FULLWIDTH AMPERSAND */
-  {0xefbc87, 0xa2af},  /* U+FF07        FULLWIDTH APOSTROPHE   [2000] */
+  {0xefbc87, 0xa2af},  /* U+FF07        FULLWIDTH APOSTROPHE */
   {0xefbc88, 0xa1ca},  /* U+FF08        FULLWIDTH LEFT PARENTHESIS */
   {0xefbc89, 0xa1cb},  /* U+FF09        FULLWIDTH RIGHT PARENTHESIS */
   {0xefbc8a, 0xa1f6},  /* U+FF0A        FULLWIDTH ASTERISK */
index d8ff5c05868fc584104d6c83edf19f2932f39b4f..0d57667a558fdbe3553e49dca70fc9ee3fe0110d 100644 (file)
@@ -1,7 +1,6 @@
-/*
- * This file was generated by UCS_to_EUC_JIS_2004.pl
- */
-static const pg_utf_to_local_combined ULmapEUC_JIS_2004_combined[] = {
+/* src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004_combined.map */
+
+static const pg_utf_to_local_combined ULmapEUC_JIS_2004_combined[ 25 ] = {     /*  */
   {0x0000c3a6, 0x0000cc80, 0xabc4},    /* U+00E6+0300          [2000] */
   {0x0000c994, 0x0000cc80, 0xabc8},    /* U+0254+0300          [2000] */
   {0x0000c994, 0x0000cc81, 0xabc9},    /* U+0254+0301          [2000] */
index 137d4fdef614e574fa53ba34ea3872ac34e2a83d..eef6db65b3440f693ad98227917663336734896e 100644 (file)
@@ -1,3 +1,5 @@
+/* src/backend/utils/mb/Unicode/utf8_to_euc_jp.map */
+
 static const pg_utf_to_local ULmapEUC_JP[ 13175 ] = {
   {0xc2a1, 0x8fa2c2},
   {0xc2a4, 0x8fa2f0},
index 4a78b260ea45759058d2762b3a0a247fd819de97..a642b2154f29d0da387871a70e873bcd9b40f034 100644 (file)
@@ -1,3 +1,5 @@
+/* src/backend/utils/mb/Unicode/utf8_to_euc_kr.map */
+
 static const pg_utf_to_local ULmapEUC_KR[ 8227 ] = {
   {0xc2a1, 0xa2ae},
   {0xc2a4, 0xa2b4},
index 869f8213d214bb23d05ce309c844ce58e7b2825b..78997d82d04e54602485c323030dbb21fce2edb1 100644 (file)
@@ -1,3 +1,5 @@
+/* src/backend/utils/mb/Unicode/utf8_to_johab.map */
+
 static const pg_utf_to_local ULmapJOHAB[ 17049 ] = {
   {0xc2a1, 0xd9ae},
   {0xc2a4, 0xd9b4},
index 4fab64fc95662bf54bac224c51bc7c636e458f22..e9f9e638c66826906cb70004d9f4a02a1e7755ed 100644 (file)
@@ -1,7 +1,6 @@
-/*
- * This file was generated by UCS_to_SHIFT_JIS_2004.pl
- */
-static const pg_utf_to_local ULmapSHIFT_JIS_2004[] = {
+/* src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004.map */
+
+static const pg_utf_to_local ULmapSHIFT_JIS_2004[ 11271 ] = {  /*  */
   {0xc2a0, 0x8541},    /* U+00A0        NO-BREAK SPACE [2000] */
   {0xc2a1, 0x8542},    /* U+00A1        INVERTED EXCLAMATION MARK      [2000] */
   {0xc2a2, 0x8191},    /* U+00A2        CENT SIGN      Windows: U+FFE0 */
@@ -10817,7 +10816,7 @@ static const pg_utf_to_local ULmapSHIFT_JIS_2004[] = {
   {0xefbc84, 0x8190},  /* U+FF04        FULLWIDTH DOLLAR SIGN */
   {0xefbc85, 0x8193},  /* U+FF05        FULLWIDTH PERCENT SIGN */
   {0xefbc86, 0x8195},  /* U+FF06        FULLWIDTH AMPERSAND */
-  {0xefbc87, 0x81ad},  /* U+FF07        FULLWIDTH APOSTROPHE   [2000] */
+  {0xefbc87, 0x81ad},  /* U+FF07        FULLWIDTH APOSTROPHE */
   {0xefbc88, 0x8169},  /* U+FF08        FULLWIDTH LEFT PARENTHESIS */
   {0xefbc89, 0x816a},  /* U+FF09        FULLWIDTH RIGHT PARENTHESIS */
   {0xefbc8a, 0x8196},  /* U+FF0A        FULLWIDTH ASTERISK */
index e55d4a2a6cfd53eca7e96653dc4f38c32b605ebf..3642851fd6aa672e743358a5a66a8ca7c347d59c 100644 (file)
@@ -1,7 +1,6 @@
-/*
- * This file was generated by UCS_to_SHIFT_JIS_2004.pl
- */
-static const pg_utf_to_local_combined ULmapSHIFT_JIS_2004_combined[] = {
+/* src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004_combined.map */
+
+static const pg_utf_to_local_combined ULmapSHIFT_JIS_2004_combined[ 25 ] = {   /*  */
   {0x0000c3a6, 0x0000cc80, 0x8663},    /* U+00E6+0300          [2000] */
   {0x0000c994, 0x0000cc80, 0x8667},    /* U+0254+0300          [2000] */
   {0x0000c994, 0x0000cc81, 0x8668},    /* U+0254+0301          [2000] */
index fb0566a1db0e0e8aaf7489a46ee82b05a703dc30..cd6ea48ffc320e26dbbef3cff8f0c93bdb1ff94b 100644 (file)
@@ -3,7 +3,7 @@
 static const pg_utf_to_local ULmapSJIS[ 7397 ] = {
   {0xc2a2, 0x8191},
   {0xc2a3, 0x8192},
-  {0xc2a5, 0x5c},
+  {0xc2a5, 0x005c},
   {0xc2a7, 0x8198},
   {0xc2a8, 0x814e},
   {0xc2ac, 0x81ca},
@@ -142,7 +142,7 @@ static const pg_utf_to_local ULmapSJIS[ 7397 ] = {
   {0xe280b2, 0x818c},
   {0xe280b3, 0x818d},
   {0xe280bb, 0x81a6},
-  {0xe280be, 0x7e},
+  {0xe280be, 0x007e},
   {0xe28483, 0x818e},
   {0xe28496, 0xfa59},
   {0xe284a1, 0xfa5a},
index 15dfb56a09958393bc1d7bbea776ab45a5f179f2..dc04726364a83adf9e894d91608307822e1118fd 100644 (file)
@@ -1,3 +1,5 @@
+/* src/backend/utils/mb/Unicode/utf8_to_uhc.map */
+
 static const pg_utf_to_local ULmapUHC[ 17237 ] = {
   {0xc2a1, 0xa2ae},
   {0xc2a4, 0xa2b4},