3 # Copyright (c) 2001-2010, PostgreSQL Global Development Group
5 # src/backend/utils/mb/Unicode/UCS_to_BIG5.pl
# Generate UTF-8 <--> BIG5 conversion tables from
# map files provided by the Unicode organization.
# Unfortunately the organization prohibits redistribution
# of the map files, so to use this script you must first
# obtain them from the organization's FTP site:
# ftp://www.unicode.org/Public/MAPPINGS/
14 # Our "big5" comes from BIG5.TXT, with the addition of the characters
15 # in the range 0xf9d6-0xf9dc from CP950.TXT.
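# BIG5.TXT format (whitespace-separated columns):
#   BIG5 code in hex
#   UCS-2 code in hex
#   # and Unicode name (not used in this script)
#
# CP950.TXT format (whitespace-separated columns):
#   CP950 code in hex
#   UCS-2 code in hex
#   # and Unicode name (not used in this script)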
# First, generate the UTF8 --> BIG5 table.
#
# Each data line of a map file is split on whitespace into a local code and a
# UCS code (both hex); lines starting with '#' are skipped.  The UCS code is
# converted with ucs2utf() (defined outside this section) and the result is
# used as the table key, with the BIG5 code as the value.  The table is then
# written to utf8_to_big5.map as a C array sorted numerically by key.
{
    # Map files in priority order: when two lines produce the same key, the
    # first one seen wins and the later one is reported on STDERR and skipped.
    # CP950.TXT contributes only the ETEN extended characters in the range
    # 0xf9d6 - 0xf9dc.
    my @sources = (
        { file => "BIG5.TXT" },
        { file => "CP950.TXT", min => 0xf9d6, max => 0xf9dc },
    );

    my %big5_of_utf8;    # ucs2utf() value => BIG5 code

    for my $source (@sources) {
        my $in_file = $source->{file};
        open( my $in, '<', $in_file ) or die("cannot open $in_file: $!");

        while ( my $line = <$in> ) {
            next if $line =~ /^#/;

            my ( $c, $u ) = split ' ', $line;
            next unless defined $u;    # blank or single-column line

            my $code = hex($c);
            my $ucs  = hex($u);

            # Only non-ASCII mappings belong in the table.
            next unless $code >= 0x80 && $ucs >= 0x0080;

            # Honour the per-file code window, if one is configured.
            next
              if defined $source->{min}
              && ( $code < $source->{min} || $code > $source->{max} );

            my $utf = ucs2utf($ucs);
            if ( exists $big5_of_utf8{$utf} ) {
                printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
                next;
            }
            $big5_of_utf8{$utf} = $code;
        }

        close($in);
    }

    # Format all entries first so the element count and the comma placement
    # (every entry but the last) fall out of the list itself.
    my @entries =
      map { sprintf( " {0x%04x, 0x%04x}", $_, $big5_of_utf8{$_} ) }
      sort { $a <=> $b } keys %big5_of_utf8;
    my $count = scalar @entries;

    my $file = lc("utf8_to_big5.map");
    open( my $out, '>', $file ) or die("cannot open $file: $!");
    print $out "static pg_utf_to_local ULmapBIG5[ $count ] = {\n";
    print $out join( ",\n", @entries ), "\n" if @entries;
    print $out "};\n";

    # Buffered write errors only surface at close time.
    close($out) or die("cannot close $file: $!");
}
# Then generate the BIG5 --> UTF8 table.
#
# Each data line of a map file is split on whitespace into a local code and a
# UCS code (both hex); lines starting with '#' are skipped.  The BIG5 code is
# the table key and the ucs2utf() conversion of the UCS code (ucs2utf is
# defined outside this section) is the value.  The table is then written to
# big5_to_utf8.map as a C array sorted numerically by BIG5 code.
{
    # Map files in priority order: when two lines carry the same BIG5 code,
    # the first one seen wins and the later one is reported on STDERR and
    # skipped.  CP950.TXT contributes only the ETEN extended characters in
    # the range 0xf9d6 - 0xf9dc.
    my @sources = (
        { file => "BIG5.TXT" },
        { file => "CP950.TXT", min => 0xf9d6, max => 0xf9dc },
    );

    my %utf8_of_big5;    # BIG5 code => ucs2utf() value

    for my $source (@sources) {
        my $in_file = $source->{file};
        open( my $in, '<', $in_file ) or die("cannot open $in_file: $!");

        while ( my $line = <$in> ) {
            next if $line =~ /^#/;

            my ( $c, $u ) = split ' ', $line;
            next unless defined $u;    # blank or single-column line

            my $code = hex($c);
            my $ucs  = hex($u);

            # Only non-ASCII mappings belong in the table.
            next unless $code >= 0x80 && $ucs >= 0x0080;

            # Honour the per-file code window, if one is configured.
            next
              if defined $source->{min}
              && ( $code < $source->{min} || $code > $source->{max} );

            # This table is keyed by BIG5 code, so duplicates must be
            # detected on the code.  Testing the UTF-8 value against these
            # keys would compare unrelated numbers: two-byte UTF-8 values
            # (0xc280-0xdfbf) fall inside the BIG5 code range and could cause
            # a valid mapping to be dropped.
            if ( exists $utf8_of_big5{$code} ) {
                printf STDERR "Warning: duplicate code: %04x\n", $code;
                next;
            }
            $utf8_of_big5{$code} = ucs2utf($ucs);
        }

        close($in);
    }

    # Format all entries first so the element count and the comma placement
    # (every entry but the last) fall out of the list itself.
    my @entries =
      map { sprintf( " {0x%04x, 0x%04x}", $_, $utf8_of_big5{$_} ) }
      sort { $a <=> $b } keys %utf8_of_big5;
    my $count = scalar @entries;

    my $file = lc("big5_to_utf8.map");
    open( my $out, '>', $file ) or die("cannot open $file: $!");
    print $out "static pg_local_to_utf LUmapBIG5[ $count ] = {\n";
    print $out join( ",\n", @entries ), "\n" if @entries;
    print $out "};\n";

    # Buffered write errors only surface at close time.
    close($out) or die("cannot close $file: $!");
}