]> granicus.if.org Git - php/commitdiff
Remove useless mbstring encoding 'JIS-ms'
author: Alex Dowad <alexinbeijing@gmail.com>
Sat, 17 Oct 2020 19:29:47 +0000 (21:29 +0200)
committer: Alex Dowad <alexinbeijing@gmail.com>
Fri, 15 Jan 2021 19:55:41 +0000 (21:55 +0200)
Microsoft invented three encodings very similar to ISO-2022-JP/JIS7/JIS8, called
CP50220, CP50221, and CP50222. All three are supported by mbstring.

Since these encodings are very similar, some code can be shared. Actually,
conversion of CP50220/1/2 to Unicode is exactly the same operation; it's when
converting from Unicode to CP50220/1/2 that some small differences arise in how
certain katakana are handled.

The most important common code was a function called `mbfl_filt_conv_jis_ms_wchar`.
The `jis_ms` part doubtless refers to the fact that these encodings are modified
versions of 'JIS' invented by 'MS'. mbstring also went a step further and exported
'JIS-ms' to userland as a separate encoding from CP50220/1/2. If users requested
'JIS-ms' conversion, they got something like CP50220/1/2, minus their special
ways of handling half-width katakana when converting from Unicode.

But... that 'encoding' is not something which actually exists in the world outside
of mbstring. CP50220/1/2 do exist in Microsoft software, but not 'JIS-ms'.

For a text encoding conversion library, inventing new variant encodings and
implementing them is not very productive. Our interest is in handling text
encodings which real people actually use for... you know, storing actual text
and things like that.

ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c
ext/mbstring/libmbfl/filters/mbfilter_cp5022x.h
ext/mbstring/libmbfl/mbfl/mbfl_encoding.c
ext/mbstring/libmbfl/mbfl/mbfl_encoding.h

index d98366d1a6c6b922b8b025f6883e54445167f6c0..da4b08c892fe197d356f90133db49e22e4c1d714 100644 (file)
@@ -45,19 +45,12 @@ static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter);
  * This was just CP50220, but the implementation was less strict regarding
  * invalid characters; it would silently pass some through
  * This 'encoding' only existed in mbstring. In case some poor, lost soul is
- * still using it, retain minimal support by aliasing it to CP50220 */
-static const char *cp50220_aliases[] = {"cp50220raw", "cp50220-raw", NULL};
-
-const mbfl_encoding mbfl_encoding_jis_ms = {
-       mbfl_no_encoding_jis_ms,
-       "JIS-ms",
-       "ISO-2022-JP",
-       NULL,
-       NULL,
-       MBFL_ENCTYPE_MBCS | MBFL_ENCTYPE_GL_UNSAFE,
-       &vtbl_jis_ms_wchar,
-       &vtbl_wchar_jis_ms
-};
+ * still using it, retain minimal support by aliasing it to CP50220
+ *
+ * Further, mbstring also had a made-up encoding called "JIS-ms"
+ * This was the same as CP5022{0,1,2}, but without their special ways of
+ * handling conversion of Unicode half-width katakana */
+static const char *cp50220_aliases[] = {"cp50220raw", "cp50220-raw", "JIS-ms", NULL};
 
 const mbfl_encoding mbfl_encoding_cp50220 = {
        mbfl_no_encoding_cp50220,
@@ -92,32 +85,12 @@ const mbfl_encoding mbfl_encoding_cp50222 = {
        &vtbl_wchar_cp50222
 };
 
-const struct mbfl_convert_vtbl vtbl_jis_ms_wchar = {
-       mbfl_no_encoding_jis_ms,
-       mbfl_no_encoding_wchar,
-       mbfl_filt_conv_common_ctor,
-       NULL,
-       mbfl_filt_conv_jis_ms_wchar,
-       mbfl_filt_conv_common_flush,
-       NULL,
-};
-
-const struct mbfl_convert_vtbl vtbl_wchar_jis_ms = {
-       mbfl_no_encoding_wchar,
-       mbfl_no_encoding_jis_ms,
-       mbfl_filt_conv_common_ctor,
-       NULL,
-       mbfl_filt_conv_wchar_jis_ms,
-       mbfl_filt_conv_any_jis_flush,
-       NULL,
-};
-
 const struct mbfl_convert_vtbl vtbl_cp50220_wchar = {
        mbfl_no_encoding_cp50220,
        mbfl_no_encoding_wchar,
        mbfl_filt_conv_common_ctor,
        NULL,
-       mbfl_filt_conv_jis_ms_wchar,
+       mbfl_filt_conv_cp5022x_wchar,
        mbfl_filt_conv_cp5022x_wchar_flush,
        NULL,
 };
@@ -137,7 +110,7 @@ const struct mbfl_convert_vtbl vtbl_cp50221_wchar = {
        mbfl_no_encoding_wchar,
        mbfl_filt_conv_common_ctor,
        NULL,
-       mbfl_filt_conv_jis_ms_wchar,
+       mbfl_filt_conv_cp5022x_wchar,
        mbfl_filt_conv_cp5022x_wchar_flush,
        NULL,
 };
@@ -157,7 +130,7 @@ const struct mbfl_convert_vtbl vtbl_cp50222_wchar = {
        mbfl_no_encoding_wchar,
        mbfl_filt_conv_common_ctor,
        NULL,
-       mbfl_filt_conv_jis_ms_wchar,
+       mbfl_filt_conv_cp5022x_wchar,
        mbfl_filt_conv_cp5022x_wchar_flush,
        NULL,
 };
@@ -174,11 +147,7 @@ const struct mbfl_convert_vtbl vtbl_wchar_cp50222 = {
 
 #define CK(statement)  do { if ((statement) < 0) return (-1); } while (0)
 
-/*
- * JIS-ms => wchar
- */
-int
-mbfl_filt_conv_jis_ms_wchar(int c, mbfl_convert_filter *filter)
+int mbfl_filt_conv_cp5022x_wchar(int c, mbfl_convert_filter *filter)
 {
        int c1, s, w;
 
@@ -355,154 +324,6 @@ static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter)
        return 0;
 }
 
-/*
- * wchar => JIS
- */
-int
-mbfl_filt_conv_wchar_jis_ms(int c, mbfl_convert_filter *filter)
-{
-       int s = 0;
-
-       if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
-               s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
-       } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
-               s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
-       } else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
-               s = ucs_i_jis_table[c - ucs_i_jis_table_min];
-       } else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
-               s = ucs_r_jis_table[c - ucs_r_jis_table_min];
-       } else if (c >= 0xe000 && c < (0xe000 + 10 * 94)) {
-               /* PUE => Microsoft extended (pseudo 95ku - 114ku) */
-               /* See http://www.opengroup.or.jp/jvc/cde/ucs-conv.html#ch4_2 */
-               s = c - 0xe000;
-               s = ((s / 94) + 0x7F) << 8 | ((s % 94) + 0x21);
-       } else if (c >= (0xe000 + 10 * 94) && c <= (0xe000 + 20 * 94)) {
-               /* PUE => JISX0212 user-defined (G3 85ku - 94ku) */
-               /* See http://www.opengroup.or.jp/jvc/cde/ucs-conv.html#ch4_2 */
-               s = c - (0xe000 + 10 * 94);
-               s = (s / 94 + 0xf5) << 8 | (s % 94 + 0xa1);
-       }
-
-       /* do some transliteration */
-       if (s <= 0) {
-               if (c == 0xa5) {                /* YEN SIGN */
-                       s = 0x1005c;
-               } else if (c == 0x203e) {       /* OVER LINE */
-                       s = 0x1007e;
-               } else if (c == 0xff3c) {       /* FULLWIDTH REVERSE SOLIDUS */
-                       s = 0x2140;
-               } else if (c == 0x2225) {       /* PARALLEL TO */
-                       s = 0x2142;
-               } else if (c == 0xff0d) {       /* FULLWIDTH HYPHEN-MINUS */
-                       s = 0x215d;
-               } else if (c == 0xffe0) {       /* FULLWIDTH CENT SIGN */
-                       s = 0x2171;
-               } else if (c == 0xffe1) {       /* FULLWIDTH POUND SIGN */
-                       s = 0x2172;
-               } else if (c == 0xffe2) {       /* FULLWIDTH NOT SIGN */
-                       s = 0x224c;
-               }
-       }
-       if (s <= 0 || (s >= 0x8080 && s < 0x10000)) {
-               int i;
-               s = -1;
-
-               for (i = 0;
-                               i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
-                       const int oh = cp932ext1_ucs_table_min / 94;
-
-                       if (c == cp932ext1_ucs_table[i]) {
-                               s = ((i / 94 + oh + 0x21) << 8) + (i % 94 + 0x21);
-                               break;
-                       }
-               }
-
-               if (s < 0) {
-                       const int oh = cp932ext2_ucs_table_min / 94;
-                       const int cp932ext2_ucs_table_size =
-                                       cp932ext2_ucs_table_max - cp932ext2_ucs_table_min;
-                       for (i = 0; i < cp932ext2_ucs_table_size; i++) {
-                               if (c == cp932ext2_ucs_table[i]) {
-                                       s = ((i / 94 + oh + 0x21) << 8) + (i % 94 + 0x21);
-                                       break;
-                               }
-                       }
-               }
-
-               if (s < 0) {
-                       const int cp932ext3_ucs_table_size =
-                                       cp932ext3_ucs_table_max - cp932ext3_ucs_table_min;
-                       const int limit = cp932ext3_ucs_table_size >
-                                       cp932ext3_eucjp_table_size ?
-                                               cp932ext3_eucjp_table_size:
-                                               cp932ext3_ucs_table_size;
-                       for (i = 0; i < limit; i++) {
-                               if (c == cp932ext3_ucs_table[i]) {
-                                       s = cp932ext3_eucjp_table[i];
-                                       break;
-                               }
-                       }
-               }
-
-               if (c == 0) {
-                       s = 0;
-               } else if (s <= 0) {
-                       s = -1;
-               }
-       }
-
-       if (s >= 0) {
-               if (s < 0x80) { /* ASCII */
-                       if ((filter->status & 0xff00) != 0) {
-                               CK((*filter->output_function)(0x1b, filter->data));             /* ESC */
-                               CK((*filter->output_function)(0x28, filter->data));             /* '(' */
-                               CK((*filter->output_function)(0x42, filter->data));             /* 'B' */
-                       }
-                       filter->status = 0;
-                       CK((*filter->output_function)(s, filter->data));
-               } else if (s < 0x100) { /* kana */
-                       if ((filter->status & 0xff00) != 0x100) {
-                               CK((*filter->output_function)(0x1b, filter->data));             /* ESC */
-                               CK((*filter->output_function)(0x28, filter->data));             /* '(' */
-                               CK((*filter->output_function)(0x49, filter->data));             /* 'I' */
-                       }
-                       filter->status = 0x100;
-                       CK((*filter->output_function)(s & 0x7f, filter->data));
-               } else if (s < 0x8080) { /* X 0208 */
-                       if ((filter->status & 0xff00) != 0x200) {
-                               CK((*filter->output_function)(0x1b, filter->data));             /* ESC */
-                               CK((*filter->output_function)(0x24, filter->data));             /* '$' */
-                               CK((*filter->output_function)(0x42, filter->data));             /* 'B' */
-                       }
-                       filter->status = 0x200;
-                       CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
-                       CK((*filter->output_function)(s & 0xff, filter->data));
-               } else if (s < 0x10000) { /* X 0212 */
-                       if ((filter->status & 0xff00) != 0x300) {
-                               CK((*filter->output_function)(0x1b, filter->data));             /* ESC */
-                               CK((*filter->output_function)(0x24, filter->data));             /* '$' */
-                               CK((*filter->output_function)(0x28, filter->data));             /* '(' */
-                               CK((*filter->output_function)(0x44, filter->data));             /* 'D' */
-                       }
-                       filter->status = 0x300;
-                       CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
-                       CK((*filter->output_function)(s & 0x7f, filter->data));
-               } else { /* X 0201 latin */
-                       if ((filter->status & 0xff00) != 0x400) {
-                               CK((*filter->output_function)(0x1b, filter->data));             /* ESC */
-                               CK((*filter->output_function)(0x28, filter->data));             /* '(' */
-                               CK((*filter->output_function)(0x4a, filter->data));             /* 'J' */
-                       }
-                       filter->status = 0x400;
-                       CK((*filter->output_function)(s & 0x7f, filter->data));
-               }
-       } else {
-               CK(mbfl_filt_conv_illegal_output(c, filter));
-       }
-
-       return c;
-}
-
 /*
  * wchar => CP50220
  */
@@ -843,8 +664,8 @@ mbfl_filt_conv_wchar_cp50222_flush(mbfl_convert_filter *filter)
        }
        filter->status &= 0xff;
 
-       if (filter->flush_function != NULL) {
-               return (*filter->flush_function)(filter->data);
+       if (filter->flush_function) {
+               (*filter->flush_function)(filter->data);
        }
 
        return 0;
index 12ab19d4974a44e01675f2af3796643f94026a03..fdbaad677557fc39c122140ef66a5e52954eec27 100644 (file)
 
 #include "mbfilter.h"
 
-extern const mbfl_encoding mbfl_encoding_jis_ms;
 extern const mbfl_encoding mbfl_encoding_cp50220;
 extern const mbfl_encoding mbfl_encoding_cp50221;
 extern const mbfl_encoding mbfl_encoding_cp50222;
 
-extern const struct mbfl_convert_vtbl vtbl_jis_ms_wchar;
-extern const struct mbfl_convert_vtbl vtbl_wchar_jis_ms;
 extern const struct mbfl_convert_vtbl vtbl_cp50220_wchar;
 extern const struct mbfl_convert_vtbl vtbl_wchar_cp50220;
 extern const struct mbfl_convert_vtbl vtbl_cp50221_wchar;
@@ -46,8 +43,7 @@ extern const struct mbfl_convert_vtbl vtbl_wchar_cp50221;
 extern const struct mbfl_convert_vtbl vtbl_cp50222_wchar;
 extern const struct mbfl_convert_vtbl vtbl_wchar_cp50222;
 
-int mbfl_filt_conv_jis_ms_wchar(int c, mbfl_convert_filter *filter);
-int mbfl_filt_conv_wchar_jis_ms(int c, mbfl_convert_filter *filter);
+int mbfl_filt_conv_cp5022x_wchar(int c, mbfl_convert_filter *filter);
 int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter);
 int mbfl_filt_conv_wchar_cp50221(int c, mbfl_convert_filter *filter);
 int mbfl_filt_conv_wchar_cp50222(int c, mbfl_convert_filter *filter);
index 12239b96ea6150534f482e277bc8fa6518903e0c..1a0e65d95db563e232d8d9c7e2bc00501f83bbf3 100644 (file)
@@ -161,7 +161,6 @@ static const mbfl_encoding *mbfl_encoding_ptr_list[] = {
        &mbfl_encoding_koi8u,
        &mbfl_encoding_armscii8,
        &mbfl_encoding_cp850,
-       &mbfl_encoding_jis_ms,
        &mbfl_encoding_2022jp_2004,
        &mbfl_encoding_2022jp_kddi,
        &mbfl_encoding_cp50220,
index 9f926d035ce4e6784c3a2c3a323c502d9b906412..dc8ae1d5e165bb28f3a18e2c6d0dc3881d9a518d 100644 (file)
@@ -113,7 +113,6 @@ enum mbfl_no_encoding {
        mbfl_no_encoding_8859_16,
        mbfl_no_encoding_armscii8,
        mbfl_no_encoding_cp850,
-       mbfl_no_encoding_jis_ms,
        mbfl_no_encoding_cp50220,
        mbfl_no_encoding_cp50221,
        mbfl_no_encoding_cp50222,