From: Alex Dowad Date: Sat, 14 Nov 2020 21:03:03 +0000 (+0200) Subject: Convert U+203E (OVERLINE) to 0x8150 (FULLWIDTH MACRON) in some SJIS variants X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=4f3bd2e235feffbdba7f4bb7e99502832eda5f5b;p=php Convert U+203E (OVERLINE) to 0x8150 (FULLWIDTH MACRON) in some SJIS variants Converting U+203E to 0x7E was especially wrong for CP932, where 0x7E represents a tilde. For vanilla Shift-JIS and Shift-JIS-2004, converting to 0x7E is acceptable, since 0x7E does represent an overline/macron in those encodings. Follow the same principle in CP51932, which is closely related to CP932. --- diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c b/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c index aa52d05481..0be771c3b8 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c @@ -216,8 +216,6 @@ mbfl_filt_conv_wchar_cp51932(int c, mbfl_convert_filter *filter) if (s1 <= 0) { if (c == 0xa5) { /* YEN SIGN */ s1 = 0x216F; /* FULLWIDTH YEN SIGN */ - } else if (c == 0x203e) { /* OVER LINE */ - s1 = 0x007e; /* FULLWIDTH MACRON */ } else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */ s1 = 0x2140; } else if (c == 0xff5e) { /* FULLWIDTH TILDE */ diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp932.c b/ext/mbstring/libmbfl/filters/mbfilter_cp932.c index ec192faa2b..120c5e626d 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp932.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp932.c @@ -253,8 +253,6 @@ mbfl_filt_conv_wchar_cp932(int c, mbfl_convert_filter *filter) if (s1 <= 0) { if (c == 0xa5) { /* YEN SIGN */ s1 = 0x216F; /* FULLWIDTH YEN SIGN */ - } else if (c == 0x203e) { /* OVER LINE */ - s1 = 0x007e; /* FULLWIDTH MACRON */ } else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */ s1 = 0x2140; } else if (c == 0xff5e) { /* FULLWIDTH TILDE */ diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c index 36f374a952..455c49cb9a 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c @@ -211,6 +211,8 @@ int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter) /* Unicode 0x7E is a tilde, but Shift-JIS uses 0x7E for overline (or * macron). JIS X 0208 kuten 0x2141 is 'WAVE DASH' */ s1 = 0x2141; + } else if (c == 0x203E) { /* U+203E is OVERLINE */ + s1 = 0x7E; /* Halfwidth overline/macron */ } else if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) { s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min]; } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) { @@ -223,8 +225,6 @@ int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter) if (s1 <= 0) { if (c == 0xA5) { /* YEN SIGN */ s1 = 0x5C; - } else if (c == 0x203E) { /* OVER LINE */ - s1 = 0x7E; } else if (c == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */ s1 = 0x2140; } else if (c == 0xFF5E) { /* FULLWIDTH TILDE */ diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c index 7f0ff31aff..bad0423503 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c @@ -749,8 +749,6 @@ int mbfl_filt_conv_wchar_sjis_mobile(int c, mbfl_convert_filter *filter) if (s1 <= 0) { if (c == 0xA5) { /* YEN SIGN */ s1 = 0x216F; /* FULLWIDTH YEN SIGN */ - } else if (c == 0x203E) { /* OVER LINE */ - s1 = 0x2131; /* FULLWIDTH MACRON */ } else if (c == 0xFF3c) { /* FULLWIDTH REVERSE SOLIDUS */ s1 = 0x2140; } else if (c == 0xFF5E) { /* FULLWIDTH TILDE */ diff --git a/ext/mbstring/libmbfl/filters/unicode_table_jis.h b/ext/mbstring/libmbfl/filters/unicode_table_jis.h index 3236578f9b..450428c578 100644 --- a/ext/mbstring/libmbfl/filters/unicode_table_jis.h +++ b/ext/mbstring/libmbfl/filters/unicode_table_jis.h @@ -2444,7 +2444,7 @@ const unsigned short ucs_a2_jis_table[] = { 0x2277,0x2278,0x0000,0x0000,0x0000,0x2145,0x2144,0x0000, 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, 0x2273,0x0000,0x216C,0x216D,0x0000,0x0000,0x0000,0x0000, - 0x0000,0x0000,0x0000,0x2228,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x2228,0x0000,0x0000,0x2131,0x0000, 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, diff --git a/ext/mbstring/tests/cp51932_encoding.phpt b/ext/mbstring/tests/cp51932_encoding.phpt index 2fc25fd3c8..5bf9ffe42e 100644 --- a/ext/mbstring/tests/cp51932_encoding.phpt +++ b/ext/mbstring/tests/cp51932_encoding.phpt @@ -86,6 +86,8 @@ for ($i = 0; $i <= 0x7F; $i++) /* U+00A5 is YEN SIGN; convert to FULLWIDTH YEN SIGN */ $fromUnicode["\x00\xA5"] = "\xA1\xEF"; +/* U+203E is OVERLINE; convert to FULLWIDTH MACRON */ +$fromUnicode["\x20\x3E"] = "\xA1\xB1"; testAllValidChars($validChars, 'CP51932', 'UTF-16BE', false); testAllValidChars($fromUnicode, 'UTF-16BE', 'CP51932', false); diff --git a/ext/mbstring/tests/cp932_encoding.phpt b/ext/mbstring/tests/cp932_encoding.phpt index ec9e76f1f6..25844774de 100644 --- a/ext/mbstring/tests/cp932_encoding.phpt +++ b/ext/mbstring/tests/cp932_encoding.phpt @@ -47,6 +47,9 @@ $fromUnicode["\x20\x16"] = "\x81\x61"; * but when converting Unicode to CP932, we also accept U+00AC (NOT SIGN) */ $fromUnicode["\x00\xAC"] = "\x81\xCA"; +/* U+203E is OVERLINE; convert to JIS X 0208 FULLWIDTH MACRON */ +$fromUnicode["\x20\x3E"] = "\x81\x50"; + findInvalidChars($validChars, $invalidChars, $truncated, array_fill_keys(range(0x81, 0x9F), 2) + array_fill_keys(range(0xE0, 0xFC), 2)); findInvalidChars($fromUnicode, $invalidCodepoints, $unused, array_fill_keys(range(0, 0xFF), 2)); diff --git a/ext/mbstring/tests/eucjp_encoding.phpt b/ext/mbstring/tests/eucjp_encoding.phpt index dc321d3bae..3a90431e4a 100644 --- a/ext/mbstring/tests/eucjp_encoding.phpt +++ b/ext/mbstring/tests/eucjp_encoding.phpt @@ -43,6 +43,9 @@ $fromUnicode["\x00\x00\x00\x7E"] = "\x7E"; /* Likewise with 0x005C */ $fromUnicode["\x00\x00\x00\x5C"] = "\x5C"; +/* U+203E is OVERLINE; convert to FULLWIDTH MACRON */ +$fromUnicode["\x00\x00\x20\x3E"] = "\xA1\xB1"; + findInvalidChars($validChars, $invalidChars, $truncated, array_fill_keys(range(0xA1, 0xFE), 2) + array(0x8E => 2, 0x8F => 3)); /* In the JIS X 0212 character set, kuten code 0x2237 (EUC-JP 0x8FA2B7) diff --git a/ext/mbstring/tests/sjismac_encoding.phpt b/ext/mbstring/tests/sjismac_encoding.phpt index 2dedfa7970..e8b09d266f 100644 --- a/ext/mbstring/tests/sjismac_encoding.phpt +++ b/ext/mbstring/tests/sjismac_encoding.phpt @@ -62,6 +62,9 @@ $fromUnicode["\x00\x7F"] = "\x7F"; * and U+2015 */ $fromUnicode["\x20\x15"] = "\x81\x5C"; +/* Convert U+203E (OVERLINE) to 0x8150 (FULLWIDTH MACRON) */ +$fromUnicode["\x20\x3E"] = "\x81\x50"; + testAllValidChars($validChars, 'SJIS-mac', 'UTF-32BE'); echo "MacJapanese verification and conversion works on all valid characters\n";