From: Alex Dowad Date: Sat, 14 Nov 2020 21:43:28 +0000 (+0200) Subject: Convert U+00AF (MACRON) to 0x8150 (FULLWIDTH MACRON) in some SJIS variants X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=c9fea7db728ae0a22bf6f903fcbbea9468f222a4;p=php Convert U+00AF (MACRON) to 0x8150 (FULLWIDTH MACRON) in some SJIS variants Except for vanilla Shift-JIS, where 0x7E is a halfwidth overline/macron. As for Shift-JIS-2004, it has an added character (byte sequence 0x854A) which was defined as a halfwidth macron in JIS X 0213:2000, so we use that. --- diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c index 507a26a5b1..25ce6c92bc 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c @@ -194,7 +194,9 @@ mbfl_filt_conv_wchar_eucjp(int c, mbfl_convert_filter *filter) { int s = 0; - if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) { + if (c == 0xAF) { /* U+00AF is MACRON */ + s = 0xA2B4; /* Use JIS X 0212 overline */ + } else if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) { s = ucs_a1_jis_table[c - ucs_a1_jis_table_min]; } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) { s = ucs_a2_jis_table[c - ucs_a2_jis_table_min]; diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c index 455c49cb9a..bde382a6d3 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c @@ -211,7 +211,7 @@ int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter) /* Unicode 0x7E is a tilde, but Shift-JIS uses 0x7E for overline (or * macron). JIS X 0208 kuten 0x2141 is 'WAVE DASH' */ s1 = 0x2141; - } else if (c == 0x203E) { /* U+203E is OVERLINE */ + } else if (c == 0xAF || c == 0x203E) { /* U+00AF is MACRON, U+203E is OVERLINE */ s1 = 0x7E; /* Halfwidth overline/macron */ } else if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) { s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min]; diff --git a/ext/mbstring/libmbfl/filters/unicode_table_jis.h b/ext/mbstring/libmbfl/filters/unicode_table_jis.h index 640c5587d8..04e6a63b9e 100644 --- a/ext/mbstring/libmbfl/filters/unicode_table_jis.h +++ b/ext/mbstring/libmbfl/filters/unicode_table_jis.h @@ -2303,7 +2303,7 @@ const unsigned short ucs_a1_jis_table[] = { 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, 0x0000,0xA2C2,0x2171,0x2172,0xA2F0,0x0000,0xA2C3,0x2178, - 0x212F,0xA2ED,0xA2EC,0x0000,0x224C,0x0000,0xA2EE,0xA2B4, + 0x212F,0xA2ED,0xA2EC,0x0000,0x224C,0x0000,0xA2EE,0x2131, 0x216B,0x215E,0x0000,0x0000,0x212D,0x0000,0x2279,0x0000, 0xA2B1,0x0000,0xA2EB,0x0000,0x0000,0x0000,0x0000,0xA2C4, 0xAAA2,0xAAA1,0xAAA4,0xAAAA,0xAAA3,0xAAA9,0xA9A1,0xAAAE, diff --git a/ext/mbstring/tests/cp51932_encoding.phpt b/ext/mbstring/tests/cp51932_encoding.phpt index 5bf9ffe42e..bf7b60bcc3 100644 --- a/ext/mbstring/tests/cp51932_encoding.phpt +++ b/ext/mbstring/tests/cp51932_encoding.phpt @@ -88,6 +88,8 @@ for ($i = 0; $i <= 0x7F; $i++) $fromUnicode["\x00\xA5"] = "\xA1\xEF"; /* U+203E is OVERLINE; convert to FULLWIDTH MACRON */ $fromUnicode["\x20\x3E"] = "\xA1\xB1"; +/* U+00AF is MACRON; convert to FULLWIDTH MACRON */ +$fromUnicode["\x00\xAF"] = "\xA1\xB1"; testAllValidChars($validChars, 'CP51932', 'UTF-16BE', false); testAllValidChars($fromUnicode, 'UTF-16BE', 'CP51932', false); diff --git a/ext/mbstring/tests/cp932_encoding.phpt b/ext/mbstring/tests/cp932_encoding.phpt index 25844774de..b426281f24 100644 --- a/ext/mbstring/tests/cp932_encoding.phpt +++ b/ext/mbstring/tests/cp932_encoding.phpt @@ -50,6 +50,9 @@ $fromUnicode["\x00\xAC"] = "\x81\xCA"; /* U+203E is OVERLINE; convert to JIS X 0208 FULLWIDTH MACRON */ $fromUnicode["\x20\x3E"] = "\x81\x50"; +/* U+00AF is MACRON; it can also go to FULLWIDTH MACRON */ +$fromUnicode["\x00\xAF"] = "\x81\x50"; + findInvalidChars($validChars, $invalidChars, $truncated, array_fill_keys(range(0x81, 0x9F), 2) + array_fill_keys(range(0xE0, 0xFC), 2)); findInvalidChars($fromUnicode, $invalidCodepoints, $unused, array_fill_keys(range(0, 0xFF), 2)); diff --git a/ext/mbstring/tests/sjis_encoding.phpt b/ext/mbstring/tests/sjis_encoding.phpt index 8ac3b0563e..d7d7d26457 100644 --- a/ext/mbstring/tests/sjis_encoding.phpt +++ b/ext/mbstring/tests/sjis_encoding.phpt @@ -24,6 +24,8 @@ $fromUnicode["\x00\x7E"] = "\x81\x60"; /* DEL character */ $validChars["\x7F"] = "\x00\x7F"; $fromUnicode["\x00\x7F"] = "\x7F"; +/* U+00AF is MACRON; Shift-JIS 0x7E is overline */ +$fromUnicode["\x00\xAF"] = "\x7E"; /* Use fullwidth reverse solidus, not (halfwidth) backslash (0x5C) */ $validChars["\x81\x5F"] = "\xFF\x3C"; $fromUnicode["\xFF\x3C"] = "\x81\x5F"; diff --git a/ext/mbstring/tests/sjismac_encoding.phpt b/ext/mbstring/tests/sjismac_encoding.phpt index 3c36484f4a..5803f0dc02 100644 --- a/ext/mbstring/tests/sjismac_encoding.phpt +++ b/ext/mbstring/tests/sjismac_encoding.phpt @@ -64,6 +64,8 @@ $fromUnicode["\x20\x15"] = "\x81\x5C"; /* Convert U+203E (OVERLINE) to 0x8150 (FULLWIDTH MACRON) */ $fromUnicode["\x20\x3E"] = "\x81\x50"; +/* And also U+00AF (MACRON) */ +$fromUnicode["\x00\xAF"] = "\x81\x50"; /* Convert U+FF5E (FULLWIDTH TILDE) to 0x8160 (WAVE DASH) */ $fromUnicode["\xFF\x5E"] = "\x81\x60";