From e4ee97911132c6ad4dee372369472316a33b4eee Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Sat, 14 Nov 2020 21:15:11 +0200 Subject: [PATCH] 0x5C is not a Yen sign in CP932 (or CP51932) When Microsoft created CP932 (their version of Shift-JIS), they explicitly used bytes 0-0x7F to represent ASCII characters rather than JIS X 0201 characters. So when converting Unicode to CP932, it is not correct to convert U+00A5 to CP932 0x5C. Fortunately, CP932 does have a multi-byte FULLWIDTH YEN SIGN character which we can use instead. CP51932 uses the same extended character set as CP932; while CP932 is Microsoft's extended version of Shift-JIS, CP51932 is their extended version of EUC-JP. So the same reasoning applies to CP51932. --- ext/mbstring/libmbfl/filters/mbfilter_cp51932.c | 4 ++-- ext/mbstring/libmbfl/filters/mbfilter_cp932.c | 4 ++-- ext/mbstring/tests/cp51932_encoding.phpt | 3 +++ ext/mbstring/tests/cp932_encoding.phpt | 2 ++ 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c b/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c index d9fecc9d4d..aa52d05481 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c @@ -214,8 +214,8 @@ mbfl_filt_conv_wchar_cp51932(int c, mbfl_convert_filter *filter) } if (s1 >= 0x8080) s1 = -1; /* we don't support JIS X0213 */ if (s1 <= 0) { - if (c == 0xa5) { /* YEN SIGN */ - s1 = 0x005c; /* YEN SIGN */ + if (c == 0xa5) { /* YEN SIGN */ + s1 = 0x216F; /* FULLWIDTH YEN SIGN */ } else if (c == 0x203e) { /* OVER LINE */ s1 = 0x007e; /* FULLWIDTH MACRON */ } else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */ diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp932.c b/ext/mbstring/libmbfl/filters/mbfilter_cp932.c index 6246600de8..ec192faa2b 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp932.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp932.c @@ -251,8 +251,8 @@ mbfl_filt_conv_wchar_cp932(int c, 
mbfl_convert_filter *filter) s2 = 1; } if (s1 <= 0) { - if (c == 0xa5) { /* YEN SIGN */ - s1 = 0x005c; /* YEN SIGN */ + if (c == 0xa5) { /* YEN SIGN */ + s1 = 0x216F; /* FULLWIDTH YEN SIGN */ } else if (c == 0x203e) { /* OVER LINE */ s1 = 0x007e; /* FULLWIDTH MACRON */ } else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */ diff --git a/ext/mbstring/tests/cp51932_encoding.phpt b/ext/mbstring/tests/cp51932_encoding.phpt index 8dbbeb85a0..2fc25fd3c8 100644 --- a/ext/mbstring/tests/cp51932_encoding.phpt +++ b/ext/mbstring/tests/cp51932_encoding.phpt @@ -84,6 +84,9 @@ unset($fromUnicode["\x30\x94"]); // Don't map hiragana vu to katakana vu for ($i = 0; $i <= 0x7F; $i++) $validChars[chr($i)] = "\x00" . chr($i); +/* U+00A5 is YEN SIGN; convert to FULLWIDTH YEN SIGN */ +$fromUnicode["\x00\xA5"] = "\xA1\xEF"; + testAllValidChars($validChars, 'CP51932', 'UTF-16BE', false); testAllValidChars($fromUnicode, 'UTF-16BE', 'CP51932', false); echo "CP51932 verification and conversion works on all valid characters\n"; diff --git a/ext/mbstring/tests/cp932_encoding.phpt b/ext/mbstring/tests/cp932_encoding.phpt index ddcf8a449b..ec9e76f1f6 100644 --- a/ext/mbstring/tests/cp932_encoding.phpt +++ b/ext/mbstring/tests/cp932_encoding.phpt @@ -30,6 +30,8 @@ for ($i = 0xF0; $i <= 0xF9; $i++) { $fromUnicode["\x00\xA2"] = "\x81\x91"; /* U+00A3 is POUND SIGN; convert to FULLWIDTH POUND SIGN */ $fromUnicode["\x00\xA3"] = "\x81\x92"; +/* U+00A5 is YEN SIGN; convert to FULLWIDTH YEN SIGN */ +$fromUnicode["\x00\xA5"] = "\x81\x8F"; /* We map the JIS X 0208 FULLWIDTH TILDE to U+FF5E (FULLWIDTH TILDE) * But when converting Unicode to CP932, we also accept U+301C (WAVE DASH) */ -- 2.50.1