From: Alex Dowad
Date: Sat, 14 Nov 2020 18:47:31 +0000 (+0200)
Subject: 0x5C is not a backslash in Shift-JIS-2004
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=315d48b4340f79731882b7f87422801a065475b8;p=php

0x5C is not a backslash in Shift-JIS-2004

Shift-JIS-2004 is an extension of Shift-JIS, which uses 0x5C for the yen sign. Therefore, it is not correct to convert ASCII 0x5C (backslash) to Shift-JIS-2004 0x5C (yen sign). JIS X 0208 does have a backslash, so we can convert ASCII backslash to SJIS-2004 backslash instead.

From time immemorial, there has been confusion around the treatment of 0x5C bytes on systems using legacy Japanese encodings. JIS X 0201 specified that 0x5C means a yen sign, and thus fonts on Japanese systems, including early versions of Windows, displayed a 0x5C byte as a yen sign. This meant that when ASCII text files were displayed on such systems, what were meant to be backslashes would appear as yen signs. Japanese C programmers could write character escapes using yen signs, and C compilers built on the assumption that the input was ASCII would interpret these escapes as desired. Likewise for shell scripts. Et cetera, et cetera...

Therefore, if the input to `mb_convert_encoding` is (for example) a C program, and after converting to Shift-JIS-2004, the user wishes to feed the output into a C compiler, *then* perhaps ASCII 0x5C should be mapped to SJIS 0x5C. However, this scenario is ridiculous and will never happen.

A more realistic scenario might be: an article written in SJIS-2004 has embedded Windows file paths (like 'C:\Program Files'), with yen signs used as path separators. If we convert SJIS-2004 0x5C to ASCII 0x5C, then the path separators will be 'fixed' by the conversion.

But for general written texts, it is much better to convert backslashes to... backslashes, and yen signs to yen signs.
--- diff --git a/ext/mbstring/libmbfl/filters/unicode_table_jis2004.h b/ext/mbstring/libmbfl/filters/unicode_table_jis2004.h index 01afcc1f2d..c8bc433d34 100644 --- a/ext/mbstring/libmbfl/filters/unicode_table_jis2004.h +++ b/ext/mbstring/libmbfl/filters/unicode_table_jis2004.h @@ -1608,7 +1608,7 @@ static const unsigned short ucs_a1_jisx0213_table[] = { // 0x0000 - 0x045f 0x0040,0x0041,0x0042,0x0043,0x0044,0x0045,0x0046,0x0047, 0x0048,0x0049,0x004A,0x004B,0x004C,0x004D,0x004E,0x004F, 0x0050,0x0051,0x0052,0x0053,0x0054,0x0055,0x0056,0x0057, -0x0058,0x0059,0x005A,0x005B,0x005C,0x005D,0x005E,0x005F, +0x0058,0x0059,0x005A,0x005B,0x2140,0x005D,0x005E,0x005F, 0x0060,0x0061,0x0062,0x0063,0x0064,0x0065,0x0066,0x0067, 0x0068,0x0069,0x006A,0x006B,0x006C,0x006D,0x006E,0x006F, 0x0070,0x0071,0x0072,0x0073,0x0074,0x0075,0x0076,0x0077, diff --git a/ext/mbstring/tests/sjis2004_encoding.phpt b/ext/mbstring/tests/sjis2004_encoding.phpt index cf9ef7a91a..3278e85dfe 100644 --- a/ext/mbstring/tests/sjis2004_encoding.phpt +++ b/ext/mbstring/tests/sjis2004_encoding.phpt @@ -37,7 +37,10 @@ while ($line = fgets($fp, 256)) { } } $fromUnicode["\x00\x7E"] = "\x7E"; /* Not reversible; SJIS 0x7E -> U+203E */ -$fromUnicode["\x00\x5C"] = "\x5C"; /* Not reversible; SJIS 0x5C -> U+00A5 */ + +/* U+005C is backslash, Shift-JIS 0x815F is REVERSE SOLIDUS + * (ie. a fancy way to say "backslash") */ +$fromUnicode["\x00\x5C"] = "\x81\x5F"; testAllValidChars($validChars, 'SJIS-2004', 'UTF-32BE'); echo "SJIS-2004 verification and conversion works for all valid characters\n";