From c5903bd6ab73466d63dabba09da1f26c02419961 Mon Sep 17 00:00:00 2001 From: Moriyoshi Koizumi Date: Tue, 24 Feb 2009 13:22:47 +0000 Subject: [PATCH] - MFH: strictly check UTF-8 and UTF-32 validity --- ext/mbstring/libmbfl/filters/mbfilter_utf32.c | 16 +- ext/mbstring/libmbfl/filters/mbfilter_utf8.c | 9 +- ext/mbstring/libmbfl/mbfl/mbfl_consts.h | 1 + .../tests/illformed_utf_sequences.phpt | 148 ++++++++++++++++++ 4 files changed, 167 insertions(+), 7 deletions(-) create mode 100644 ext/mbstring/tests/illformed_utf_sequences.phpt diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf32.c b/ext/mbstring/libmbfl/filters/mbfilter_utf32.c index 4b0e9b9e0f..56d6dd4c97 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf32.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf32.c @@ -171,7 +171,9 @@ int mbfl_filt_conv_utf32_wchar(int c, mbfl_convert_filter *filter) CK((*filter->output_function)(0xfeff, filter->data)); } else { filter->status &= ~0xff; - CK((*filter->output_function)(n, filter->data)); + if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xd800 || n > 0xdfff)) { + CK((*filter->output_function)(n, filter->data)); + } } break; } @@ -201,7 +203,9 @@ int mbfl_filt_conv_utf32be_wchar(int c, mbfl_convert_filter *filter) } else { filter->status = 0; n = (c & 0xff) | filter->cache; - CK((*filter->output_function)(n, filter->data)); + if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xd800 || n > 0xdfff)) { + CK((*filter->output_function)(n, filter->data)); + } } return c; } @@ -211,7 +215,7 @@ int mbfl_filt_conv_utf32be_wchar(int c, mbfl_convert_filter *filter) */ int mbfl_filt_conv_wchar_utf32be(int c, mbfl_convert_filter *filter) { - if (c >= 0 && c < MBFL_WCSGROUP_UCS4MAX) { + if (c >= 0 && c < MBFL_WCSPLANE_UTF32MAX) { CK((*filter->output_function)((c >> 24) & 0xff, filter->data)); CK((*filter->output_function)((c >> 16) & 0xff, filter->data)); CK((*filter->output_function)((c >> 8) & 0xff, filter->data)); @@ -247,7 +251,9 @@ int mbfl_filt_conv_utf32le_wchar(int c, mbfl_convert_filter *filter) } else { filter->status = 0; n = ((c & 0xff) << 24) | filter->cache; - CK((*filter->output_function)(n, filter->data)); + if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xd800 || n > 0xdfff)) { + CK((*filter->output_function)(n, filter->data)); + } } return c; } @@ -257,7 +263,7 @@ int mbfl_filt_conv_utf32le_wchar(int c, mbfl_convert_filter *filter) */ int mbfl_filt_conv_wchar_utf32le(int c, mbfl_convert_filter *filter) { - if (c >= 0 && c < MBFL_WCSGROUP_UCS4MAX) { + if (c >= 0 && c < MBFL_WCSPLANE_UTF32MAX) { CK((*filter->output_function)(c & 0xff, filter->data)); CK((*filter->output_function)((c >> 8) & 0xff, filter->data)); CK((*filter->output_function)((c >> 16) & 0xff, filter->data)); diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c index 8b95897eac..20ff983e11 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c @@ -106,7 +106,8 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter) } filter->status = 0; } else if (c < 0xc0) { - switch (filter->status & 0xff) { + int status = filter->status & 0xff; + switch (status) { case 0x10: /* 2byte code 2nd char */ case 0x21: /* 3byte code 3rd char */ case 0x32: /* 4byte code 4th char */ @@ -114,7 +115,11 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter) case 0x54: /* 6byte code 6th char */ filter->status = 0; s = filter->cache | (c & 0x3f); - if (s >= 0x80) { + if ((status == 0x10 && s >= 0x80) || + (status == 0x21 && s >= 0x800 && (s < 0xd800 || s > 0xdfff)) || + (status == 0x32 && s >= 0x10000) || + (status == 0x43 && s >= 0x200000) || + (status == 0x54 && s >= 0x4000000 && s < MBFL_WCSGROUP_UCS4MAX)) { CK((*filter->output_function)(s, filter->data)); } break; diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_consts.h b/ext/mbstring/libmbfl/mbfl/mbfl_consts.h index f500766b49..cf4eaff1db 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_consts.h +++ b/ext/mbstring/libmbfl/mbfl/mbfl_consts.h @@ -47,6 +47,7 @@ /* wchar plane, special charactor */ #define MBFL_WCSPLANE_MASK 0xffff #define MBFL_WCSPLANE_UCS2MAX 0x00010000 +#define MBFL_WCSPLANE_UTF32MAX 0x00110000 #define MBFL_WCSPLANE_SUPMIN 0x00010000 #define MBFL_WCSPLANE_SUPMAX 0x00200000 #define MBFL_WCSPLANE_JIS0208 0x70e10000 /* JIS HEX : 2121h - 7E7Eh */ diff --git a/ext/mbstring/tests/illformed_utf_sequences.phpt b/ext/mbstring/tests/illformed_utf_sequences.phpt new file mode 100644 index 0000000000..a462cd0745 --- /dev/null +++ b/ext/mbstring/tests/illformed_utf_sequences.phpt @@ -0,0 +1,148 @@ +--TEST-- +Unicode standard conformance test (ill-formed UTF sequences.) +--SKIPIF-- + +--FILE-- +> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), "UCS-4BE", "UTF-8"); +} +var_dump(bin2hex($out)); + +echo "UTF-32 code range\n"; +var_dump(bin2hex(mb_convert_encoding("\x00\x11\x00\x00", "UCS-4BE", "UTF-32BE"))); +var_dump(bin2hex(mb_convert_encoding("\x00\x10\xff\xff", "UCS-4BE", "UTF-32BE"))); +var_dump(bin2hex(mb_convert_encoding("\x00\x00\x11\x00", "UCS-4BE", "UTF-32LE"))); +var_dump(bin2hex(mb_convert_encoding("\xff\xff\x10\x00", "UCS-4BE", "UTF-32LE"))); +var_dump(bin2hex(mb_convert_encoding("\x00\x11\x00\x00", "UCS-4BE", "UTF-32"))); +var_dump(bin2hex(mb_convert_encoding("\x00\x10\xff\xff", "UCS-4BE", "UTF-32"))); +var_dump(bin2hex(mb_convert_encoding("\x00\x00\xfe\xff\x00\x11\x00\x00", "UCS-4BE", "UTF-32"))); +var_dump(bin2hex(mb_convert_encoding("\x00\x00\xfe\xff\x00\x10\xff\xff", "UCS-4BE", "UTF-32"))); +var_dump(bin2hex(mb_convert_encoding("\xff\xfe\x00\x00\x00\x00\x11\x00", "UCS-4BE", "UTF-32"))); +var_dump(bin2hex(mb_convert_encoding("\xff\xfe\x00\x00\xff\xff\x10\x00", "UCS-4BE", "UTF-32"))); + +echo "UTF-32 and surrogates area\n"; +$out = ''; +for ($i = 0xd7ff; $i <= 0xe000; ++$i) { + $out .= mb_convert_encoding(pack('C4', $i >> 24, ($i >> 16) & 0xff, ($i >> 8) & 0xff, $i & 0xff), "UCS-4BE", "UTF-32BE"); +} +var_dump(bin2hex($out)); + +$out = ''; +for ($i = 0xd7ff; $i <= 0xe000; ++$i) { + $out .= mb_convert_encoding(pack('C4', $i & 0xff, ($i >> 8) & 0xff, ($i >> 16) & 0xff, ($i >> 24) & 0xff), "UCS-4BE", "UTF-32LE"); +} +var_dump(bin2hex($out)); + +$out = ''; +for ($i = 0xd7ff; $i <= 0xe000; ++$i) { + $out .= mb_convert_encoding(pack('C4', $i >> 24, ($i >> 16) & 0xff, ($i >> 8) & 0xff, $i & 0xff), "UCS-4BE", "UTF-32"); +} +var_dump(bin2hex($out)); + +$out = ''; +for ($i = 0xd7ff; $i <= 0xe000; ++$i) { + $out .= mb_convert_encoding("\x00\x00\xfe\xff". pack('C4', $i >> 24, ($i >> 16) & 0xff, ($i >> 8) & 0xff, $i & 0xff), "UCS-4BE", "UTF-32"); +} +var_dump(bin2hex(str_replace("\x00\x00\xfe\xff", "", $out))); + + +$out = ''; +for ($i = 0xd7ff; $i <= 0xe000; ++$i) { + $out .= mb_convert_encoding("\xff\xfe\x00\x00". pack('C4', $i & 0xff, ($i >> 8) & 0xff, ($i >> 16) & 0xff, ($i >> 24) & 0xff), "UCS-4BE", "UTF-32"); +} +var_dump(bin2hex(str_replace("\x00\x00\xfe\xff", "", $out))); +?> +--EXPECT-- +UTF-8 redundancy +string(24) "000000310000003200000033" +string(24) "000000410000004200000043" +string(0) "" +string(0) "" +string(0) "" +string(0) "" +string(0) "" +string(0) "" +string(0) "" +string(0) "" +string(0) "" +string(0) "" +string(24) "000000a2000000a3000000a5" +string(0) "" +string(0) "" +string(0) "" +string(0) "" +string(0) "" +string(8) "00000080" +string(8) "000007ff" +string(0) "" +string(8) "00000800" +string(8) "0000ffff" +string(0) "" +string(8) "00010000" +string(8) "001fffff" +string(0) "" +string(8) "00200000" +string(8) "03ffffff" +string(0) "" +string(8) "04000000" +string(8) "6fffffff" +string(0) "" +UTF-8 and surrogates area +string(16) "0000d7ff0000e000" +UTF-32 code range +string(0) "" +string(8) "0010ffff" +string(0) "" +string(8) "0010ffff" +string(0) "" +string(8) "0010ffff" +string(8) "0000feff" +string(16) "0000feff0010ffff" +string(8) "0000feff" +string(16) "0000feff0010ffff" +UTF-32 and surrogates area +string(16) "0000d7ff0000e000" +string(16) "0000d7ff0000e000" +string(16) "0000d7ff0000e000" +string(16) "0000d7ff0000e000" +string(16) "0000d7ff0000e000" -- 2.40.0