From 6dd75478d55e57f0c0f9876b000e54b114a12b52 Mon Sep 17 00:00:00 2001
From: Alex Dowad
Date: Fri, 4 Sep 2020 22:21:20 +0200
Subject: [PATCH] Leading BOM is stripped for UTF-32

For consistency with UTF-16 and UCS-4.

Also, do some code cleanup.
---
 ext/mbstring/libmbfl/filters/mbfilter_utf32.c | 148 +++++------------------
 .../tests/illformed_utf_sequences.phpt        |  21 ++-
 2 files changed, 45 insertions(+), 124 deletions(-)

diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf32.c b/ext/mbstring/libmbfl/filters/mbfilter_utf32.c
index 51051a4db9..a9a7903b5d 100644
--- a/ext/mbstring/libmbfl/filters/mbfilter_utf32.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_utf32.c
@@ -129,106 +129,53 @@ const struct mbfl_convert_vtbl vtbl_wchar_utf32le = {
 
 #define CK(statement)	do { if ((statement) < 0) return (-1); } while (0)
 
-/*
- * UTF-32 => wchar
- */
-int mbfl_filt_conv_utf32_wchar(int c, mbfl_convert_filter *filter)
+static int emit_char_if_valid(int n, mbfl_convert_filter *filter)
 {
-	int n, endian;
+	if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xD800 || n > 0xDFFF)) {
+		CK((*filter->output_function)(n, filter->data));
+	} else {
+		n = (n & MBFL_WCSGROUP_MASK) | MBFL_WCSGROUP_THROUGH;
+		CK((*filter->output_function)(n, filter->data));
+	}
+	return 0;
+}
 
-	endian = filter->status & 0xff00;
-	switch (filter->status & 0xff) {
-	case 0:
-		if (endian) {
-			n = c & 0xff;
-		} else {
-			n = (c & 0xffu) << 24;
-		}
-		filter->cache = n;
-		filter->status++;
-		break;
-	case 1:
-		if (endian) {
-			n = (c & 0xff) << 8;
-		} else {
-			n = (c & 0xff) << 16;
-		}
-		filter->cache |= n;
-		filter->status++;
-		break;
-	case 2:
-		if (endian) {
-			n = (c & 0xff) << 16;
-		} else {
-			n = (c & 0xff) << 8;
-		}
-		filter->cache |= n;
+int mbfl_filt_conv_utf32_wchar(int c, mbfl_convert_filter *filter)
+{
+	if (filter->status < 3) {
+		filter->cache = (filter->cache << 8) | (c & 0xFF);
 		filter->status++;
-		break;
-	default:
-		if (endian) {
-			n = (c & 0xffu) << 24;
-		} else {
-			n = c & 0xff;
-		}
-		n |= filter->cache;
-		if ((n & 0xffff) == 0 && ((n >> 16) & 0xffff) == 0xfffe) {
-			if (endian) {
-				filter->status = 0; /* big-endian */
-			} else {
-				filter->status = 0x100; /* little-endian */
-			}
-			CK((*filter->output_function)(0xfeff, filter->data));
+	} else {
+		int n = ((unsigned int)filter->cache << 8) | (c & 0xFF);
+		filter->cache = filter->status = 0;
+
+		if (n == 0xFFFE0000) {
+			/* Found a little-endian byte order mark */
+			filter->filter_function = mbfl_filt_conv_utf32le_wchar;
 		} else {
-			filter->status &= ~0xff;
-			if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xd800 || n > 0xdfff)) {
-				CK((*filter->output_function)(n, filter->data));
-			} else {
-				n = (n & MBFL_WCSGROUP_MASK) | MBFL_WCSGROUP_THROUGH;
-				CK((*filter->output_function)(n, filter->data));
+			filter->filter_function = mbfl_filt_conv_utf32be_wchar;
+			if (n != 0xFEFF) {
+				CK(emit_char_if_valid(n, filter));
 			}
 		}
-		break;
 	}
 
 	return c;
 }
 
-/*
- * UTF-32BE => wchar
- */
 int mbfl_filt_conv_utf32be_wchar(int c, mbfl_convert_filter *filter)
 {
-	int n;
-
-	if (filter->status == 0) {
-		filter->status = 1;
-		n = (c & 0xffu) << 24;
-		filter->cache = n;
-	} else if (filter->status == 1) {
-		filter->status = 2;
-		n = (c & 0xff) << 16;
-		filter->cache |= n;
-	} else if (filter->status == 2) {
-		filter->status = 3;
-		n = (c & 0xff) << 8;
-		filter->cache |= n;
+	if (filter->status < 3) {
+		filter->cache = (filter->cache << 8) | (c & 0xFF);
+		filter->status++;
 	} else {
-		filter->status = 0;
-		n = (c & 0xff) | filter->cache;
-		if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xd800 || n > 0xdfff)) {
-			CK((*filter->output_function)(n, filter->data));
-		} else {
-			n = (n & MBFL_WCSGROUP_MASK) | MBFL_WCSGROUP_THROUGH;
-			CK((*filter->output_function)(n, filter->data));
-		}
+		int n = ((unsigned int)filter->cache << 8) | (c & 0xFF);
+		filter->cache = filter->status = 0;
+		CK(emit_char_if_valid(n, filter));
 	}
 	return c;
 }
 
-/*
- * wchar => UTF-32BE
- */
 int mbfl_filt_conv_wchar_utf32be(int c, mbfl_convert_filter *filter)
 {
 	if (c >= 0 && c < MBFL_WCSPLANE_UTF32MAX) {
@@ -243,41 +190,19 @@ int mbfl_filt_conv_wchar_utf32be(int c, mbfl_convert_filter *filter)
 	return c;
 }
 
-/*
- * UTF-32LE => wchar
- */
 int mbfl_filt_conv_utf32le_wchar(int c, mbfl_convert_filter *filter)
 {
-	int n;
-
-	if (filter->status == 0) {
-		filter->status = 1;
-		n = (c & 0xff);
-		filter->cache = n;
-	} else if (filter->status == 1) {
-		filter->status = 2;
-		n = (c & 0xff) << 8;
-		filter->cache |= n;
-	} else if (filter->status == 2) {
-		filter->status = 3;
-		n = (c & 0xff) << 16;
-		filter->cache |= n;
+	if (filter->status < 3) {
+		filter->cache |= ((c & 0xFFU) << (8 * filter->status));
+		filter->status++;
 	} else {
-		filter->status = 0;
-		n = ((c & 0xffu) << 24) | filter->cache;
-		if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xd800 || n > 0xdfff)) {
-			CK((*filter->output_function)(n, filter->data));
-		} else {
-			n = (n & MBFL_WCSGROUP_MASK) | MBFL_WCSGROUP_THROUGH;
-			CK((*filter->output_function)(n, filter->data));
-		}
+		int n = ((c & 0xFFU) << 24) | filter->cache;
+		filter->cache = filter->status = 0;
+		CK(emit_char_if_valid(n, filter));
 	}
 	return c;
 }
 
-/*
- * wchar => UTF-32LE
- */
 int mbfl_filt_conv_wchar_utf32le(int c, mbfl_convert_filter *filter)
 {
 	if (c >= 0 && c < MBFL_WCSPLANE_UTF32MAX) {
@@ -294,7 +219,7 @@ int mbfl_filt_conv_wchar_utf32le(int c, mbfl_convert_filter *filter)
 
 static int mbfl_filt_conv_utf32_wchar_flush(mbfl_convert_filter *filter)
 {
-	if (filter->status & 0xF) {
+	if (filter->status) {
 		/* Input string was truncated */
 		CK((*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data));
 	}
@@ -303,6 +228,5 @@ static int mbfl_filt_conv_utf32_wchar_flush(mbfl_convert_filter *filter)
 		(*filter->flush_function)(filter->data);
 	}
 
-	filter->status = filter->cache = 0;
 	return 0;
 }
diff --git a/ext/mbstring/tests/illformed_utf_sequences.phpt b/ext/mbstring/tests/illformed_utf_sequences.phpt
index 3b7d431a0e..8f3b97e8ec 100644
--- a/ext/mbstring/tests/illformed_utf_sequences.phpt
+++ b/ext/mbstring/tests/illformed_utf_sequences.phpt
@@ -4,12 +4,9 @@ Unicode standard conformance test (ill-formed UTF sequences.)
 --FILE--
 > 24, ($i >> 16) & 0xff, ($i >> 8) & 0xff, $i & 0xff),
-		1, "UTF-32", true);
+		1, "UTF-32");
 	if ($s === false) {
 		$cnt++;
 	} else {
@@ -137,13 +134,13 @@ for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
 	}
 }
 var_dump($cnt);
-var_dump(str_replace("0000feff","",$out));
+var_dump($out);
 
 $out = '';
 $cnt = 0;
 for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
 	$s = chk_enc("\xff\xfe\x00\x00". pack('C4', $i & 0xff, ($i >> 8) & 0xff, ($i >> 16) & 0xff, ($i >> 24) & 0xff),
-		1, "UTF-32", true);
+		1, "UTF-32");
 	if ($s === false) {
 		$cnt++;
 	} else {
@@ -151,7 +148,7 @@ for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
 	}
 }
 var_dump($cnt);
-var_dump(str_replace("0000feff","",$out));
+var_dump($out);
 
 ?>
 --EXPECT--
@@ -199,10 +196,10 @@ bool(false)
 string(8) "0010ffff"
 bool(false)
 string(8) "0010ffff"
-string(16) "0000feff0000fffd"
-string(16) "0000feff0010ffff"
-string(16) "0000feff0000fffd"
-string(16) "0000feff0010ffff"
+string(8) "0000fffd"
+string(8) "0010ffff"
+string(8) "0000fffd"
+string(8) "0010ffff"
 UTF-32 and surrogates area
 int(2048)
 string(16) "0000d7ff0000e000"
-- 
2.40.0