granicus.if.org Git - php/commitdiff
Leading BOM is stripped for UTF-32
author Alex Dowad <alexinbeijing@gmail.com>
Fri, 4 Sep 2020 20:21:20 +0000 (22:21 +0200)
committer Alex Dowad <alexinbeijing@gmail.com>
Wed, 11 Nov 2020 09:18:59 +0000 (11:18 +0200)
For consistency with UTF-16 and UCS-4.

Also, do some code cleanup.

ext/mbstring/libmbfl/filters/mbfilter_utf32.c
ext/mbstring/tests/illformed_utf_sequences.phpt

index 51051a4db9194bda466b09510e1c3df8d202c688..a9a7903b5db7a031c88061ee4031d01d9db08033 100644 (file)
@@ -129,106 +129,53 @@ const struct mbfl_convert_vtbl vtbl_wchar_utf32le = {
 
 #define CK(statement)  do { if ((statement) < 0) return (-1); } while (0)
 
-/*
- * UTF-32 => wchar
- */
-int mbfl_filt_conv_utf32_wchar(int c, mbfl_convert_filter *filter)
+static int emit_char_if_valid(int n, mbfl_convert_filter *filter)
 {
-       int n, endian;
+       if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xD800 || n > 0xDFFF)) {
+               CK((*filter->output_function)(n, filter->data));
+       } else {
+               n = (n & MBFL_WCSGROUP_MASK) | MBFL_WCSGROUP_THROUGH;
+               CK((*filter->output_function)(n, filter->data));
+       }
+       return 0;
+}
 
-       endian = filter->status & 0xff00;
-       switch (filter->status & 0xff) {
-       case 0:
-               if (endian) {
-                       n = c & 0xff;
-               } else {
-                       n = (c & 0xffu) << 24;
-               }
-               filter->cache = n;
-               filter->status++;
-               break;
-       case 1:
-               if (endian) {
-                       n = (c & 0xff) << 8;
-               } else {
-                       n = (c & 0xff) << 16;
-               }
-               filter->cache |= n;
-               filter->status++;
-               break;
-       case 2:
-               if (endian) {
-                       n = (c & 0xff) << 16;
-               } else {
-                       n = (c & 0xff) << 8;
-               }
-               filter->cache |= n;
+int mbfl_filt_conv_utf32_wchar(int c, mbfl_convert_filter *filter)
+{
+       if (filter->status < 3) {
+               filter->cache = (filter->cache << 8) | (c & 0xFF);
                filter->status++;
-               break;
-       default:
-               if (endian) {
-                       n = (c & 0xffu) << 24;
-               } else {
-                       n = c & 0xff;
-               }
-               n |= filter->cache;
-               if ((n & 0xffff) == 0 && ((n >> 16) & 0xffff) == 0xfffe) {
-                       if (endian) {
-                               filter->status = 0;             /* big-endian */
-                       } else {
-                               filter->status = 0x100;         /* little-endian */
-                       }
-                       CK((*filter->output_function)(0xfeff, filter->data));
+       } else {
+               int n = ((unsigned int)filter->cache << 8) | (c & 0xFF);
+               filter->cache = filter->status = 0;
+
+               if (n == 0xFFFE0000) {
+                       /* Found a little-endian byte order mark */
+                       filter->filter_function = mbfl_filt_conv_utf32le_wchar;
                } else {
-                       filter->status &= ~0xff;
-                       if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xd800 || n > 0xdfff)) {
-                               CK((*filter->output_function)(n, filter->data));
-                       } else {
-                               n = (n & MBFL_WCSGROUP_MASK) | MBFL_WCSGROUP_THROUGH;
-                               CK((*filter->output_function)(n, filter->data));
+                       filter->filter_function = mbfl_filt_conv_utf32be_wchar;
+                       if (n != 0xFEFF) {
+                               CK(emit_char_if_valid(n, filter));
                        }
                }
-               break;
        }
 
        return c;
 }
 
-/*
- * UTF-32BE => wchar
- */
 int mbfl_filt_conv_utf32be_wchar(int c, mbfl_convert_filter *filter)
 {
-       int n;
-
-       if (filter->status == 0) {
-               filter->status = 1;
-               n = (c & 0xffu) << 24;
-               filter->cache = n;
-       } else if (filter->status == 1) {
-               filter->status = 2;
-               n = (c & 0xff) << 16;
-               filter->cache |= n;
-       } else if (filter->status == 2) {
-               filter->status = 3;
-               n = (c & 0xff) << 8;
-               filter->cache |= n;
+       if (filter->status < 3) {
+               filter->cache = (filter->cache << 8) | (c & 0xFF);
+               filter->status++;
        } else {
-               filter->status = 0;
-               n = (c & 0xff) | filter->cache;
-               if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xd800 || n > 0xdfff)) {
-                       CK((*filter->output_function)(n, filter->data));
-               } else {
-                       n = (n & MBFL_WCSGROUP_MASK) | MBFL_WCSGROUP_THROUGH;
-                       CK((*filter->output_function)(n, filter->data));
-               }
+               int n = ((unsigned int)filter->cache << 8) | (c & 0xFF);
+               filter->cache = filter->status = 0;
+               CK(emit_char_if_valid(n, filter));
        }
        return c;
 }
 
-/*
- * wchar => UTF-32BE
- */
 int mbfl_filt_conv_wchar_utf32be(int c, mbfl_convert_filter *filter)
 {
        if (c >= 0 && c < MBFL_WCSPLANE_UTF32MAX) {
@@ -243,41 +190,19 @@ int mbfl_filt_conv_wchar_utf32be(int c, mbfl_convert_filter *filter)
        return c;
 }
 
-/*
- * UTF-32LE => wchar
- */
 int mbfl_filt_conv_utf32le_wchar(int c, mbfl_convert_filter *filter)
 {
-       int n;
-
-       if (filter->status == 0) {
-               filter->status = 1;
-               n = (c & 0xff);
-               filter->cache = n;
-       } else if (filter->status == 1) {
-               filter->status = 2;
-               n = (c & 0xff) << 8;
-               filter->cache |= n;
-       } else if (filter->status == 2) {
-               filter->status = 3;
-               n = (c & 0xff) << 16;
-               filter->cache |= n;
+       if (filter->status < 3) {
+               filter->cache |= ((c & 0xFFU) << (8 * filter->status));
+               filter->status++;
        } else {
-               filter->status = 0;
-               n = ((c & 0xffu) << 24) | filter->cache;
-               if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xd800 || n > 0xdfff)) {
-                       CK((*filter->output_function)(n, filter->data));
-               } else {
-                       n = (n & MBFL_WCSGROUP_MASK) | MBFL_WCSGROUP_THROUGH;
-                       CK((*filter->output_function)(n, filter->data));
-               }
+               int n = ((c & 0xFFU) << 24) | filter->cache;
+               filter->cache = filter->status = 0;
+               CK(emit_char_if_valid(n, filter));
        }
        return c;
 }
 
-/*
- * wchar => UTF-32LE
- */
 int mbfl_filt_conv_wchar_utf32le(int c, mbfl_convert_filter *filter)
 {
        if (c >= 0 && c < MBFL_WCSPLANE_UTF32MAX) {
@@ -294,7 +219,7 @@ int mbfl_filt_conv_wchar_utf32le(int c, mbfl_convert_filter *filter)
 
 static int mbfl_filt_conv_utf32_wchar_flush(mbfl_convert_filter *filter)
 {
-       if (filter->status & 0xF) {
+       if (filter->status) {
                /* Input string was truncated */
                CK((*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data));
        }
@@ -303,6 +228,5 @@ static int mbfl_filt_conv_utf32_wchar_flush(mbfl_convert_filter *filter)
                (*filter->flush_function)(filter->data);
        }
 
-       filter->status = filter->cache = 0;
        return 0;
 }
index 3b7d431a0ed1b66759a2fc75451639c50040064c..8f3b97e8ec90d91418c4a51cc555619fa2252cf1 100644 (file)
@@ -4,12 +4,9 @@ Unicode standard conformance test (ill-formed UTF sequences.)
 <?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
 --FILE--
 <?php
-function chk_enc($str, $n, $enc = "UTF-8", $with_bom = false) {
+function chk_enc($str, $n, $enc = "UTF-8") {
     $src = bin2hex(mb_convert_encoding($str, "UCS-4BE", $enc));
     $dst = str_repeat("0000fffd", $n);
-    if ($with_bom) {
-        $dst = "0000feff" . $dst;
-    }
     if ($dst == $src) {
         return false;
     } else {
@@ -129,7 +126,7 @@ $out = '';
 $cnt = 0;
 for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
     $s = chk_enc("\x00\x00\xfe\xff". pack('C4', $i >> 24, ($i >> 16) & 0xff, ($i >> 8) & 0xff, $i & 0xff),
-                 1, "UTF-32", true);
+                 1, "UTF-32");
     if ($s === false) {
         $cnt++;
     } else {
@@ -137,13 +134,13 @@ for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
     }
 }
 var_dump($cnt);
-var_dump(str_replace("0000feff","",$out));
+var_dump($out);
 
 $out = '';
 $cnt = 0;
 for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
     $s = chk_enc("\xff\xfe\x00\x00". pack('C4', $i & 0xff, ($i >> 8) & 0xff, ($i >> 16) & 0xff, ($i >> 24) & 0xff),
-                 1, "UTF-32", true);
+                 1, "UTF-32");
     if ($s === false) {
         $cnt++;
     } else {
@@ -151,7 +148,7 @@ for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
     }
 }
 var_dump($cnt);
-var_dump(str_replace("0000feff","",$out));
+var_dump($out);
 
 ?>
 --EXPECT--
@@ -199,10 +196,10 @@ bool(false)
 string(8) "0010ffff"
 bool(false)
 string(8) "0010ffff"
-string(16) "0000feff0000fffd"
-string(16) "0000feff0010ffff"
-string(16) "0000feff0000fffd"
-string(16) "0000feff0010ffff"
+string(8) "0000fffd"
+string(8) "0010ffff"
+string(8) "0000fffd"
+string(8) "0010ffff"
 UTF-32 and surrogates area
 int(2048)
 string(16) "0000d7ff0000e000"