]> granicus.if.org Git - php/commitdiff
Add identify filter for UTF-16, UTF-16LE, UTF-16BE
authorAlex Dowad <alexinbeijing@gmail.com>
Sun, 6 Sep 2020 08:32:58 +0000 (10:32 +0200)
committerAlex Dowad <alexinbeijing@gmail.com>
Tue, 13 Oct 2020 18:26:13 +0000 (20:26 +0200)
There was one faulty test in the suite which previously passed only because UTF-16 had no
identify filter. Once the identify filter was added, the problem with the test was exposed.

ext/mbstring/libmbfl/filters/mbfilter_utf16.c
ext/mbstring/libmbfl/filters/mbfilter_utf16.h
ext/mbstring/libmbfl/mbfl/mbfilter.c
ext/mbstring/libmbfl/mbfl/mbfl_ident.c
ext/mbstring/tests/mb_convert_encoding_failed_detection.phpt

index 54236769a23e8af69139e9f29f181867d0ac75ee..a44ea371b3c2f015e248a35a86b358a8876f0c33 100644 (file)
 #include "mbfilter.h"
 #include "mbfilter_utf16.h"
 
+static int mbfl_filt_ident_utf16(int c, mbfl_identify_filter *filter); /* forward declarations: the vtbl_identify_utf16* tables below refer to these before their bodies appear */
+static int mbfl_filt_ident_utf16le(int c, mbfl_identify_filter *filter); /* little-endian variant */
+static int mbfl_filt_ident_utf16be(int c, mbfl_identify_filter *filter); /* big-endian variant; also used by the plain UTF-16 filter when no byte-order mark is seen */
+
 static const char *mbfl_encoding_utf16_aliases[] = {"utf16", NULL};
 
 const mbfl_encoding mbfl_encoding_utf16 = {
@@ -65,6 +69,24 @@ const mbfl_encoding mbfl_encoding_utf16le = {
        &vtbl_wchar_utf16le
 };
 
+const struct mbfl_identify_vtbl vtbl_identify_utf16 = { /* identify-filter table for UTF-16 with no fixed byte order */
+       mbfl_no_encoding_utf16, /* encoding identifier this table is registered for */
+       mbfl_filt_ident_common_ctor, /* presumably the shared identify-filter constructor; defined outside this file, confirm */
+       mbfl_filt_ident_utf16 /* per-byte function: checks for a byte-order mark, then hands off to the LE or BE function */
+};
+
+const struct mbfl_identify_vtbl vtbl_identify_utf16le = { /* identify-filter table for little-endian UTF-16 */
+       mbfl_no_encoding_utf16le, /* encoding identifier this table is registered for */
+       mbfl_filt_ident_common_ctor, /* presumably the shared identify-filter constructor; defined outside this file, confirm */
+       mbfl_filt_ident_utf16le /* per-byte function: validates surrogate-pair ordering, high-order byte second */
+};
+
+const struct mbfl_identify_vtbl vtbl_identify_utf16be = { /* identify-filter table for big-endian UTF-16 */
+       mbfl_no_encoding_utf16be, /* encoding identifier this table is registered for */
+       mbfl_filt_ident_common_ctor, /* presumably the shared identify-filter constructor; defined outside this file, confirm */
+       mbfl_filt_ident_utf16be /* per-byte function: validates surrogate-pair ordering, high-order byte first */
+};
+
 const struct mbfl_convert_vtbl vtbl_utf16_wchar = {
        mbfl_no_encoding_utf16,
        mbfl_no_encoding_wchar,
@@ -320,3 +342,108 @@ int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter)
 
        return c;
 }
+
+/*
+ * Identify filter for UTF-16 input whose byte order is not known in advance.
+ *
+ * Called once per input byte `c`. The first two bytes are inspected for a
+ * byte-order mark:
+ *   - FE FF: later bytes go to mbfl_filt_ident_utf16be
+ *   - FF FE: later bytes go to mbfl_filt_ident_utf16le
+ *   - anything else: big-endian is assumed and the bytes seen so far are
+ *     passed through mbfl_filt_ident_utf16be
+ * The hand-off is done by replacing filter->filter_function, so this function
+ * only ever sees the first one or two bytes. A byte-order mark itself is not
+ * forwarded to the chosen function.
+ *
+ * While waiting for the second byte, filter->status holds the first byte
+ * (0xfe or above); otherwise it is 0.
+ *
+ * Returns `c`, or the return value of the function it delegated to.
+ */
+static int mbfl_filt_ident_utf16(int c, mbfl_identify_filter *filter)
+{
+       unsigned short pair;
+
+       if (filter->status != 0) {
+               /* Second byte: join it with the first byte that was held back */
+               pair = (filter->status << 8) | c;
+               filter->status = 0;
+
+               switch (pair) {
+               case 0xfeff: /* big-endian byte-order mark */
+                       filter->filter_function = mbfl_filt_ident_utf16be;
+                       return c;
+
+               case 0xfffe: /* little-endian byte-order mark */
+                       filter->filter_function = mbfl_filt_ident_utf16le;
+                       return c;
+
+               default: /* not a byte-order mark; replay both bytes as UTF-16BE */
+                       filter->filter_function = mbfl_filt_ident_utf16be;
+                       (filter->filter_function)(pair >> 8, filter);
+                       return (filter->filter_function)(c, filter);
+               }
+       }
+
+       if (c < 0xfe) {
+               /* This byte cannot begin a byte-order mark; assume UTF-16BE */
+               filter->filter_function = mbfl_filt_ident_utf16be;
+               return (filter->filter_function)(c, filter);
+       }
+
+       /* Might be the start of a byte-order mark; keep it until the next byte arrives */
+       filter->status = c;
+       return c;
+}
+
+/*
+ * Identify filter for little-endian UTF-16, called once per input byte `c`.
+ *
+ * filter->status records which byte is expected next:
+ *   0 - 1st byte of a code unit (low-order byte; nothing to check)
+ *   1 - 2nd byte (high-order byte; shows whether the unit is a surrogate)
+ *   2 - 3rd byte (low-order byte of the trailing surrogate)
+ *   3 - 4th byte (high-order byte; must mark a trailing surrogate)
+ *
+ * filter->flag is set to 1 when a trailing surrogate arrives first, or when a
+ * leading surrogate is not followed by a trailing one. In both of those cases
+ * filter->status is left as it was.
+ *
+ * Always returns `c`.
+ */
+static int mbfl_filt_ident_utf16le(int c, mbfl_identify_filter *filter)
+{
+       int top_bits = c & 0xfc;
+
+       if (filter->status == 0) {
+               /* 1st byte */
+               filter->status = 1;
+       } else if (filter->status == 1) {
+               /* 2nd byte */
+               if (top_bits == 0xd8) {
+                       /* Leading surrogate: two more bytes must follow */
+                       filter->status = 2;
+               } else if (top_bits == 0xdc) {
+                       /* Trailing surrogate with no leading surrogate before it */
+                       filter->flag = 1;
+               } else {
+                       /* Ordinary 2-byte character is complete */
+                       filter->status = 0;
+               }
+       } else if (filter->status == 2) {
+               /* 3rd byte */
+               filter->status = 3;
+       } else if (filter->status == 3) {
+               /* 4th byte */
+               if (top_bits == 0xdc) {
+                       filter->status = 0;
+               } else {
+                       /* Leading surrogate was not followed by a trailing one */
+                       filter->flag = 1;
+               }
+       }
+
+       return c;
+}
+
+/*
+ * Identify filter for big-endian UTF-16, called once per input byte `c`.
+ *
+ * filter->status records which byte is expected next:
+ *   0 - 1st byte of a code unit (high-order byte; shows whether the unit is
+ *       a surrogate)
+ *   1 - 2nd byte of an ordinary 2-byte character
+ *   2 - 2nd byte of a leading surrogate
+ *   3 - 3rd byte of a surrogate pair (high-order byte; must mark a trailing
+ *       surrogate)
+ *   4 - 4th byte of a surrogate pair
+ *
+ * filter->flag is set to 1 when a trailing surrogate arrives first, or when a
+ * leading surrogate is not followed by a trailing one. In both of those cases
+ * filter->status is left as it was.
+ *
+ * Always returns `c`.
+ */
+static int mbfl_filt_ident_utf16be(int c, mbfl_identify_filter *filter)
+{
+       int top_bits = c & 0xfc;
+
+       if (filter->status == 0) {
+               /* 1st byte */
+               if (top_bits == 0xd8) {
+                       /* Leading surrogate: three more bytes must follow */
+                       filter->status = 2;
+               } else if (top_bits == 0xdc) {
+                       /* Trailing surrogate with no leading surrogate before it */
+                       filter->flag = 1;
+               } else {
+                       /* Ordinary 2-byte character */
+                       filter->status = 1;
+               }
+       } else if (filter->status == 1) {
+               /* 2nd byte, ordinary character is complete */
+               filter->status = 0;
+       } else if (filter->status == 2) {
+               /* 2nd byte, leading surrogate is complete */
+               filter->status = 3;
+       } else if (filter->status == 3) {
+               /* 3rd byte, must open a trailing surrogate */
+               if (top_bits == 0xdc) {
+                       filter->status = 4;
+               } else {
+                       /* Leading surrogate was not followed by a trailing one */
+                       filter->flag = 1;
+               }
+       } else if (filter->status == 4) {
+               /* 4th byte, surrogate pair is complete */
+               filter->status = 0;
+       }
+
+       return c;
+}
index 4aa88619c7ce66e2a839702453142bcfdd68a9de..601779add41d68742f41e475169dda93508b0d0e 100644 (file)
@@ -33,6 +33,9 @@
 extern const mbfl_encoding mbfl_encoding_utf16;
 extern const mbfl_encoding mbfl_encoding_utf16be;
 extern const mbfl_encoding mbfl_encoding_utf16le;
+extern const struct mbfl_identify_vtbl vtbl_identify_utf16; /* identify-filter table for UTF-16 with no fixed byte order */
+extern const struct mbfl_identify_vtbl vtbl_identify_utf16le; /* identify-filter table for little-endian UTF-16 */
+extern const struct mbfl_identify_vtbl vtbl_identify_utf16be; /* identify-filter table for big-endian UTF-16 */
 extern const struct mbfl_convert_vtbl vtbl_utf16_wchar;
 extern const struct mbfl_convert_vtbl vtbl_wchar_utf16;
 extern const struct mbfl_convert_vtbl vtbl_utf16be_wchar;
index 4bcaa7069480175c17045435c18bf74f9d65d9c3..d0e5494387860f7f5db7e4bb28995b5a26b6b85f 100644 (file)
@@ -396,18 +396,6 @@ const mbfl_encoding *mbfl_encoding_detector_judge(mbfl_encoding_detector *identd
                        }
                        n--;
                }
-
-               /* fallback judge */
-               if (!encoding) {
-                       n = identd->filter_list_size - 1;
-                       while (n >= 0) {
-                               filter = identd->filter_list[n];
-                               if (!filter->flag) {
-                                       encoding = filter->encoding;
-                               }
-                               n--;
-                       }
-               }
        }
 
        return encoding;
index d18de0445d85b7f5f26847be8c0c769270edd365..59bcdc84ca5355987a81e947029df2a053549230 100644 (file)
@@ -160,6 +160,9 @@ static const struct mbfl_identify_vtbl *mbfl_identify_filter_list[] = {
        &vtbl_identify_cp50222,
        &vtbl_identify_gb18030,
        &vtbl_identify_7bit,
+       &vtbl_identify_utf16,
+       &vtbl_identify_utf16le,
+       &vtbl_identify_utf16be,
        &vtbl_identify_false,
        NULL
 };
index b3c8ba10bba6371d5efc9a3563155203acd39be9..45efe28766e78018194bccfbcc1213a624a2302b 100644 (file)
@@ -1,5 +1,7 @@
 --TEST--
 mb_convert_encoding() when encoding detection fails
+--INI--
+mbstring.strict_detection=1
 --FILE--
 <?php