Add identify filter for UTF-16, UTF-16LE, UTF-16BE

author Alex Dowad <alexinbeijing@gmail.com>

Sun, 6 Sep 2020 08:32:58 +0000 (10:32 +0200)

committer Alex Dowad <alexinbeijing@gmail.com>

Tue, 13 Oct 2020 18:26:13 +0000 (20:26 +0200)
author Alex Dowad <alexinbeijing@gmail.com>
Sun, 6 Sep 2020 08:32:58 +0000 (10:32 +0200)
committer Alex Dowad <alexinbeijing@gmail.com>
Tue, 13 Oct 2020 18:26:13 +0000 (20:26 +0200)
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c

index 54236769a23e8af69139e9f29f181867d0ac75ee..a44ea371b3c2f015e248a35a86b358a8876f0c33 100644 (file)
--- a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c
@@ -30,6 +30,10 @@
  #include "mbfilter.h"
  #include "mbfilter_utf16.h"
  
+static int mbfl_filt_ident_utf16(int c, mbfl_identify_filter *filter);
+static int mbfl_filt_ident_utf16le(int c, mbfl_identify_filter *filter);
+static int mbfl_filt_ident_utf16be(int c, mbfl_identify_filter *filter);
+
  static const char *mbfl_encoding_utf16_aliases[] = {"utf16", NULL};
  
  const mbfl_encoding mbfl_encoding_utf16 = {
@@ -65,6 +69,24 @@ const mbfl_encoding mbfl_encoding_utf16le = {
         &vtbl_wchar_utf16le
  };
  
+const struct mbfl_identify_vtbl vtbl_identify_utf16 = { /* identify-filter table for UTF-16 with no byte order given */
+       mbfl_no_encoding_utf16,
+       mbfl_filt_ident_common_ctor, /* presumably the constructor shared by the identify filters -- defined elsewhere */
+       mbfl_filt_ident_utf16 /* per-byte function; begins by checking for a byte-order mark */
+};
+
+const struct mbfl_identify_vtbl vtbl_identify_utf16le = { /* identify-filter table for little-endian UTF-16 */
+       mbfl_no_encoding_utf16le,
+       mbfl_filt_ident_common_ctor, /* presumably the constructor shared by the identify filters -- defined elsewhere */
+       mbfl_filt_ident_utf16le /* per-byte function; checks surrogates on the 2nd byte of each code unit */
+};
+
+const struct mbfl_identify_vtbl vtbl_identify_utf16be = { /* identify-filter table for big-endian UTF-16 */
+       mbfl_no_encoding_utf16be,
+       mbfl_filt_ident_common_ctor, /* presumably the constructor shared by the identify filters -- defined elsewhere */
+       mbfl_filt_ident_utf16be /* per-byte function; checks surrogates on the 1st byte of each code unit */
+};
+
  const struct mbfl_convert_vtbl vtbl_utf16_wchar = {
         mbfl_no_encoding_utf16,
         mbfl_no_encoding_wchar,
@@ -320,3 +342,108 @@ int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter)
  
         return c;
  }
+
+static int mbfl_filt_ident_utf16(int c, mbfl_identify_filter *filter)
+{ /* Identify filter for UTF-16 of unspecified byte order: inspects the leading bytes for a byte-order mark, then installs the BE or LE filter */
+       if (filter->status == 0) { /* nothing buffered yet */
+               if (c >= 0xfe) { /* could be the first half of a byte-order mark; buffer it until the next byte arrives */
+                       filter->status = c;
+               } else {
+                       /* no byte-order mark at beginning of input; assume UTF-16BE and pass this byte straight to it */
+                       filter->filter_function = mbfl_filt_ident_utf16be;
+                       return (filter->filter_function)(c, filter);
+               }
+       } else { /* status holds the byte buffered by the previous call */
+               unsigned short n = (filter->status << 8) | c; /* buffered byte in the high 8 bits, current byte below it */
+               filter->status = 0; /* clear the buffer so the installed filter sees status 0 */
+
+               if (n == 0xfeff) {
+                       /* it was a big-endian byte-order mark; the mark itself is not forwarded to the BE filter */
+                       filter->filter_function = mbfl_filt_ident_utf16be;
+               } else if (n == 0xfffe) {
+                       /* it was a little-endian byte-order mark; the mark itself is not forwarded to the LE filter */
+                       filter->filter_function = mbfl_filt_ident_utf16le;
+               } else {
+                       /* it wasn't a byte-order mark; fall back to UTF-16BE and replay both bytes through it */
+                       filter->filter_function = mbfl_filt_ident_utf16be;
+                       (filter->filter_function)(n >> 8, filter); /* the buffered first byte; its return value is discarded */
+                       return (filter->filter_function)(c, filter);
+               }
+       }
+       return c; /* reached when a byte was buffered or a byte-order mark was consumed */
+}
+
+static int mbfl_filt_ident_utf16le(int c, mbfl_identify_filter *filter)
+{ /* Identify filter for UTF-16LE: filter->status tracks the byte position within a code unit or surrogate pair; filter->flag is set on a bad surrogate sequence */
+       switch (filter->status) {
+       case 0: /* 1st byte: low-order byte of a code unit, any value is accepted */
+               filter->status = 1;
+               break;
+
+       case 1: /* 2nd byte: high-order byte, which shows whether this code unit is a surrogate */
+               if ((c & 0xfc) == 0xd8) {
+                       /* 0xd8..0xdb: looks like a surrogate pair, so a second code unit must follow */
+                       filter->status = 2;
+               } else if ((c & 0xfc) == 0xdc) {
+                       /* 0xdc..0xdf: this is wrong; the second part of the surrogate pair has come first */
+                       filter->flag = 1; /* NOTE(review): status stays at 1 here; presumably irrelevant once flag is set -- confirm */
+               } else {
+                       filter->status = 0; /* Just an ordinary 2-byte character */
+               }
+               break;
+
+       case 2: /* 3rd byte: low-order byte of the second code unit, any value is accepted */
+               filter->status = 3;
+               break;
+
+       case 3: /* 4th byte: high-order byte of the second code unit */
+               if ((c & 0xfc) == 0xdc) {
+                       filter->status = 0; /* 0xdc..0xdf: second half of the pair is valid; start over */
+               } else {
+                       filter->flag = 1; /* Surrogate pair wrongly encoded */
+               }
+               break;
+       }
+
+       return c; /* the input byte is returned unchanged on every path */
+}
+
+static int mbfl_filt_ident_utf16be(int c, mbfl_identify_filter *filter)
+{ /* Identify filter for UTF-16BE: filter->status tracks the byte position within a code unit or surrogate pair; filter->flag is set on a bad surrogate sequence */
+       switch (filter->status) {
+       case 0: /* 1st byte: high-order byte, which shows whether this code unit is a surrogate */
+               if ((c & 0xfc) == 0xd8) {
+                       /* 0xd8..0xdb: looks like a surrogate pair, so a second code unit must follow */
+                       filter->status = 2;
+               } else if ((c & 0xfc) == 0xdc) {
+                       /* 0xdc..0xdf: this is wrong; the second part of the surrogate pair has come first */
+                       filter->flag = 1; /* NOTE(review): status stays at 0, so the next byte is again read as a 1st byte -- confirm intended */
+               } else {
+                       /* Just an ordinary 2-byte character */
+                       filter->status = 1;
+               }
+               break;
+
+       case 1: /* 2nd byte, not surrogate pair; any value is accepted */
+               filter->status = 0;
+               break;
+
+       case 2: /* 2nd byte, surrogate pair; any value is accepted */
+               filter->status = 3;
+               break;
+
+       case 3: /* 3rd byte, surrogate pair: high-order byte of the second code unit */
+               if ((c & 0xfc) == 0xdc) {
+                       filter->status = 4; /* 0xdc..0xdf: second half of the pair is valid */
+               } else {
+                       filter->flag = 1; /* Surrogate pair wrongly encoded */
+               }
+               break;
+
+       case 4: /* 4th byte, surrogate pair; any value is accepted */
+               filter->status = 0;
+               break;
+       }
+
+       return c; /* the input byte is returned unchanged on every path */
+}
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf16.h b/ext/mbstring/libmbfl/filters/mbfilter_utf16.h

index 4aa88619c7ce66e2a839702453142bcfdd68a9de..601779add41d68742f41e475169dda93508b0d0e 100644 (file)
--- a/ext/mbstring/libmbfl/filters/mbfilter_utf16.h
+++ b/ext/mbstring/libmbfl/filters/mbfilter_utf16.h
@@ -33,6 +33,9 @@
  extern const mbfl_encoding mbfl_encoding_utf16;
  extern const mbfl_encoding mbfl_encoding_utf16be;
  extern const mbfl_encoding mbfl_encoding_utf16le;
+extern const struct mbfl_identify_vtbl vtbl_identify_utf16;
+extern const struct mbfl_identify_vtbl vtbl_identify_utf16le;
+extern const struct mbfl_identify_vtbl vtbl_identify_utf16be;
  extern const struct mbfl_convert_vtbl vtbl_utf16_wchar;
  extern const struct mbfl_convert_vtbl vtbl_wchar_utf16;
  extern const struct mbfl_convert_vtbl vtbl_utf16be_wchar;
diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter.c b/ext/mbstring/libmbfl/mbfl/mbfilter.c

index 4bcaa7069480175c17045435c18bf74f9d65d9c3..d0e5494387860f7f5db7e4bb28995b5a26b6b85f 100644 (file)
--- a/ext/mbstring/libmbfl/mbfl/mbfilter.c
+++ b/ext/mbstring/libmbfl/mbfl/mbfilter.c
@@ -396,18 +396,6 @@ const mbfl_encoding *mbfl_encoding_detector_judge(mbfl_encoding_detector *identd
                         }
                         n--;
                 }
-
-               /* fallback judge */
-               if (!encoding) {
-                       n = identd->filter_list_size - 1;
-                       while (n >= 0) {
-                               filter = identd->filter_list[n];
-                               if (!filter->flag) {
-                                       encoding = filter->encoding;
-                               }
-                               n--;
-                       }
-               }
         }
  
         return encoding;
diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_ident.c b/ext/mbstring/libmbfl/mbfl/mbfl_ident.c

index d18de0445d85b7f5f26847be8c0c769270edd365..59bcdc84ca5355987a81e947029df2a053549230 100644 (file)
--- a/ext/mbstring/libmbfl/mbfl/mbfl_ident.c
+++ b/ext/mbstring/libmbfl/mbfl/mbfl_ident.c
@@ -160,6 +160,9 @@ static const struct mbfl_identify_vtbl *mbfl_identify_filter_list[] = {
         &vtbl_identify_cp50222,
         &vtbl_identify_gb18030,
         &vtbl_identify_7bit,
+       &vtbl_identify_utf16,
+       &vtbl_identify_utf16le,
+       &vtbl_identify_utf16be,
         &vtbl_identify_false,
         NULL
  };
diff --git a/ext/mbstring/tests/mb_convert_encoding_failed_detection.phpt b/ext/mbstring/tests/mb_convert_encoding_failed_detection.phpt

index b3c8ba10bba6371d5efc9a3563155203acd39be9..45efe28766e78018194bccfbcc1213a624a2302b 100644 (file)
--- a/ext/mbstring/tests/mb_convert_encoding_failed_detection.phpt
+++ b/ext/mbstring/tests/mb_convert_encoding_failed_detection.phpt
@@ -1,5 +1,7 @@
  --TEST--
  mb_convert_encoding() when encoding detection fails
+--INI--
+mbstring.strict_detection=1
  --FILE--
  <?php
author	Alex Dowad <alexinbeijing@gmail.com>
	Sun, 6 Sep 2020 08:32:58 +0000 (10:32 +0200)
committer	Alex Dowad <alexinbeijing@gmail.com>
	Tue, 13 Oct 2020 18:26:13 +0000 (20:26 +0200)
ext/mbstring/libmbfl/filters/mbfilter_utf16.c		patch \| blob \| history
ext/mbstring/libmbfl/filters/mbfilter_utf16.h		patch \| blob \| history
ext/mbstring/libmbfl/mbfl/mbfilter.c		patch \| blob \| history
ext/mbstring/libmbfl/mbfl/mbfl_ident.c		patch \| blob \| history
ext/mbstring/tests/mb_convert_encoding_failed_detection.phpt		patch \| blob \| history