Add identify filter for UTF-32{,BE,LE}

author Alex Dowad <alexinbeijing@gmail.com>

Sun, 6 Sep 2020 12:42:55 +0000 (14:42 +0200)

committer Alex Dowad <alexinbeijing@gmail.com>

Tue, 27 Oct 2020 08:19:01 +0000 (10:19 +0200)
author Alex Dowad <alexinbeijing@gmail.com>
Sun, 6 Sep 2020 12:42:55 +0000 (14:42 +0200)
committer Alex Dowad <alexinbeijing@gmail.com>
Tue, 27 Oct 2020 08:19:01 +0000 (10:19 +0200)
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf32.c b/ext/mbstring/libmbfl/filters/mbfilter_utf32.c

index 2566ff54764ba2ac72bbc1cb81d595609eee7138..ddd942d6e9c754a558ddfc47b1fa4906202f9a2f 100644 (file)
--- a/ext/mbstring/libmbfl/filters/mbfilter_utf32.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_utf32.c
@@ -30,6 +30,10 @@
  #include "mbfilter.h"
  #include "mbfilter_utf32.h"
  
+static int mbfl_filt_ident_utf32(int c, mbfl_identify_filter *filter); /* byte-order unknown: looks for an LE byte-order mark first */
+static int mbfl_filt_ident_utf32le(int c, mbfl_identify_filter *filter); /* little-endian state machine, one byte per call */
+static int mbfl_filt_ident_utf32be(int c, mbfl_identify_filter *filter); /* big-endian state machine, one byte per call */
+
  static const char *mbfl_encoding_utf32_aliases[] = {"utf32", NULL};
  
  const mbfl_encoding mbfl_encoding_utf32 = {
@@ -65,6 +69,24 @@ const mbfl_encoding mbfl_encoding_utf32le = {
         &vtbl_wchar_utf32le
  };
  
+const struct mbfl_identify_vtbl vtbl_identify_utf32 = { /* identify-filter entry for UTF-32 without a stated byte order */
+       mbfl_no_encoding_utf32, /* presumably the encoding id this entry is looked up by — confirm against mbfl_identify_vtbl */
+       mbfl_filt_ident_common_ctor, /* presumably the shared constructor; it is defined outside this file — confirm */
+       mbfl_filt_ident_utf32 /* per-byte check; begins by looking for a little-endian byte-order mark */
+};
+
+const struct mbfl_identify_vtbl vtbl_identify_utf32be = { /* identify-filter entry for big-endian UTF-32 */
+       mbfl_no_encoding_utf32be, /* presumably the encoding id this entry is looked up by — confirm against mbfl_identify_vtbl */
+       mbfl_filt_ident_common_ctor, /* presumably the shared constructor; it is defined outside this file — confirm */
+       mbfl_filt_ident_utf32be /* per-byte check: flags values above 0x10FFFF and surrogates */
+};
+
+const struct mbfl_identify_vtbl vtbl_identify_utf32le = { /* identify-filter entry for little-endian UTF-32 */
+       mbfl_no_encoding_utf32le, /* presumably the encoding id this entry is looked up by — confirm against mbfl_identify_vtbl */
+       mbfl_filt_ident_common_ctor, /* presumably the shared constructor; it is defined outside this file — confirm */
+       mbfl_filt_ident_utf32le /* per-byte check for the little-endian byte layout */
+};
+
  const struct mbfl_convert_vtbl vtbl_utf32_wchar = {
         mbfl_no_encoding_utf32,
         mbfl_no_encoding_wchar,
@@ -289,3 +311,128 @@ int mbfl_filt_conv_wchar_utf32le(int c, mbfl_convert_filter *filter)
  
         return c;
  }
+
+/* Identify filter for UTF-32 whose byte order has not been stated.
+ *
+ * The largest valid codepoint is 0x10FFFF; we don't want values above that.
+ * Neither do we want to see surrogates. Those checks live in the LE and BE
+ * filters; this function only decides which of the two should do the work.
+ *
+ * It watches the first four bytes for the little-endian byte-order mark
+ * (FF FE 00 00), using `filter->status` as the count of BOM bytes matched so
+ * far. On a full match, all further input goes to the UTF-32LE filter. On any
+ * mismatch the stream is treated as big-endian: the BOM-prefix bytes which
+ * were swallowed while deciding are replayed into the UTF-32BE filter,
+ * followed by the current byte.
+ *
+ * c      - next input byte
+ * filter - identify filter state; its filter_function is replaced once the
+ *          byte order has been decided
+ * Returns `c`. */
+static int mbfl_filt_ident_utf32(int c, mbfl_identify_filter *filter)
+{
+       switch (filter->status) {
+       case 0: /* 1st byte */
+               if (c == 0xff) {
+                       filter->status = 1;
+                       return c;
+               }
+               /* Nothing swallowed yet, status is already 0 for the BE filter */
+               filter->filter_function = mbfl_filt_ident_utf32be;
+               break;
+
+       case 1: /* 2nd byte */
+               if (c == 0xfe) {
+                       filter->status = 2;
+                       return c;
+               }
+               /* `status` was our BOM-match counter; rewind it so that the BE
+                * filter sees the replayed 0xff as the 1st byte of a character */
+               filter->status = 0;
+               filter->filter_function = mbfl_filt_ident_utf32be;
+               (filter->filter_function)(0xff, filter);
+               break;
+
+       case 2: /* 3rd byte */
+               if (c == 0) {
+                       filter->status = 3;
+                       return c;
+               }
+               filter->status = 0; /* rewind before replaying, as above */
+               filter->filter_function = mbfl_filt_ident_utf32be;
+               (filter->filter_function)(0xff, filter);
+               (filter->filter_function)(0xfe, filter);
+               break;
+
+       case 3: /* 4th byte */
+               if (c == 0) {
+                       /* We found a little-endian byte-order mark! */
+                       filter->status = 0;
+                       filter->filter_function = mbfl_filt_ident_utf32le;
+                       return c;
+               }
+               filter->status = 0; /* rewind before replaying, as above */
+               filter->filter_function = mbfl_filt_ident_utf32be;
+               (filter->filter_function)(0xff, filter);
+               (filter->filter_function)(0xfe, filter);
+               (filter->filter_function)(0, filter);
+               break;
+       }
+
+       /* Byte order decided as big-endian; hand over the current byte too */
+       return (filter->filter_function)(c, filter);
+}
+
+/* Identify filter for little-endian UTF-32, fed one byte per call.
+ *
+ * `filter->status` tracks the position inside the current 4-byte character
+ * (least significant byte first). `filter->flag` is set to 1 as soon as a
+ * character cannot be valid UTF-32: a value above 0x10FFFF, or a surrogate
+ * (0xD800..0xDFFF). The flag is never cleared here.
+ *
+ * c      - next input byte
+ * filter - identify filter state (status and flag are updated)
+ * Returns `c`. */
+static int mbfl_filt_ident_utf32le(int c, mbfl_identify_filter *filter)
+{
+       switch (filter->status) {
+       case 0: /* 1st byte: lowest 8 bits, any value is fine */
+               filter->status = 1;
+               break;
+
+       case 1: /* 2nd byte */
+               if (c >= 0xD8 && c <= 0xDF) {
+                       filter->status = 4; /* might be surrogate if we are in BMP */
+               } else {
+                       filter->status = 2;
+               }
+               break;
+
+       case 2: /* 3rd byte */
+               if (c > 0x10) {
+                       filter->flag = 1; /* too big */
+               }
+               filter->status = 3;
+               break;
+
+       case 3: /* 4th byte */
+               if (c) {
+                       filter->flag = 1; /* too big */
+               }
+               filter->status = 0;
+               break;
+
+       case 4: /* 3rd byte, previous byte looked like surrogate */
+               if (!c || c > 0x10) {
+                       filter->flag = 1; /* 0: it's a surrogate; above 0x10: too big, same limit as case 2 */
+               }
+               filter->status = 3;
+       }
+       return c;
+}
+
+/* Identify filter for big-endian UTF-32, fed one byte per call.
+ *
+ * `filter->status` is the position inside the current 4-byte character
+ * (most significant byte first); position 4 stands for "3rd byte of a
+ * character outside the BMP". `filter->flag` is set to 1 when the character
+ * would exceed 0x10FFFF or fall in the surrogate range.
+ *
+ * c      - next input byte
+ * filter - identify filter state (status and flag are updated)
+ * Returns `c`. */
+static int mbfl_filt_ident_utf32be(int c, mbfl_identify_filter *filter)
+{
+       const int pos = filter->status;
+
+       if (pos == 0) {
+               /* 1st byte: must be zero, or the value is too big */
+               if (c != 0) {
+                       filter->flag = 1;
+               }
+               filter->status = 1;
+       } else if (pos == 1) {
+               /* 2nd byte: two independent questions, range and plane */
+               if (c > 0x10) {
+                       filter->flag = 1; /* too big */
+               }
+               filter->status = (c != 0) ? 4 /* not in the BMP */ : 2;
+       } else if (pos == 2) {
+               /* 3rd byte of a BMP character: reject the surrogate range */
+               if (c >= 0xD8 && c <= 0xDF) {
+                       filter->flag = 1;
+               }
+               filter->status = 3;
+       } else if (pos == 3) {
+               /* 4th byte: any value; next byte starts a new character */
+               filter->status = 0;
+       } else if (pos == 4) {
+               /* 3rd byte, not in BMP: nothing to check */
+               filter->status = 3;
+       }
+
+       return c;
+}
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf32.h b/ext/mbstring/libmbfl/filters/mbfilter_utf32.h

index 14b9fa4823245a8c154fdafc10416f08890dbd21..de1235e4519ac8439ddc1ea24c838a40262c46e9 100644 (file)
--- a/ext/mbstring/libmbfl/filters/mbfilter_utf32.h
+++ b/ext/mbstring/libmbfl/filters/mbfilter_utf32.h
@@ -33,6 +33,9 @@
  extern const mbfl_encoding mbfl_encoding_utf32;
  extern const mbfl_encoding mbfl_encoding_utf32be;
  extern const mbfl_encoding mbfl_encoding_utf32le;
+extern const struct mbfl_identify_vtbl vtbl_identify_utf32; /* identify filter: UTF-32, byte order taken from an LE byte-order mark if present */
+extern const struct mbfl_identify_vtbl vtbl_identify_utf32be; /* identify filter: big-endian UTF-32 */
+extern const struct mbfl_identify_vtbl vtbl_identify_utf32le; /* identify filter: little-endian UTF-32 */
  extern const struct mbfl_convert_vtbl vtbl_utf32_wchar;
  extern const struct mbfl_convert_vtbl vtbl_wchar_utf32;
  extern const struct mbfl_convert_vtbl vtbl_utf32be_wchar;
diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_ident.c b/ext/mbstring/libmbfl/mbfl/mbfl_ident.c

index d2d991249f202d9643a926b6b7cc34b5dac27274..a48762ee3d943bd66ead09b961580d0986c02882 100644 (file)
--- a/ext/mbstring/libmbfl/mbfl/mbfl_ident.c
+++ b/ext/mbstring/libmbfl/mbfl/mbfl_ident.c
@@ -169,6 +169,9 @@ static const struct mbfl_identify_vtbl *mbfl_identify_filter_list[] = {
         &vtbl_identify_ucs2,
         &vtbl_identify_ucs2be,
         &vtbl_identify_ucs2le,
+       &vtbl_identify_utf32,
+       &vtbl_identify_utf32be,
+       &vtbl_identify_utf32le,
         &vtbl_identify_false,
         NULL
  };
author	Alex Dowad <alexinbeijing@gmail.com>
	Sun, 6 Sep 2020 12:42:55 +0000 (14:42 +0200)
committer	Alex Dowad <alexinbeijing@gmail.com>
	Tue, 27 Oct 2020 08:19:01 +0000 (10:19 +0200)
ext/mbstring/libmbfl/filters/mbfilter_utf32.c		patch \| blob \| history
ext/mbstring/libmbfl/filters/mbfilter_utf32.h		patch \| blob \| history
ext/mbstring/libmbfl/mbfl/mbfl_ident.c		patch \| blob \| history