Improve error handling for UTF-16{,BE,LE}

author Alex Dowad <alexinbeijing@gmail.com>
Wed, 14 Oct 2020 18:25:19 +0000 (20:25 +0200)
committer Alex Dowad <alexinbeijing@gmail.com>
Tue, 27 Oct 2020 08:19:01 +0000 (10:19 +0200)

diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c

index 58c63eb54c3cd0ffbfca77ffc4d2c120a52b6184..c23a43269516e4ed63e4838b183203688892f7f6 100644 (file)
--- a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c
@@ -150,111 +150,89 @@ const struct mbfl_convert_vtbl vtbl_wchar_utf16le = {
  
  #define CK(statement)  do { if ((statement) < 0) return (-1); } while (0)
  
-/*
- * UTF-16 => wchar
- */
  int mbfl_filt_conv_utf16_wchar(int c, mbfl_convert_filter *filter)
  {
-       int n, endian;
-
-       endian = filter->status & 0xff00;
-       switch (filter->status & 0x0f) {
-       case 0:
-               if (endian) {
-                       n = c & 0xff;
-               } else {
-                       n = (c & 0xff) << 8;
-               }
-               filter->cache |= n;
-               filter->status++;
-               break;
-       default:
-               if (endian) {
-                       n = (c & 0xff) << 8;
+       /* Start with the assumption that the string is big-endian;
+        * If we find a little-endian BOM, then we will change that assumption */
+       if (filter->status == 0) {
+               filter->cache = c & 0xFF;
+               filter->status = 1;
+       } else {
+               int n = (filter->cache << 8) | (c & 0xFF);
+               if (n == 0xFFFE) {
+                       /* Switch to little-endian mode */
+                       filter->filter_function = mbfl_filt_conv_utf16le_wchar;
+                       filter->cache = filter->status = 0;
                 } else {
-                       n = c & 0xff;
-               }
-               n |= filter->cache & 0xffff;
-               filter->status &= ~0x0f;
-               if (n >= 0xd800 && n < 0xdc00) {
-                       filter->cache = ((n & 0x3ff) << 16) + 0x400000;
-               } else if (n >= 0xdc00 && n < 0xe000) {
-                       n &= 0x3ff;
-                       n |= (filter->cache & 0xfff0000) >> 6;
-                       filter->cache = 0;
-                       if (n >= MBFL_WCSPLANE_SUPMIN && n < MBFL_WCSPLANE_SUPMAX) {
-                               CK((*filter->output_function)(n, filter->data));
-                       } else {                /* illegal character */
-                               n &= MBFL_WCSGROUP_MASK;
-                               n |= MBFL_WCSGROUP_THROUGH;
+                       filter->filter_function = mbfl_filt_conv_utf16be_wchar;
+                       if (n >= 0xD800 && n <= 0xDBFF) {
+                               filter->cache = n & 0x3FF; /* Pick out 10 data bits */
+                               filter->status = 2;
+                               return c;
+                       } else if (n >= 0xDC00 && n <= 0xDFFF) {
+                               /* This is wrong; second part of surrogate pair has come first */
+                               CK((*filter->output_function)(n | MBFL_WCSGROUP_THROUGH, filter->data));
+                       } else if (n != 0xFEFF) {
                                 CK((*filter->output_function)(n, filter->data));
                         }
-               } else {
-                       int is_first = filter->status & 0x10;
-                       filter->cache = 0;
-                       filter->status |= 0x10;
-                       if (!is_first) {
-                               if (n == 0xfffe) {
-                                       if (endian) {
-                                               filter->status &= ~0x100;               /* big-endian */
-                                       } else {
-                                               filter->status |= 0x100;                /* little-endian */
-                                       }
-                                       break;
-                               } else if (n == 0xfeff) {
-                                       break;
-                               }
-                       }
-                       CK((*filter->output_function)(n, filter->data));
+                       filter->cache = filter->status = 0;
                 }
-               break;
         }
  
         return c;
  }
  
-/*
- * UTF-16BE => wchar
- */
  int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter)
  {
         int n;
  
         switch (filter->status) {
-       case 0:
+       case 0: /* First byte */
+               filter->cache = c & 0xFF;
                 filter->status = 1;
-               n = (c & 0xff) << 8;
-               filter->cache |= n;
                 break;
-       default:
-               filter->status = 0;
-               n = (filter->cache & 0xff00) | (c & 0xff);
-               if (n >= 0xd800 && n < 0xdc00) {
-                       filter->cache = ((n & 0x3ff) << 16) + 0x400000;
-               } else if (n >= 0xdc00 && n < 0xe000) {
-                       n &= 0x3ff;
-                       n |= (filter->cache & 0xfff0000) >> 6;
-                       filter->cache = 0;
-                       if (n >= MBFL_WCSPLANE_SUPMIN && n < MBFL_WCSPLANE_SUPMAX) {
-                               CK((*filter->output_function)(n, filter->data));
-                       } else {                /* illegal character */
-                               n &= MBFL_WCSGROUP_MASK;
-                               n |= MBFL_WCSGROUP_THROUGH;
-                               CK((*filter->output_function)(n, filter->data));
-                       }
+
+       case 1: /* Second byte */
+               n = (filter->cache << 8) | (c & 0xFF);
+               if (n >= 0xD800 && n <= 0xDBFF) {
+                       filter->cache = n & 0x3FF; /* Pick out 10 data bits */
+                       filter->status = 2;
+               } else if (n >= 0xDC00 && n <= 0xDFFF) {
+                       /* This is wrong; second part of surrogate pair has come first */
+                       CK((*filter->output_function)(n | MBFL_WCSGROUP_THROUGH, filter->data));
+                       filter->status = 0;
                 } else {
-                       filter->cache = 0;
                         CK((*filter->output_function)(n, filter->data));
+                       filter->status = 0;
                 }
                 break;
+
+       case 2: /* Second part of surrogate, first byte */
+               filter->cache = (filter->cache << 8) | (c & 0xFF);
+               filter->status = 3;
+               break;
+
+       case 3: /* Second part of surrogate, second byte */
+               n = ((filter->cache & 0xFF) << 8) | (c & 0xFF);
+               if (n >= 0xD800 && n <= 0xDBFF) {
+                       /* Wrong; that's the first half of a surrogate pair, not the second */
+                       CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
+                       filter->cache = n & 0x3FF;
+                       filter->status = 2;
+               } else if (n >= 0xDC00 && n <= 0xDFFF) {
+                       n = ((filter->cache & 0x3FF00) << 2) + (n & 0x3FF) + 0x10000;
+                       CK((*filter->output_function)(n, filter->data));
+                       filter->status = 0;
+               } else {
+                       CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
+                       CK((*filter->output_function)(n, filter->data));
+                       filter->status = 0;
+               }
         }
  
         return c;
  }
  
-/*
- * wchar => UTF-16BE
- */
  int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter)
  {
         int n;
@@ -276,11 +254,10 @@ int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter)
         return c;
  }
  
-/*
- * UTF-16LE => wchar
- */
  int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
  {
+       int n;
+
         switch (filter->status) {
         case 0:
                 filter->cache = c & 0xff;
@@ -296,12 +273,12 @@ int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
                         /* This is wrong; the second part of the surrogate pair has come first
                          * Flag it with `MBFL_WCSGROUP_THROUGH`; the following filter will handle
                          * the error */
-                       int n = (filter->cache + ((c & 0xff) << 8)) | MBFL_WCSGROUP_THROUGH;
-                       filter->status = 0;
+                       n = (filter->cache + ((c & 0xff) << 8)) | MBFL_WCSGROUP_THROUGH;
                         CK((*filter->output_function)(n, filter->data));
-               } else {
                         filter->status = 0;
+               } else {
                         CK((*filter->output_function)(filter->cache + ((c & 0xff) << 8), filter->data));
+                       filter->status = 0;
                 }
                 break;
  
@@ -311,18 +288,26 @@ int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
                 break;
  
         case 3:
-               filter->status = 0;
-               int n = filter->cache + ((c & 0x3) << 8) + 0x10000;
-               CK((*filter->output_function)(n, filter->data));
+               n = (filter->cache & 0xFF) | ((c & 0xFF) << 8);
+               if (n >= 0xD800 && n <= 0xDBFF) {
+                       CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
+                       filter->cache = n & 0x3FF;
+                       filter->status = 2;
+               } else if (n >= 0xDC00 && n <= 0xDFFF) {
+                       n = filter->cache + ((c & 0x3) << 8) + 0x10000;
+                       CK((*filter->output_function)(n, filter->data));
+                       filter->status = 0;
+               } else {
+                       CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
+                       CK((*filter->output_function)(n, filter->data));
+                       filter->status = 0;
+               }
                 break;
         }
  
         return c;
  }
  
-/*
- * wchar => UTF-16LE
- */
  int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter)
  {
         int n;
@@ -350,7 +335,7 @@ static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter)
         int cache = filter->cache;
         filter->status = filter->cache = 0;
  
-       if (status & 0xF) {
+       if (status) {
                 /* Input string was truncated */
                 CK((*filter->output_function)(cache | MBFL_WCSGROUP_THROUGH, filter->data));
         }
diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter.c b/ext/mbstring/libmbfl/mbfl/mbfilter.c

index d0e5494387860f7f5db7e4bb28995b5a26b6b85f..68cb39fa900f9ddaa72ad7959a9f104304701e90 100644 (file)
--- a/ext/mbstring/libmbfl/mbfl/mbfilter.c
+++ b/ext/mbstring/libmbfl/mbfl/mbfilter.c
@@ -200,7 +200,6 @@ size_t mbfl_buffer_converter_feed(mbfl_buffer_converter *convd, mbfl_string *str
         size_t n;
         unsigned char *p;
         mbfl_convert_filter *filter;
-       int (*filter_function)(int c, mbfl_convert_filter *filter);
  
         ZEND_ASSERT(convd);
         ZEND_ASSERT(string);
@@ -212,9 +211,8 @@ size_t mbfl_buffer_converter_feed(mbfl_buffer_converter *convd, mbfl_string *str
  
         filter = convd->filter1;
         if (filter != NULL) {
-               filter_function = filter->filter_function;
                 while (n > 0) {
-                       if ((*filter_function)(*p++, filter) < 0) {
+                       if ((*filter->filter_function)(*p++, filter) < 0) {
                                 return p - string->val;
                         }
                         n--;
author	Alex Dowad <alexinbeijing@gmail.com>
	Wed, 14 Oct 2020 18:25:19 +0000 (20:25 +0200)
committer	Alex Dowad <alexinbeijing@gmail.com>
	Tue, 27 Oct 2020 08:19:01 +0000 (10:19 +0200)
ext/mbstring/libmbfl/filters/mbfilter_utf16.c		patch \| blob \| history
ext/mbstring/libmbfl/mbfl/mbfilter.c		patch \| blob \| history