granicus.if.org Git - php/commitdiff
Improve error handling for UTF-16{,BE,LE}
author Alex Dowad <alexinbeijing@gmail.com>
Wed, 14 Oct 2020 18:25:19 +0000 (20:25 +0200)
committer Alex Dowad <alexinbeijing@gmail.com>
Tue, 27 Oct 2020 08:19:01 +0000 (10:19 +0200)
Catch various errors such as the first part of a surrogate pair not being
followed by a proper second part, the first part of a surrogate pair appearing
at the end of a string, the second part of a surrogate pair appearing out
of place, and so on.

ext/mbstring/libmbfl/filters/mbfilter_utf16.c
ext/mbstring/libmbfl/mbfl/mbfilter.c

index 58c63eb54c3cd0ffbfca77ffc4d2c120a52b6184..c23a43269516e4ed63e4838b183203688892f7f6 100644 (file)
@@ -150,111 +150,89 @@ const struct mbfl_convert_vtbl vtbl_wchar_utf16le = {
 
 #define CK(statement)  do { if ((statement) < 0) return (-1); } while (0)
 
-/*
- * UTF-16 => wchar
- */
 int mbfl_filt_conv_utf16_wchar(int c, mbfl_convert_filter *filter)
 {
-       int n, endian;
-
-       endian = filter->status & 0xff00;
-       switch (filter->status & 0x0f) {
-       case 0:
-               if (endian) {
-                       n = c & 0xff;
-               } else {
-                       n = (c & 0xff) << 8;
-               }
-               filter->cache |= n;
-               filter->status++;
-               break;
-       default:
-               if (endian) {
-                       n = (c & 0xff) << 8;
+       /* Start with the assumption that the string is big-endian;
+        * If we find a little-endian BOM, then we will change that assumption */
+       if (filter->status == 0) {
+               filter->cache = c & 0xFF;
+               filter->status = 1;
+       } else {
+               int n = (filter->cache << 8) | (c & 0xFF);
+               if (n == 0xFFFE) {
+                       /* Switch to little-endian mode */
+                       filter->filter_function = mbfl_filt_conv_utf16le_wchar;
+                       filter->cache = filter->status = 0;
                } else {
-                       n = c & 0xff;
-               }
-               n |= filter->cache & 0xffff;
-               filter->status &= ~0x0f;
-               if (n >= 0xd800 && n < 0xdc00) {
-                       filter->cache = ((n & 0x3ff) << 16) + 0x400000;
-               } else if (n >= 0xdc00 && n < 0xe000) {
-                       n &= 0x3ff;
-                       n |= (filter->cache & 0xfff0000) >> 6;
-                       filter->cache = 0;
-                       if (n >= MBFL_WCSPLANE_SUPMIN && n < MBFL_WCSPLANE_SUPMAX) {
-                               CK((*filter->output_function)(n, filter->data));
-                       } else {                /* illegal character */
-                               n &= MBFL_WCSGROUP_MASK;
-                               n |= MBFL_WCSGROUP_THROUGH;
+                       filter->filter_function = mbfl_filt_conv_utf16be_wchar;
+                       if (n >= 0xD800 && n <= 0xDBFF) {
+                               filter->cache = n & 0x3FF; /* Pick out 10 data bits */
+                               filter->status = 2;
+                               return c;
+                       } else if (n >= 0xDC00 && n <= 0xDFFF) {
+                               /* This is wrong; second part of surrogate pair has come first */
+                               CK((*filter->output_function)(n | MBFL_WCSGROUP_THROUGH, filter->data));
+                       } else if (n != 0xFEFF) {
                                CK((*filter->output_function)(n, filter->data));
                        }
-               } else {
-                       int is_first = filter->status & 0x10;
-                       filter->cache = 0;
-                       filter->status |= 0x10;
-                       if (!is_first) {
-                               if (n == 0xfffe) {
-                                       if (endian) {
-                                               filter->status &= ~0x100;               /* big-endian */
-                                       } else {
-                                               filter->status |= 0x100;                /* little-endian */
-                                       }
-                                       break;
-                               } else if (n == 0xfeff) {
-                                       break;
-                               }
-                       }
-                       CK((*filter->output_function)(n, filter->data));
+                       filter->cache = filter->status = 0;
                }
-               break;
        }
 
        return c;
 }
 
-/*
- * UTF-16BE => wchar
- */
 int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter)
 {
        int n;
 
        switch (filter->status) {
-       case 0:
+       case 0: /* First byte */
+               filter->cache = c & 0xFF;
                filter->status = 1;
-               n = (c & 0xff) << 8;
-               filter->cache |= n;
                break;
-       default:
-               filter->status = 0;
-               n = (filter->cache & 0xff00) | (c & 0xff);
-               if (n >= 0xd800 && n < 0xdc00) {
-                       filter->cache = ((n & 0x3ff) << 16) + 0x400000;
-               } else if (n >= 0xdc00 && n < 0xe000) {
-                       n &= 0x3ff;
-                       n |= (filter->cache & 0xfff0000) >> 6;
-                       filter->cache = 0;
-                       if (n >= MBFL_WCSPLANE_SUPMIN && n < MBFL_WCSPLANE_SUPMAX) {
-                               CK((*filter->output_function)(n, filter->data));
-                       } else {                /* illegal character */
-                               n &= MBFL_WCSGROUP_MASK;
-                               n |= MBFL_WCSGROUP_THROUGH;
-                               CK((*filter->output_function)(n, filter->data));
-                       }
+
+       case 1: /* Second byte */
+               n = (filter->cache << 8) | (c & 0xFF);
+               if (n >= 0xD800 && n <= 0xDBFF) {
+                       filter->cache = n & 0x3FF; /* Pick out 10 data bits */
+                       filter->status = 2;
+               } else if (n >= 0xDC00 && n <= 0xDFFF) {
+                       /* This is wrong; second part of surrogate pair has come first */
+                       CK((*filter->output_function)(n | MBFL_WCSGROUP_THROUGH, filter->data));
+                       filter->status = 0;
                } else {
-                       filter->cache = 0;
                        CK((*filter->output_function)(n, filter->data));
+                       filter->status = 0;
                }
                break;
+
+       case 2: /* Second part of surrogate, first byte */
+               filter->cache = (filter->cache << 8) | (c & 0xFF);
+               filter->status = 3;
+               break;
+
+       case 3: /* Second part of surrogate, second byte */
+               n = ((filter->cache & 0xFF) << 8) | (c & 0xFF);
+               if (n >= 0xD800 && n <= 0xDBFF) {
+                       /* Wrong; that's the first half of a surrogate pair, not the second */
+                       CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
+                       filter->cache = n & 0x3FF;
+                       filter->status = 2;
+               } else if (n >= 0xDC00 && n <= 0xDFFF) {
+                       n = ((filter->cache & 0x3FF00) << 2) + (n & 0x3FF) + 0x10000;
+                       CK((*filter->output_function)(n, filter->data));
+                       filter->status = 0;
+               } else {
+                       CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
+                       CK((*filter->output_function)(n, filter->data));
+                       filter->status = 0;
+               }
        }
 
        return c;
 }
 
-/*
- * wchar => UTF-16BE
- */
 int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter)
 {
        int n;
@@ -276,11 +254,10 @@ int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter)
        return c;
 }
 
-/*
- * UTF-16LE => wchar
- */
 int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
 {
+       int n;
+
        switch (filter->status) {
        case 0:
                filter->cache = c & 0xff;
@@ -296,12 +273,12 @@ int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
                        /* This is wrong; the second part of the surrogate pair has come first
                         * Flag it with `MBFL_WCSGROUP_THROUGH`; the following filter will handle
                         * the error */
-                       int n = (filter->cache + ((c & 0xff) << 8)) | MBFL_WCSGROUP_THROUGH;
-                       filter->status = 0;
+                       n = (filter->cache + ((c & 0xff) << 8)) | MBFL_WCSGROUP_THROUGH;
                        CK((*filter->output_function)(n, filter->data));
-               } else {
                        filter->status = 0;
+               } else {
                        CK((*filter->output_function)(filter->cache + ((c & 0xff) << 8), filter->data));
+                       filter->status = 0;
                }
                break;
 
@@ -311,18 +288,26 @@ int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
                break;
 
        case 3:
-               filter->status = 0;
-               int n = filter->cache + ((c & 0x3) << 8) + 0x10000;
-               CK((*filter->output_function)(n, filter->data));
+               n = (filter->cache & 0xFF) | ((c & 0xFF) << 8);
+               if (n >= 0xD800 && n <= 0xDBFF) {
+                       CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
+                       filter->cache = n & 0x3FF;
+                       filter->status = 2;
+               } else if (n >= 0xDC00 && n <= 0xDFFF) {
+                       n = filter->cache + ((c & 0x3) << 8) + 0x10000;
+                       CK((*filter->output_function)(n, filter->data));
+                       filter->status = 0;
+               } else {
+                       CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
+                       CK((*filter->output_function)(n, filter->data));
+                       filter->status = 0;
+               }
                break;
        }
 
        return c;
 }
 
-/*
- * wchar => UTF-16LE
- */
 int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter)
 {
        int n;
@@ -350,7 +335,7 @@ static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter)
        int cache = filter->cache;
        filter->status = filter->cache = 0;
 
-       if (status & 0xF) {
+       if (status) {
                /* Input string was truncated */
                CK((*filter->output_function)(cache | MBFL_WCSGROUP_THROUGH, filter->data));
        }
index d0e5494387860f7f5db7e4bb28995b5a26b6b85f..68cb39fa900f9ddaa72ad7959a9f104304701e90 100644 (file)
@@ -200,7 +200,6 @@ size_t mbfl_buffer_converter_feed(mbfl_buffer_converter *convd, mbfl_string *str
        size_t n;
        unsigned char *p;
        mbfl_convert_filter *filter;
-       int (*filter_function)(int c, mbfl_convert_filter *filter);
 
        ZEND_ASSERT(convd);
        ZEND_ASSERT(string);
@@ -212,9 +211,8 @@ size_t mbfl_buffer_converter_feed(mbfl_buffer_converter *convd, mbfl_string *str
 
        filter = convd->filter1;
        if (filter != NULL) {
-               filter_function = filter->filter_function;
                while (n > 0) {
-                       if ((*filter_function)(*p++, filter) < 0) {
+                       if ((*filter->filter_function)(*p++, filter) < 0) {
                                return p - string->val;
                        }
                        n--;