From 97beecc2517252334fd13c3babeebb1bb4fc48e8 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Sun, 6 Sep 2020 10:32:58 +0200 Subject: [PATCH] Add identify filter for UTF-16, UTF-16LE, UTF-16BE There was one faulty test in the suite which only passed before because UTF-16 had no identify filter. After this was fixed, it exposed the problem with the test. --- ext/mbstring/libmbfl/filters/mbfilter_utf16.c | 127 ++++++++++++++++++ ext/mbstring/libmbfl/filters/mbfilter_utf16.h | 3 + ext/mbstring/libmbfl/mbfl/mbfilter.c | 12 -- ext/mbstring/libmbfl/mbfl/mbfl_ident.c | 3 + .../mb_convert_encoding_failed_detection.phpt | 2 + 5 files changed, 135 insertions(+), 12 deletions(-) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c index 54236769a2..a44ea371b3 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c @@ -30,6 +30,10 @@ #include "mbfilter.h" #include "mbfilter_utf16.h" +static int mbfl_filt_ident_utf16(int c, mbfl_identify_filter *filter); +static int mbfl_filt_ident_utf16le(int c, mbfl_identify_filter *filter); +static int mbfl_filt_ident_utf16be(int c, mbfl_identify_filter *filter); + static const char *mbfl_encoding_utf16_aliases[] = {"utf16", NULL}; const mbfl_encoding mbfl_encoding_utf16 = { @@ -65,6 +69,24 @@ const mbfl_encoding mbfl_encoding_utf16le = { &vtbl_wchar_utf16le }; +const struct mbfl_identify_vtbl vtbl_identify_utf16 = { + mbfl_no_encoding_utf16, + mbfl_filt_ident_common_ctor, + mbfl_filt_ident_utf16 +}; + +const struct mbfl_identify_vtbl vtbl_identify_utf16le = { + mbfl_no_encoding_utf16le, + mbfl_filt_ident_common_ctor, + mbfl_filt_ident_utf16le +}; + +const struct mbfl_identify_vtbl vtbl_identify_utf16be = { + mbfl_no_encoding_utf16be, + mbfl_filt_ident_common_ctor, + mbfl_filt_ident_utf16be +}; + const struct mbfl_convert_vtbl vtbl_utf16_wchar = { mbfl_no_encoding_utf16, mbfl_no_encoding_wchar, @@ -320,3 +342,108 @@ int 
mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter) return c; } + +static int mbfl_filt_ident_utf16(int c, mbfl_identify_filter *filter) +{ + if (filter->status == 0) { + if (c >= 0xfe) { /* could be the first byte of a byte-order mark; hold it in status until the next byte arrives */ + filter->status = c; + } else { + /* no byte-order mark at beginning of input; assume UTF-16BE */ + filter->filter_function = mbfl_filt_ident_utf16be; + return (filter->filter_function)(c, filter); + } + } else { + unsigned short n = (filter->status << 8) | c; + filter->status = 0; + + if (n == 0xfeff) { + /* it was a big-endian byte-order mark */ + filter->filter_function = mbfl_filt_ident_utf16be; + } else if (n == 0xfffe) { + /* it was a little-endian byte-order mark */ + filter->filter_function = mbfl_filt_ident_utf16le; + } else { + /* it wasn't a byte-order mark; fall back to UTF-16BE and replay the held first byte, then the current one */ + filter->filter_function = mbfl_filt_ident_utf16be; + (filter->filter_function)(n >> 8, filter); + return (filter->filter_function)(c, filter); + } + } + return c; +} + +static int mbfl_filt_ident_utf16le(int c, mbfl_identify_filter *filter) +{ + switch (filter->status) { + case 0: /* 1st byte (low-order byte of the code unit; not inspected) */ + filter->status = 1; + break; + + case 1: /* 2nd byte (high-order byte of the code unit) */ + if ((c & 0xfc) == 0xd8) { + /* Looks like a surrogate pair (0xd8..0xdb is a high surrogate) */ + filter->status = 2; + } else if ((c & 0xfc) == 0xdc) { + /* This is wrong; the second part of the surrogate pair has come first */ + filter->flag = 1; + } else { + filter->status = 0; /* Just an ordinary 2-byte character */ + } + break; + + case 2: /* 3rd byte (low-order byte of the 2nd code unit; not inspected) */ + filter->status = 3; + break; + + case 3: /* 4th byte (must be 0xdc..0xdf, i.e. a low surrogate) */ + if ((c & 0xfc) == 0xdc) { + filter->status = 0; + } else { + filter->flag = 1; /* Surrogate pair wrongly encoded */ + } + break; + } + + return c; +} + +static int mbfl_filt_ident_utf16be(int c, mbfl_identify_filter *filter) +{ + switch (filter->status) { + case 0: /* 1st byte (high-order byte of the code unit) */ + if ((c & 0xfc) == 0xd8) { + /* Looks like a surrogate pair (0xd8..0xdb is a high surrogate) */ + filter->status = 2; + } else if ((c & 0xfc) == 0xdc) { + /* This is wrong; the second part 
of the surrogate pair has come first */ + filter->flag = 1; + } else { + /* Just an ordinary 2-byte character */ + filter->status = 1; + } + break; + + case 1: /* 2nd byte, not surrogate pair */ + filter->status = 0; + break; + + case 2: /* 2nd byte, surrogate pair */ + filter->status = 3; + break; + + case 3: /* 3rd byte, surrogate pair (must be 0xdc..0xdf, i.e. a low surrogate) */ + if ((c & 0xfc) == 0xdc) { + filter->status = 4; + } else { + filter->flag = 1; /* Surrogate pair wrongly encoded */ + } + break; + + case 4: /* 4th byte, surrogate pair */ + filter->status = 0; + break; + } + + return c; +} diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf16.h b/ext/mbstring/libmbfl/filters/mbfilter_utf16.h index 4aa88619c7..601779add4 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf16.h +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf16.h @@ -33,6 +33,9 @@ extern const mbfl_encoding mbfl_encoding_utf16; extern const mbfl_encoding mbfl_encoding_utf16be; extern const mbfl_encoding mbfl_encoding_utf16le; +extern const struct mbfl_identify_vtbl vtbl_identify_utf16; +extern const struct mbfl_identify_vtbl vtbl_identify_utf16le; +extern const struct mbfl_identify_vtbl vtbl_identify_utf16be; extern const struct mbfl_convert_vtbl vtbl_utf16_wchar; extern const struct mbfl_convert_vtbl vtbl_wchar_utf16; extern const struct mbfl_convert_vtbl vtbl_utf16be_wchar; diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter.c b/ext/mbstring/libmbfl/mbfl/mbfilter.c index 4bcaa70694..d0e5494387 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter.c @@ -396,18 +396,6 @@ const mbfl_encoding *mbfl_encoding_detector_judge(mbfl_encoding_detector *identd } n--; } - - /* fallback judge */ - if (!encoding) { - n = identd->filter_list_size - 1; - while (n >= 0) { - filter = identd->filter_list[n]; - if (!filter->flag) { - encoding = filter->encoding; - } - n--; - } - } } return encoding; diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_ident.c b/ext/mbstring/libmbfl/mbfl/mbfl_ident.c index 
d18de0445d..59bcdc84ca 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_ident.c +++ b/ext/mbstring/libmbfl/mbfl/mbfl_ident.c @@ -160,6 +160,9 @@ static const struct mbfl_identify_vtbl *mbfl_identify_filter_list[] = { &vtbl_identify_cp50222, &vtbl_identify_gb18030, &vtbl_identify_7bit, + &vtbl_identify_utf16, + &vtbl_identify_utf16le, + &vtbl_identify_utf16be, &vtbl_identify_false, NULL }; diff --git a/ext/mbstring/tests/mb_convert_encoding_failed_detection.phpt b/ext/mbstring/tests/mb_convert_encoding_failed_detection.phpt index b3c8ba10bb..45efe28766 100644 --- a/ext/mbstring/tests/mb_convert_encoding_failed_detection.phpt +++ b/ext/mbstring/tests/mb_convert_encoding_failed_detection.phpt @@ -1,5 +1,7 @@ --TEST-- mb_convert_encoding() when encoding detection fails +--INI-- +mbstring.strict_detection=1 --FILE--