From: Alex Dowad
Date: Sun, 6 Sep 2020 12:42:55 +0000 (+0200)
Subject: Add identify filter for UTF-32{,BE,LE}
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=7047e5d2c409c073799139490b92ffde2e6b1afb;p=php

Add identify filter for UTF-32{,BE,LE}
---

diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf32.c b/ext/mbstring/libmbfl/filters/mbfilter_utf32.c
index 2566ff5476..ddd942d6e9 100644
--- a/ext/mbstring/libmbfl/filters/mbfilter_utf32.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_utf32.c
@@ -30,6 +30,10 @@
 #include "mbfilter.h"
 #include "mbfilter_utf32.h"
 
+static int mbfl_filt_ident_utf32(int c, mbfl_identify_filter *filter);
+static int mbfl_filt_ident_utf32le(int c, mbfl_identify_filter *filter);
+static int mbfl_filt_ident_utf32be(int c, mbfl_identify_filter *filter);
+
 static const char *mbfl_encoding_utf32_aliases[] = {"utf32", NULL};
 
 const mbfl_encoding mbfl_encoding_utf32 = {
@@ -65,6 +69,24 @@ const mbfl_encoding mbfl_encoding_utf32le = {
 	&vtbl_wchar_utf32le
 };
 
+const struct mbfl_identify_vtbl vtbl_identify_utf32 = {
+	mbfl_no_encoding_utf32,
+	mbfl_filt_ident_common_ctor,
+	mbfl_filt_ident_utf32
+};
+
+const struct mbfl_identify_vtbl vtbl_identify_utf32be = {
+	mbfl_no_encoding_utf32be,
+	mbfl_filt_ident_common_ctor,
+	mbfl_filt_ident_utf32be
+};
+
+const struct mbfl_identify_vtbl vtbl_identify_utf32le = {
+	mbfl_no_encoding_utf32le,
+	mbfl_filt_ident_common_ctor,
+	mbfl_filt_ident_utf32le
+};
+
 const struct mbfl_convert_vtbl vtbl_utf32_wchar = {
 	mbfl_no_encoding_utf32,
 	mbfl_no_encoding_wchar,
@@ -289,3 +311,158 @@ int mbfl_filt_conv_wchar_utf32le(int c, mbfl_convert_filter *filter)
 
 	return c;
 }
+
+/*
+ * Identify filter for UTF-32 without an explicit byte order.
+ *
+ * The largest valid codepoint is 0x10FFFF; we don't want values above that.
+ * Neither do we want to see surrogates.
+ * For UTF-32 (not LE or BE), we also need to look for a byte-order mark:
+ * a leading FF FE 00 00 hands the rest of the input to the little-endian
+ * filter; anything else is handed to the big-endian filter, which is first
+ * fed any bytes that were held back while the mark was being matched.
+ */
+static int mbfl_filt_ident_utf32(int c, mbfl_identify_filter *filter)
+{
+	switch (filter->status) {
+	case 0: /* 1st byte */
+		if (c == 0xff) {
+			filter->status = 1;
+			return c;
+		}
+		filter->filter_function = mbfl_filt_ident_utf32be;
+		break;
+
+	case 1: /* 2nd byte */
+		if (c == 0xfe) {
+			filter->status = 2;
+			return c;
+		}
+		/* Not a byte-order mark; replay the held-back byte from byte position 0 */
+		filter->status = 0;
+		filter->filter_function = mbfl_filt_ident_utf32be;
+		(filter->filter_function)(0xff, filter);
+		break;
+
+	case 2: /* 3rd byte */
+		if (c == 0) {
+			filter->status = 3;
+			return c;
+		}
+		/* Not a byte-order mark; replay the held-back bytes from byte position 0 */
+		filter->status = 0;
+		filter->filter_function = mbfl_filt_ident_utf32be;
+		(filter->filter_function)(0xff, filter);
+		(filter->filter_function)(0xfe, filter);
+		break;
+
+	case 3: /* 4th byte */
+		if (c == 0) {
+			/* We found a little-endian byte-order mark! */
+			filter->status = 0;
+			filter->filter_function = mbfl_filt_ident_utf32le;
+			return c;
+		}
+		/* Not a byte-order mark; replay the held-back bytes from byte position 0 */
+		filter->status = 0;
+		filter->filter_function = mbfl_filt_ident_utf32be;
+		(filter->filter_function)(0xff, filter);
+		(filter->filter_function)(0xfe, filter);
+		(filter->filter_function)(0, filter);
+		break;
+	}
+
+	return (filter->filter_function)(c, filter);
+}
+
+/*
+ * Identify filter for UTF-32LE; bytes arrive least-significant first.
+ * filter->status tracks the byte position within the current 4-byte unit.
+ * filter->flag is set to 1 when a unit is above 0x10FFFF or is a
+ * surrogate (0xD800..0xDFFF). Always returns c.
+ */
+static int mbfl_filt_ident_utf32le(int c, mbfl_identify_filter *filter)
+{
+	switch (filter->status) {
+	case 0: /* 1st byte */
+		filter->status = 1;
+		break;
+
+	case 1: /* 2nd byte */
+		if (c >= 0xD8 && c <= 0xDF) {
+			filter->status = 4; /* might be surrogate if we are in BMP */
+		} else {
+			filter->status = 2;
+		}
+		break;
+
+	case 2: /* 3rd byte */
+		if (c > 0x10) {
+			filter->flag = 1; /* too big */
+		}
+		filter->status = 3;
+		break;
+
+	case 3: /* 4th byte */
+		if (c) {
+			filter->flag = 1; /* too big */
+		}
+		filter->status = 0;
+		break;
+
+	case 4: /* 3rd byte, previous byte looked like surrogate */
+		if (!c) {
+			filter->flag = 1; /* yep, it's a surrogate */
+		} else if (c > 0x10) {
+			filter->flag = 1; /* too big; the range check applies on this path too */
+		}
+		filter->status = 3;
+		break;
+	}
+	return c;
+}
+
+/*
+ * Identify filter for UTF-32BE; bytes arrive most-significant first.
+ * filter->status tracks the byte position within the current 4-byte unit.
+ * filter->flag is set to 1 when a unit is above 0x10FFFF or is a
+ * surrogate (0xD800..0xDFFF). Always returns c.
+ */
+static int mbfl_filt_ident_utf32be(int c, mbfl_identify_filter *filter)
+{
+	switch (filter->status) {
+	case 0: /* 1st byte */
+		if (c) {
+			filter->flag = 1; /* too big */
+		}
+		filter->status = 1;
+		break;
+
+	case 1: /* 2nd byte */
+		if (c > 0x10) {
+			filter->flag = 1; /* too big */
+		}
+		if (c) {
+			filter->status = 4; /* not in the BMP */
+		} else {
+			filter->status = 2;
+		}
+		break;
+
+	case 2: /* 3rd byte */
+		if (c >= 0xD8 && c <= 0xDF) {
+			filter->flag = 1; /* reserved range for surrogates */
+		}
+		filter->status = 3;
+		break;
+
+	case 3: /* 4th byte */
+		filter->status = 0;
+		break;
+
+	case 4: /* 3rd byte, not in BMP */
+		filter->status = 3;
+		break;
+	}
+	return c;
+}
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf32.h b/ext/mbstring/libmbfl/filters/mbfilter_utf32.h
index 14b9fa4823..de1235e451 100644
--- a/ext/mbstring/libmbfl/filters/mbfilter_utf32.h
+++ b/ext/mbstring/libmbfl/filters/mbfilter_utf32.h
@@ -33,6 +33,9 @@
 extern const mbfl_encoding mbfl_encoding_utf32;
 extern const mbfl_encoding mbfl_encoding_utf32be;
 extern const mbfl_encoding mbfl_encoding_utf32le;
+extern const struct mbfl_identify_vtbl vtbl_identify_utf32;
+extern const struct mbfl_identify_vtbl vtbl_identify_utf32be;
+extern const struct mbfl_identify_vtbl vtbl_identify_utf32le;
 extern const struct mbfl_convert_vtbl vtbl_utf32_wchar;
 extern const struct mbfl_convert_vtbl vtbl_wchar_utf32;
 extern const struct mbfl_convert_vtbl vtbl_utf32be_wchar;
diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_ident.c b/ext/mbstring/libmbfl/mbfl/mbfl_ident.c
index d2d991249f..a48762ee3d 100644
--- a/ext/mbstring/libmbfl/mbfl/mbfl_ident.c
+++ b/ext/mbstring/libmbfl/mbfl/mbfl_ident.c
@@ -169,6 +169,9 @@ static const struct mbfl_identify_vtbl *mbfl_identify_filter_list[] = {
 	&vtbl_identify_ucs2,
 	&vtbl_identify_ucs2be,
 	&vtbl_identify_ucs2le,
+	&vtbl_identify_utf32,
+	&vtbl_identify_utf32be,
+	&vtbl_identify_utf32le,
 	&vtbl_identify_false,
 	NULL
 };