#include "mbfilter.h"
#include "mbfilter_utf32.h"
+static int mbfl_filt_ident_utf32(int c, mbfl_identify_filter *filter);
+static int mbfl_filt_ident_utf32le(int c, mbfl_identify_filter *filter);
+static int mbfl_filt_ident_utf32be(int c, mbfl_identify_filter *filter);
+
static const char *mbfl_encoding_utf32_aliases[] = {"utf32", NULL};
const mbfl_encoding mbfl_encoding_utf32 = {
&vtbl_wchar_utf32le
};
+const struct mbfl_identify_vtbl vtbl_identify_utf32 = {
+ mbfl_no_encoding_utf32,
+ mbfl_filt_ident_common_ctor,
+ mbfl_filt_ident_utf32
+};
+
+const struct mbfl_identify_vtbl vtbl_identify_utf32be = {
+ mbfl_no_encoding_utf32be,
+ mbfl_filt_ident_common_ctor,
+ mbfl_filt_ident_utf32be
+};
+
+const struct mbfl_identify_vtbl vtbl_identify_utf32le = {
+ mbfl_no_encoding_utf32le,
+ mbfl_filt_ident_common_ctor,
+ mbfl_filt_ident_utf32le
+};
+
const struct mbfl_convert_vtbl vtbl_utf32_wchar = {
mbfl_no_encoding_utf32,
mbfl_no_encoding_wchar,
return c;
}
+
+static int mbfl_filt_ident_utf32(int c, mbfl_identify_filter *filter)
+{
+ /* The largest valid codepoint is 0x10FFFF; we don't want values above that
+ * Neither do we want to see surrogates
+ * For UTF-32 (not LE or BE), we do also need to look for a byte-order mark */
+ switch (filter->status) {
+ case 0: /* 1st byte */
+ if (c == 0xff) {
+ filter->status = 1;
+ return c;
+ }
+ filter->filter_function = mbfl_filt_ident_utf32be;
+ break;
+
+ case 1: /* 2nd byte */
+ if (c == 0xfe) {
+ filter->status = 2;
+ return c;
+ }
+ filter->filter_function = mbfl_filt_ident_utf32be;
+ (filter->filter_function)(0xff, filter);
+ break;
+
+ case 2: /* 3rd byte */
+ if (c == 0) {
+ filter->status = 3;
+ return c;
+ }
+ filter->filter_function = mbfl_filt_ident_utf32be;
+ (filter->filter_function)(0xff, filter);
+ (filter->filter_function)(0xfe, filter);
+ break;
+
+ case 3: /* 4th byte */
+ if (c == 0) {
+ /* We found a little-endian byte-order mark! */
+ filter->status = 0;
+ filter->filter_function = mbfl_filt_ident_utf32le;
+ return c;
+ }
+ filter->filter_function = mbfl_filt_ident_utf32be;
+ (filter->filter_function)(0xff, filter);
+ (filter->filter_function)(0xfe, filter);
+ (filter->filter_function)(0, filter);
+ break;
+ }
+
+ return (filter->filter_function)(c, filter);
+}
+
+static int mbfl_filt_ident_utf32le(int c, mbfl_identify_filter *filter)
+{
+ switch (filter->status) {
+ case 0: /* 1st byte */
+ filter->status = 1;
+ break;
+
+ case 1: /* 2nd byte */
+ if (c >= 0xD8 && c <= 0xDF) {
+ filter->status = 4; /* might be surrogate if we are in BMP */
+ } else {
+ filter->status = 2;
+ }
+ break;
+
+ case 2: /* 3rd byte */
+ if (c > 0x10) {
+ filter->flag = 1; /* too big */
+ }
+ filter->status = 3;
+ break;
+
+ case 3: /* 4th byte */
+ if (c) {
+ filter->flag = 1; /* too big */
+ }
+ filter->status = 0;
+ break;
+
+ case 4: /* 3rd byte, previous byte looked like surrogate */
+ if (!c) {
+ filter->flag = 1; /* yep, it's a surrogate */
+ }
+ filter->status = 3;
+ }
+ return c;
+}
+
+static int mbfl_filt_ident_utf32be(int c, mbfl_identify_filter *filter)
+{
+ switch (filter->status) {
+ case 0: /* 1st byte */
+ if (c) {
+ filter->flag = 1; /* too big */
+ }
+ filter->status = 1;
+ break;
+
+ case 1: /* 2nd byte */
+ if (c > 0x10) {
+ filter->flag = 1; /* too big */
+ } if (c) {
+ filter->status = 4; /* not in the BMP */
+ } else {
+ filter->status = 2;
+ }
+ break;
+
+ case 2: /* 3rd byte */
+ if (c >= 0xD8 && c <= 0xDF) {
+ filter->flag = 1; /* reserved range for surrogates */
+ }
+ filter->status = 3;
+ break;
+
+ case 3: /* 4th byte */
+ filter->status = 0;
+ break;
+
+ case 4: /* 3rd byte, not in BMP */
+ filter->status = 3;
+ }
+ return c;
+}