#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
-/*
- * UTF-32 => wchar
- */
-int mbfl_filt_conv_utf32_wchar(int c, mbfl_convert_filter *filter)
+/*
+ * Pass one decoded UTF-32 value on to the next filter stage.
+ *
+ * n      - the 32-bit value assembled from four input bytes
+ * filter - conversion filter whose output_function receives the result
+ *
+ * A value in [0, MBFL_WCSPLANE_UTF32MAX) that is not a surrogate
+ * (0xD800..0xDFFF) is forwarded unchanged. Anything else is masked with
+ * MBFL_WCSGROUP_MASK and tagged with MBFL_WCSGROUP_THROUGH before being
+ * forwarded.
+ *
+ * Returns 0 on success, or -1 (via CK) when output_function reports failure.
+ */
+static int emit_char_if_valid(int n, mbfl_convert_filter *filter)
{
- int n, endian;
+ /* n is signed: a unit whose most significant byte is >= 0x80 arrives here
+  * as a negative number, which would otherwise pass both range tests below
+  * and be emitted as if it were a valid code point. */
+ if (n >= 0 && n < MBFL_WCSPLANE_UTF32MAX && (n < 0xD800 || n > 0xDFFF)) {
+ CK((*filter->output_function)(n, filter->data));
+ } else {
+ n = (n & MBFL_WCSGROUP_MASK) | MBFL_WCSGROUP_THROUGH;
+ CK((*filter->output_function)(n, filter->data));
+ }
+ return 0;
+}
- endian = filter->status & 0xff00;
- switch (filter->status & 0xff) {
- case 0:
- if (endian) {
- n = c & 0xff;
- } else {
- n = (c & 0xffu) << 24;
- }
- filter->cache = n;
- filter->status++;
- break;
- case 1:
- if (endian) {
- n = (c & 0xff) << 8;
- } else {
- n = (c & 0xff) << 16;
- }
- filter->cache |= n;
- filter->status++;
- break;
- case 2:
- if (endian) {
- n = (c & 0xff) << 16;
- } else {
- n = (c & 0xff) << 8;
- }
- filter->cache |= n;
+int mbfl_filt_conv_utf32_wchar(int c, mbfl_convert_filter *filter) /* UTF-32 => wchar: decodes only the first 4-byte unit to choose a byte order, then installs the BE or LE decoder */
+{
+ if (filter->status < 3) { /* status counts the bytes of the first unit received so far */
+ filter->cache = (filter->cache << 8) | (c & 0xFF); /* accumulate in big-endian order; only the low 8 bits of c are used */
filter->status++;
- break;
- default:
- if (endian) {
- n = (c & 0xffu) << 24;
- } else {
- n = c & 0xff;
- }
- n |= filter->cache;
- if ((n & 0xffff) == 0 && ((n >> 16) & 0xffff) == 0xfffe) {
- if (endian) {
- filter->status = 0; /* big-endian */
- } else {
- filter->status = 0x100; /* little-endian */
- }
- CK((*filter->output_function)(0xfeff, filter->data));
+ } else { /* fourth byte: the first unit is complete */
+ int n = ((unsigned int)filter->cache << 8) | (c & 0xFF); /* shift as unsigned so a cache with bit 23 set cannot overflow a signed int */
+ filter->cache = filter->status = 0; /* clear state so the decoder installed below starts on a fresh unit */
+
+ if (n == 0xFFFE0000) { /* input bytes FF FE 00 00, read here in big-endian order */
+ /* Found a little-endian byte order mark; nothing is emitted for it */
+ filter->filter_function = mbfl_filt_conv_utf32le_wchar; /* all following bytes go straight to the LE decoder */
} else {
- filter->status &= ~0xff;
- if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xd800 || n > 0xdfff)) {
- CK((*filter->output_function)(n, filter->data));
- } else {
- n = (n & MBFL_WCSGROUP_MASK) | MBFL_WCSGROUP_THROUGH;
- CK((*filter->output_function)(n, filter->data));
+ filter->filter_function = mbfl_filt_conv_utf32be_wchar; /* no LE mark seen: all following bytes go to the BE decoder */
+ if (n != 0xFEFF) { /* a leading 0xFEFF is a big-endian byte order mark and is dropped, not emitted */
+ CK(emit_char_if_valid(n, filter)); /* otherwise the first unit is ordinary data; returns -1 from here if emitting fails */
}
}
- break;
}
return c;
}
-/*
- * UTF-32BE => wchar
- */
int mbfl_filt_conv_utf32be_wchar(int c, mbfl_convert_filter *filter)
{
- int n;
-
- if (filter->status == 0) {
- filter->status = 1;
- n = (c & 0xffu) << 24;
- filter->cache = n;
- } else if (filter->status == 1) {
- filter->status = 2;
- n = (c & 0xff) << 16;
- filter->cache |= n;
- } else if (filter->status == 2) {
- filter->status = 3;
- n = (c & 0xff) << 8;
- filter->cache |= n;
+ if (filter->status < 3) { /* UTF-32BE => wchar: status counts the bytes of the current 4-byte unit received so far */
+ filter->cache = (filter->cache << 8) | (c & 0xFF); /* most significant byte arrives first, so shift earlier bytes up; only the low 8 bits of c are used */
+ filter->status++;
} else {
- filter->status = 0;
- n = (c & 0xff) | filter->cache;
- if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xd800 || n > 0xdfff)) {
- CK((*filter->output_function)(n, filter->data));
- } else {
- n = (n & MBFL_WCSGROUP_MASK) | MBFL_WCSGROUP_THROUGH;
- CK((*filter->output_function)(n, filter->data));
- }
+ int n = ((unsigned int)filter->cache << 8) | (c & 0xFF); /* fourth byte completes the unit; shift as unsigned so a cache with bit 23 set cannot overflow a signed int */
+ filter->cache = filter->status = 0; /* reset before emitting so the next byte starts a new unit */
+ CK(emit_char_if_valid(n, filter)); /* returns -1 from here if emitting fails */
}
return c;
}
-/*
- * wchar => UTF-32BE
- */
int mbfl_filt_conv_wchar_utf32be(int c, mbfl_convert_filter *filter)
{
if (c >= 0 && c < MBFL_WCSPLANE_UTF32MAX) {
return c;
}
-/*
- * UTF-32LE => wchar
- */
int mbfl_filt_conv_utf32le_wchar(int c, mbfl_convert_filter *filter)
{
- int n;
-
- if (filter->status == 0) {
- filter->status = 1;
- n = (c & 0xff);
- filter->cache = n;
- } else if (filter->status == 1) {
- filter->status = 2;
- n = (c & 0xff) << 8;
- filter->cache |= n;
- } else if (filter->status == 2) {
- filter->status = 3;
- n = (c & 0xff) << 16;
- filter->cache |= n;
+ if (filter->status < 3) { /* UTF-32LE => wchar: status counts the bytes of the current 4-byte unit received so far */
+ filter->cache |= ((c & 0xFFU) << (8 * filter->status)); /* least significant byte arrives first: byte k lands at bit 8*k; OR-ing relies on cache being 0 at the start of each unit */
+ filter->status++;
} else {
- filter->status = 0;
- n = ((c & 0xffu) << 24) | filter->cache;
- if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xd800 || n > 0xdfff)) {
- CK((*filter->output_function)(n, filter->data));
- } else {
- n = (n & MBFL_WCSGROUP_MASK) | MBFL_WCSGROUP_THROUGH;
- CK((*filter->output_function)(n, filter->data));
- }
+ int n = ((c & 0xFFU) << 24) | filter->cache; /* fourth byte is the most significant; the shift is done on an unsigned operand */
+ filter->cache = filter->status = 0; /* reset before emitting so the next unit starts from an empty cache */
+ CK(emit_char_if_valid(n, filter)); /* returns -1 from here if emitting fails */
}
return c;
}
-/*
- * wchar => UTF-32LE
- */
int mbfl_filt_conv_wchar_utf32le(int c, mbfl_convert_filter *filter)
{
if (c >= 0 && c < MBFL_WCSPLANE_UTF32MAX) {
static int mbfl_filt_conv_utf32_wchar_flush(mbfl_convert_filter *filter)
{
- if (filter->status & 0xF) {
+ if (filter->status) {
/* Input string was truncated */
CK((*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data));
}
(*filter->flush_function)(filter->data);
}
- filter->status = filter->cache = 0;
return 0;
}
<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
--FILE--
<?php
-function chk_enc($str, $n, $enc = "UTF-8", $with_bom = false) {
+function chk_enc($str, $n, $enc = "UTF-8") {
$src = bin2hex(mb_convert_encoding($str, "UCS-4BE", $enc));
$dst = str_repeat("0000fffd", $n);
- if ($with_bom) {
- $dst = "0000feff" . $dst;
- }
if ($dst == $src) {
return false;
} else {
$cnt = 0;
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
$s = chk_enc("\x00\x00\xfe\xff". pack('C4', $i >> 24, ($i >> 16) & 0xff, ($i >> 8) & 0xff, $i & 0xff),
- 1, "UTF-32", true);
+ 1, "UTF-32");
if ($s === false) {
$cnt++;
} else {
}
}
var_dump($cnt);
-var_dump(str_replace("0000feff","",$out));
+var_dump($out);
$out = '';
$cnt = 0;
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
$s = chk_enc("\xff\xfe\x00\x00". pack('C4', $i & 0xff, ($i >> 8) & 0xff, ($i >> 16) & 0xff, ($i >> 24) & 0xff),
- 1, "UTF-32", true);
+ 1, "UTF-32");
if ($s === false) {
$cnt++;
} else {
}
}
var_dump($cnt);
-var_dump(str_replace("0000feff","",$out));
+var_dump($out);
?>
--EXPECT--
string(8) "0010ffff"
bool(false)
string(8) "0010ffff"
-string(16) "0000feff0000fffd"
-string(16) "0000feff0010ffff"
-string(16) "0000feff0000fffd"
-string(16) "0000feff0010ffff"
+string(8) "0000fffd"
+string(8) "0010ffff"
+string(8) "0000fffd"
+string(8) "0010ffff"
UTF-32 and surrogates area
int(2048)
string(16) "0000d7ff0000e000"