From f4a896c20900f5ce3c290941afe5b53560f34196 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Gustavo=20Andr=C3=A9=20dos=20Santos=20Lopes?=
Date: Mon, 11 Oct 2010 22:26:10 +0000
Subject: [PATCH] - PHP uses a big endian representation when it converts the
 code unit sequences to integers so as to store the entity maps. Code in
 traverse_for_entities assumed little endian. Fixed. (in practice, due to the
 absence of unicode and entity mappings for multi-byte encodings -- except
 UTF-8 --, this doesn't matter, so the relevant code was commented out for
 performance reasons).

---

Review notes:
    * The disabled (#if 0) EUC-JP branch now calls php_mb3_int_to_char
      instead of php_mb2_int_to_char: EUC-JP sequences are one to three
      bytes, and the two-byte helper asserts k <= 0xFFFFU.
    * Both php_mb*_int_to_char helpers are static and are only referenced
      from #if 0 code, so they are unused until those branches are enabled.
    * Indentation was reconstructed from a whitespace-collapsed copy of this
      patch and may not match the tree exactly; if context fails to match,
      apply with --ignore-whitespace.

 ext/standard/html.c | 69 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 56 insertions(+), 13 deletions(-)

diff --git a/ext/standard/html.c b/ext/standard/html.c
index 0ad34e52c4..9ef6d0158f 100644
--- a/ext/standard/html.c
+++ b/ext/standard/html.c
@@ -456,7 +456,7 @@ det_charset:
 /* }}} */
 
 /* {{{ php_utf32_utf8 */
-size_t php_utf32_utf8(unsigned char *buf, int k)
+static size_t php_utf32_utf8(unsigned char *buf, unsigned k)
 {
 	size_t retval = 0;
 
@@ -487,6 +487,47 @@ size_t php_utf32_utf8(unsigned char *buf, int k)
 }
 /* }}} */
 
+/* {{{ php_mb2_int_to_char
+ * Convert back big endian int representation of sequence of one or two 8-bit code units. */
+static size_t php_mb2_int_to_char(unsigned char *buf, unsigned k)
+{
+	assert(k <= 0xFFFFU);
+	/* one or two bytes */
+	if (k <= 0xFFU) { /* 1 */
+		buf[0] = k;
+		return 1U;
+	} else { /* 2 */
+		buf[0] = k >> 8;
+		buf[1] = k & 0xFFU;
+		return 2U;
+	}
+}
+/* }}} */
+
+/* {{{ php_mb3_int_to_char
+ * Convert back big endian int representation of sequence of one to three 8-bit code units.
+ * For EUC-JP. */
+static size_t php_mb3_int_to_char(unsigned char *buf, unsigned k)
+{
+	assert(k <= 0xFFFFFFU);
+	/* one to three bytes */
+	if (k <= 0xFFU) { /* 1 */
+		buf[0] = k;
+		return 1U;
+	} else if (k <= 0xFFFFU) { /* 2 */
+		buf[0] = k >> 8;
+		buf[1] = k & 0xFFU;
+		return 2U;
+	} else {
+		buf[0] = k >> 16;
+		buf[1] = (k >> 8) & 0xFFU;
+		buf[2] = k & 0xFFU;
+		return 3U;
+	}
+}
+/* }}} */
+
+
 /* {{{ unimap_bsearc_cmp
  * Binary search of unicode code points in unicode <--> charset mapping.
  * Returns the code point in the target charset (whose mapping table was given) or 0 if
@@ -817,21 +858,23 @@ static void traverse_for_entities(char *ret, int *retlen_p, int all, int quote_s
 					case cs_big5hkscs:
 					case cs_sjis:
 					case cs_gb2312:
-						/* one or two bytes */
-						*(q++) = (code & 0xFFU);
-						if (0xFF00U & code) { /* 2 */
-							*(q++) = (code >> 8);
-						}
+						/* we don't have named entity or unicode mappings for these yet,
+						 * so we're guaranteed code <= 0xFF */
+#if 0
+						q += php_mb2_int_to_char((unsigned char*)q, code);
+#else
+						assert(code <= 0xFFU);
+						*(q++) = code;
+#endif
 						break;
 
 					case cs_eucjp:
-						/* one to three bytes */
-						*(q++) = code & 0xFFU;
-						if (0xFFFF00U & code) { /* 2 */
-							*(q++) = ((code >> 8) & 0xFFU);
-							if (0xFF0000U & code) /* 3 */
-								*(q++) = (code >> 16);
-						}
+#if 0 /* idem */
+						q += php_mb3_int_to_char((unsigned char*)q, code);
+#else
+						assert(code <= 0xFFU);
+						*(q++) = code;
+#endif
 						break;
 
 					default:
-- 
2.40.0