From: Moriyoshi Koizumi Date: Thu, 8 May 2003 10:42:20 +0000 (+0000) Subject: Improved html_entity_decode() so it can handle multibyte charsets and X-Git-Tag: RELEASE_0_9b~59 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=6ed4fd1666596b8e0a182d049a30bc63d8a554b1;p=php Improved html_entity_decode() so it can handle multibyte charsets and numeric entities. --- diff --git a/ext/standard/html.c b/ext/standard/html.c index 219781d939..33549a8757 100644 --- a/ext/standard/html.c +++ b/ext/standard/html.c @@ -777,13 +777,58 @@ det_charset: } /* }}} */ +/* {{{ php_utf32_utf8 */ +size_t php_utf32_utf8(unsigned char *buf, int k) +{ + size_t retval = 0; + + if (k < 0x80) { + buf[0] = k; + retval = 1; + } else if (k < 0x800) { + buf[0] = 0xc0 | (k >> 6); + buf[1] = 0x80 | (k & 0x3f); + retval = 2; + } else if (k < 0x10000) { + buf[0] = 0xe0 | (k >> 12); + buf[1] = 0x80 | ((k >> 6) & 0x3f); + buf[2] = 0x80 | (k & 0x3f); + retval = 3; + } else if (k < 0x200000) { + buf[0] = 0xf0 | (k >> 18); + buf[1] = 0x80 | ((k >> 12) & 0x3f); + buf[2] = 0x80 | ((k >> 6) & 0x3f); + buf[3] = 0x80 | (k & 0x3f); + retval = 4; + } else if (k < 0x4000000) { + buf[0] = 0xf8 | (k >> 24); + buf[1] = 0x80 | ((k >> 18) & 0x3f); + buf[2] = 0x80 | ((k >> 12) & 0x3f); + buf[3] = 0x80 | ((k >> 6) & 0x3f); + buf[4] = 0x80 | (k & 0x3f); + retval = 5; + } else { + buf[0] = 0xfc | (k >> 30); + buf[1] = 0x80 | ((k >> 24) & 0x3f); + buf[2] = 0x80 | ((k >> 18) & 0x3f); + buf[3] = 0x80 | ((k >> 12) & 0x3f); + buf[4] = 0x80 | ((k >> 6) & 0x3f); + buf[5] = 0x80 | (k & 0x3f); + retval = 6; + } + buf[retval] = '\0'; + + return retval; +} +/* }}} */ + /* {{{ php_unescape_html_entities */ PHPAPI char *php_unescape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC) { int retlen; int j, k; - char *replaced, *ret; + char *replaced, *ret, *p, *q, *lim, *next; enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC); unsigned char replacement[15]; @@ -815,11 +860,34 @@ PHPAPI char *php_unescape_html_entities(unsigned char *old, int oldlen, int *new entity_length += 2; /* When we have MBCS entities in the tables above, this will need to handle it */ - if (k > 0xff) { - php_error_docref(NULL TSRMLS_CC, E_WARNING, "cannot yet handle MBCS!"); + switch (charset) { + case cs_8859_1: + case cs_cp1252: + case cs_8859_15: + case cs_cp1251: + case cs_8859_5: + case cs_cp866: + replacement[0] = k; + replacement[1] = '\0'; + + case cs_big5: + case cs_gb2312: + case cs_big5hkscs: + case cs_sjis: + case cs_eucjp: + replacement[0] = (char)((unsigned int)k >> 8); + replacement[1] = (k & 0xff); + replacement[2] = '\0'; + break; + + case cs_utf_8: + php_utf32_utf8(replacement, k); + break; + + default: + php_error_docref(NULL TSRMLS_CC, E_WARNING, "cannot yet handle MBCS!"); + return 0; } - replacement[0] = k; - replacement[1] = '\0'; replaced = php_str_to_str(ret, retlen, entity, entity_length, replacement, 1, &retlen); efree(ret); @@ -840,6 +908,59 @@ PHPAPI char *php_unescape_html_entities(unsigned char *old, int oldlen, int *new efree(ret); ret = replaced; } + + /* replace numeric entities */ + lim = ret + retlen; + for (p = ret, q = ret; p < lim; p++) { + int code; + + if (p < lim - 1 && p[0] == '&' && p[1] == '#') { + code = strtol(p + 2, &next, 10); + if (next != NULL && *next == ';') { + switch (charset) { + case cs_utf_8: + q += php_utf32_utf8(q, code); + break; + + case cs_8859_1: + case cs_8859_5: + case cs_8859_15: + if (0xa0 <= code && code <= 0xff) { + *(q++) = code; + } + break; + + case cs_cp1252: + case cs_cp1251: + case cs_cp866: + if (0x80 <= code && code <= 0xff) { + *(q++) = code; + } + break; + + case cs_big5: + case cs_gb2312: + case cs_big5hkscs: + case cs_sjis: + case cs_eucjp: + if (code <= 0x7f) { + *(q++) = code; + } + break; + + default: + break; + } + p = next; + } else { + *(q++) = *p; + } + } else { + *(q++) = *p; + } + } + *q = '\0'; + retlen = (size_t)(q - ret); empty_source: *newlen = retlen; return ret;