From: Marcus Boerger
Date: Sun, 4 Aug 2002 14:20:11 +0000 (+0000)
Subject: -New encoding/decoding HTML-ENTITIES.
X-Git-Tag: dev~37
X-Git-Url:;h=77abd9d55a38d3f68e25a55cad33d4b192fd094b;p=php

-New encoding/decoding HTML-ENTITIES.
#This allows to enforce named and numeric entities in output.
#Example: CMS that reads UTF8 or ISO-8859-1 but exports HTML in ASCII.
#Using mbstring.http_output=HTML-ENTITIES users receive entities which
#can be displayed correctly independent of any browser side encoding.
@New mbstring encoding/decoding HTML-ENTITIES. (marcus)
---

diff --git a/ext/mbstring/config.m4 b/ext/mbstring/config.m4
index 60cfb3396d..a0656c11f3 100644
--- a/ext/mbstring/config.m4
+++ b/ext/mbstring/config.m4
@@ -30,7 +30,7 @@ if test "$PHP_MBSTRING" != "no"; then
     AC_DEFINE(HAVE_MBSTR_KR,1,[whether to have korean support])
     AC_DEFINE(HAVE_MBSTR_RU,1,[whether to have russian support])
   fi
-  PHP_NEW_EXTENSION(mbstring, mbfilter_ja.c mbfilter_cn.c mbfilter_tw.c mbfilter_kr.c mbfilter_ru.c mbfilter.c mbstring.c mbregex.c php_mbregex.c, $ext_shared)
+  PHP_NEW_EXTENSION(mbstring, mbfilter_ja.c mbfilter_cn.c mbfilter_tw.c mbfilter_kr.c mbfilter_ru.c mbfilter.c mbstring.c mbregex.c php_mbregex.c html_entities.c, $ext_shared)
 else
   PHP_MBSTR_ENC_TRANS=no
 fi
diff --git a/ext/mbstring/html_entities.c b/ext/mbstring/html_entities.c
new file mode 100644
index 0000000000..482c001bfd
--- /dev/null
+++ b/ext/mbstring/html_entities.c
@@ -0,0 +1,297 @@
+/*
+   +----------------------------------------------------------------------+
+   | PHP Version 4                                                        |
+   +----------------------------------------------------------------------+
+   | Copyright (c) 2001 The PHP Group                                     |
+   +----------------------------------------------------------------------+
+   | This source file is subject to version 2.02 of the PHP license,      |
+   | that is bundled with this package in the file LICENSE, and is        |
+   | available at through the world-wide-web at                           |
+   |                                                                      |
+   | If you did not receive a copy of the PHP license and are unable to   |
+   | obtain it through the world-wide-web, please send a note to          |
+   | so we can mail you a copy immediately.                               |
+   +----------------------------------------------------------------------+
+   | Author: Marcus Boerger                                               |
+   +----------------------------------------------------------------------+
+ */
+
+/* $Id$ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "php.h"
+#include "php_globals.h"
+
+#include <stddef.h> /* NULL */
+#include "mbfilter.h"
+
+/*
+ * Named HTML entities and their character codes, closed by a {NULL, -1}
+ * sentinel.  Keep the entries in ascending code order: the encoder in
+ * mbfilter.c stops searching at the first entry with a larger code.
+ */
+const mbfl_html_entity mbfl_html_entity_list[] = { /*
+	{"quot", 34},	DO NOT CONVERT THESE AUTOMATICALLY
+	{"amp", 38},
+	{"lt", 60},
+	{"gt", 62}, */
+	{"nbsp", 160},
+	{"iexcl", 161},
+	{"cent", 162},
+	{"pound", 163},
+	{"curren", 164},
+	{"yen", 165},
+	{"brvbar", 166},
+	{"sect", 167},
+	{"uml", 168},
+	{"copy", 169},
+	{"ordf", 170},
+	{"laquo", 171},
+	{"not", 172},
+	{"shy", 173},
+	{"reg", 174},
+	{"macr", 175},
+	{"deg", 176},
+	{"plusmn", 177},
+	{"sup2", 178},
+	{"sup3", 179},
+	{"acute", 180},
+	{"micro", 181},
+	{"para", 182},
+	{"middot", 183},
+	{"cedil", 184},
+	{"sup1", 185},
+	{"ordm", 186},
+	{"raquo", 187},
+	{"frac14", 188},
+	{"frac12", 189},
+	{"frac34", 190},
+	{"iquest", 191},
+	{"Agrave", 192},
+	{"Aacute", 193},
+	{"Acirc", 194},
+	{"Atilde", 195},
+	{"Auml", 196},
+	{"Aring", 197},
+	{"AElig", 198},
+	{"Ccedil", 199},
+	{"Egrave", 200},
+	{"Eacute", 201},
+	{"Ecirc", 202},
+	{"Euml", 203},
+	{"Igrave", 204},
+	{"Iacute", 205},
+	{"Icirc", 206},
+	{"Iuml", 207},
+	{"ETH", 208},
+	{"Ntilde", 209},
+	{"Ograve", 210},
+	{"Oacute", 211},
+	{"Ocirc", 212},
+	{"Otilde", 213},
+	{"Ouml", 214},
+	{"times", 215},
+	{"Oslash", 216},
+	{"Ugrave", 217},
+	{"Uacute", 218},
+	{"Ucirc", 219},
+	{"Uuml", 220},
+	{"Yacute", 221},
+	{"THORN", 222},
+	{"szlig", 223},
+	{"agrave", 224},
+	{"aacute", 225},
+	{"acirc", 226},
+	{"atilde", 227},
+	{"auml", 228},
+	{"aring", 229},
+	{"aelig", 230},
+	{"ccedil", 231},
+	{"egrave", 232},
+	{"eacute", 233},
+	{"ecirc", 234},
+	{"euml", 235},
+	{"igrave", 236},
+	{"iacute", 237},
+	{"icirc", 238},
+	{"iuml", 239},
+	{"eth", 240},
+	{"ntilde", 241},
+	{"ograve", 242},
+	{"oacute", 243},
+	{"ocirc", 244},
+	{"otilde", 245},
+	{"ouml", 246},
+	{"divide", 247},
+	{"oslash", 248},
+	{"ugrave", 249},
+	{"uacute", 250},
+	{"ucirc", 251},
+	{"uuml", 252},
+	{"yacute", 253},
+	{"thorn", 254},
+	{"yuml", 255},
+	{"OElig", 338},
+	{"oelig", 339},
+	{"Scaron", 352},
+	{"scaron", 353},
+	{"Yuml", 376},
+	{"fnof", 402},
+	{"circ", 710},
+	{"tilde", 732},
+	{"Alpha", 913},
+	{"Beta", 914},
+	{"Gamma", 915},
+	{"Delta", 916},
+	{"Epsilon", 917},
+	{"Zeta", 918},
+	{"Eta", 919},
+	{"Theta", 920},
+	{"Iota", 921},
+	{"Kappa", 922},
+	{"Lambda", 923},
+	{"Mu", 924},
+	{"Nu", 925},
+	{"Xi", 926},
+	{"Omicron", 927},
+	{"Pi", 928},
+	{"Rho", 929},
+	{"Sigma", 931},
+	{"Tau", 932},
+	{"Upsilon", 933},
+	{"Phi", 934},
+	{"Chi", 935},
+	{"Psi", 936},
+	{"Omega", 937},
+	{"alpha", 945},
+	{"beta", 946},
+	{"gamma", 947},
+	{"delta", 948},
+	{"epsilon", 949},
+	{"zeta", 950},
+	{"eta", 951},
+	{"theta", 952},
+	{"iota", 953},
+	{"kappa", 954},
+	{"lambda", 955},
+	{"mu", 956},
+	{"nu", 957},
+	{"xi", 958},
+	{"omicron", 959},
+	{"pi", 960},
+	{"rho", 961},
+	{"sigmaf", 962},
+	{"sigma", 963},
+	{"tau", 964},
+	{"upsilon", 965},
+	{"phi", 966},
+	{"chi", 967},
+	{"psi", 968},
+	{"omega", 969},
+	{"thetasym", 977},
+	{"upsih", 978},
+	{"piv", 982},
+	{"ensp", 8194},
+	{"emsp", 8195},
+	{"thinsp", 8201},
+	{"zwnj", 8204},
+	{"zwj", 8205},
+	{"lrm", 8206},
+	{"rlm", 8207},
+	{"ndash", 8211},
+	{"mdash", 8212},
+	{"lsquo", 8216},
+	{"rsquo", 8217},
+	{"sbquo", 8218},
+	{"ldquo", 8220},
+	{"rdquo", 8221},
+	{"bdquo", 8222},
+	{"dagger", 8224},
+	{"Dagger", 8225},
+	{"bull", 8226},
+	{"hellip", 8230},
+	{"permil", 8240},
+	{"prime", 8242},
+	{"Prime", 8243},
+	{"lsaquo", 8249},
+	{"rsaquo", 8250},
+	{"oline", 8254},
+	{"frasl", 8260},
+	{"euro", 8364},
+	{"image", 8465},
+	{"weierp", 8472},
+	{"real", 8476},
+	{"trade", 8482},
+	{"alefsym", 8501},
+	{"larr", 8592},
+	{"uarr", 8593},
+	{"rarr", 8594},
+	{"darr", 8595},
+	{"harr", 8596},
+	{"crarr", 8629},
+	{"lArr", 8656},
+	{"uArr", 8657},
+	{"rArr", 8658},
+	{"dArr", 8659},
+	{"hArr", 8660},
+	{"forall", 8704},
+	{"part", 8706},
+	{"exist", 8707},
+	{"empty", 8709},
+	{"nabla", 8711},
+	{"isin", 8712},
+	{"notin", 8713},
+	{"ni", 8715},
+	{"prod", 8719},
+	{"sum", 8721},
+	{"minus", 8722},
+	{"lowast", 8727},
+	{"radic", 8730},
+	{"prop", 8733},
+	{"infin", 8734},
+	{"ang", 8736},
+	{"and", 8743},
+	{"or", 8744},
+	{"cap", 8745},
+	{"cup", 8746},
+	{"int", 8747},
+	{"there4", 8756},
+	{"sim", 8764},
+	{"cong", 8773},
+	{"asymp", 8776},
+	{"ne", 8800},
+	{"equiv", 8801},
+	{"le", 8804},
+	{"ge", 8805},
+	{"sub", 8834},
+	{"sup", 8835},
+	{"nsub", 8836},
+	{"sube", 8838},
+	{"supe", 8839},
+	{"oplus", 8853},
+	{"otimes", 8855},
+	{"perp", 8869},
+	{"sdot", 8901},
+	{"lceil", 8968},
+	{"rceil", 8969},
+	{"lfloor", 8970},
+	{"rfloor", 8971},
+	{"lang", 9001},
+	{"rang", 9002},
+	{"loz", 9674},
+	{"spades", 9824},
+	{"clubs", 9827},
+	{"hearts", 9829},
+	{"diams", 9830},
+	{NULL, -1} /* mark end of table */
+};
+
+/*
+ * Local variables:
+ * tab-width: 4
+ * c-basic-offset: 4
+ * End:
+ */
diff --git a/ext/mbstring/mbfilter.c b/ext/mbstring/mbfilter.c
index 041e4e03b9..87e645e835 100644
--- a/ext/mbstring/mbfilter.c
+++ b/ext/mbstring/mbfilter.c
@@ -397,6 +397,25 @@ static const unsigned char mblen_table_uhc[] = { /* 0x81-0xFE */
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
 };
 
+static const unsigned char mblen_table_html[] = { /* 0x00, 0x80 - 0xFF, only valid for numeric entities */
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6
+};
+
 /* encoding structure */
 static const char *mbfl_encoding_pass_aliases[] = {"none", NULL};
 
@@ -483,6 +502,17 @@ static const mbfl_encoding mbfl_encoding_uuencode = {
 	MBFL_ENCTYPE_SBCS
 };
 
+static const char *mbfl_encoding_html_ent_aliases[] = {"HTML", "html", NULL};
+
+static const mbfl_encoding mbfl_encoding_html_ent = {
+	mbfl_no_encoding_html_ent,
+	"HTML-ENTITIES",
+	"html",
+	(const char *(*)[])&mbfl_encoding_html_ent_aliases,
+	NULL, /* mblen_table_html is not used here; the length is calculated from the entities actually used */
+	MBFL_ENCTYPE_HTML_ENT
+};
+
 static const char *mbfl_encoding_qprint_aliases[] = {"qprint", NULL};
 
 static const mbfl_encoding mbfl_encoding_qprint = {
@@ -1025,6 +1055,7 @@ static const mbfl_encoding *mbfl_encoding_ptr_list[] = {
 	&mbfl_encoding_byte4le,
 	&mbfl_encoding_base64,
 	&mbfl_encoding_uuencode,
+	&mbfl_encoding_html_ent,
 	&mbfl_encoding_qprint,
 	&mbfl_encoding_7bit,
 	&mbfl_encoding_8bit,
@@ -1115,6 +1146,13 @@ static int mbfl_filt_conv_base64dec(int c, mbfl_convert_filter *filter TSRMLS_DC
 static int mbfl_filt_conv_base64dec_flush(mbfl_convert_filter *filter TSRMLS_DC);
 static int mbfl_filt_conv_uudec(int c, mbfl_convert_filter *filter TSRMLS_DC);
 
+static void mbfl_filt_conv_html_dec_ctor(mbfl_convert_filter *filter TSRMLS_DC);
+static void mbfl_filt_conv_html_dec_dtor(mbfl_convert_filter *filter TSRMLS_DC);
+static int mbfl_filt_conv_html_enc(int c, mbfl_convert_filter *filter TSRMLS_DC);
+static int mbfl_filt_conv_html_enc_flush(mbfl_convert_filter *filter TSRMLS_DC);
+static int mbfl_filt_conv_html_dec(int c, mbfl_convert_filter *filter TSRMLS_DC);
+static int mbfl_filt_conv_html_dec_flush(mbfl_convert_filter *filter TSRMLS_DC);
+
 static int mbfl_filt_conv_qprintenc(int c, mbfl_convert_filter *filter TSRMLS_DC);
 static int mbfl_filt_conv_qprintenc_flush(mbfl_convert_filter *filter TSRMLS_DC);
 static int mbfl_filt_conv_qprintdec(int c, mbfl_convert_filter *filter TSRMLS_DC);
@@ -1307,6 +1345,22 @@ static const struct mbfl_convert_vtbl vtbl_uuencode_8bit = {
 	mbfl_filt_conv_uudec,
 	mbfl_filt_conv_common_flush };
 
+static const struct mbfl_convert_vtbl vtbl_wchar_html = {
+	mbfl_no_encoding_wchar,
+	mbfl_no_encoding_html_ent,
+	mbfl_filt_conv_common_ctor,
+	mbfl_filt_conv_common_dtor,
+	mbfl_filt_conv_html_enc,
+	mbfl_filt_conv_html_enc_flush };
+
+static const struct mbfl_convert_vtbl vtbl_html_wchar = {
+	mbfl_no_encoding_html_ent,
+	mbfl_no_encoding_wchar,
+	mbfl_filt_conv_html_dec_ctor,
+	mbfl_filt_conv_html_dec_dtor,
+	mbfl_filt_conv_html_dec,
+	mbfl_filt_conv_html_dec_flush };
+
 static const struct mbfl_convert_vtbl vtbl_8bit_qprint = {
 	mbfl_no_encoding_8bit,
 	mbfl_no_encoding_qprint,
@@ -2185,6 +2239,8 @@ static const struct mbfl_convert_vtbl *mbfl_convert_filter_list[] = {
 	&vtbl_8bit_b64,
 	&vtbl_b64_8bit,
 	&vtbl_uuencode_8bit,
+	&vtbl_wchar_html,
+	&vtbl_html_wchar,
 	&vtbl_8bit_qprint,
 	&vtbl_qprint_8bit,
 	&vtbl_8bit_7bit,
@@ -3467,6 +3523,237 @@ mbfl_filt_conv_base64dec_flush(mbfl_convert_filter *filter TSRMLS_DC)
 	return 0;
 }
 
+/*
+ * any => HTML
+ *
+ * Writes one wide character as HTML.  Code points 0..127 pass through
+ * unchanged (their mblen_table_html entry is 1).  Anything else becomes a
+ * named entity when mbfl_html_entity_list has one for it, otherwise a
+ * decimal numeric entity such as "&#8364;".  Returns c.
+ */
+static int
+mbfl_filt_conv_html_enc(int c, mbfl_convert_filter *filter TSRMLS_DC)
+{
+	int tmp[3 * sizeof(unsigned int)]; /* decimal digits, least significant first */
+	int n = 0;
+	unsigned int uc = (unsigned int)c;
+	const mbfl_html_entity *entity;
+	const char *name = NULL;
+
+	/* check both ends of the range: a negative c must not index the table */
+	if (c >= 0 && c < 256 && mblen_table_html[c] == 1) {
+		CK((*filter->output_function)(c, filter->data TSRMLS_CC));
+		return c;
+	}
+
+	CK((*filter->output_function)('&', filter->data TSRMLS_CC));
+	/* scan up to the sentinel so the lookup does not rely on table order */
+	for (entity = mbfl_html_entity_list; entity->name != NULL; entity++) {
+		if (entity->code == c) {
+			name = entity->name;
+			break;
+		}
+	}
+	if (name != NULL) {
+		for (; *name; name++) {
+			CK((*filter->output_function)((int)*name, filter->data TSRMLS_CC));
+		}
+	} else {
+		/* no name known: emit '#' followed by the decimal code */
+		CK((*filter->output_function)('#', filter->data TSRMLS_CC));
+		do {
+			tmp[n++] = '0' + (int)(uc % 10);
+			uc /= 10;
+		} while (uc);
+		while (n > 0) {
+			n--;
+			CK((*filter->output_function)(tmp[n], filter->data TSRMLS_CC));
+		}
+	}
+	CK((*filter->output_function)(';', filter->data TSRMLS_CC));
+	return c;
+}
+
+/* Nothing is buffered while encoding; only the filter state is reset. */
+static int
+mbfl_filt_conv_html_enc_flush(mbfl_convert_filter *filter TSRMLS_DC)
+{
+	filter->status = 0;
+	filter->cache = 0;
+	return 0;
+}
+
+/*
+ * HTML => any
+ *
+ * While an entity is being collected, filter->status is the number of
+ * characters held in the buffer that filter->cache refers to; 0 means that
+ * no entity is pending.
+ */
+#define html_enc_buffer_size	16
+static const char html_entity_chars[] = "#0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
+/*
+ * Allocates the fragment buffer.
+ * NOTE(review): the pointer is kept in filter->cache through an (int) cast,
+ * which drops bits wherever pointers are wider than int.  Fixing that needs
+ * a pointer-sized member in mbfl_convert_filter, whose declaration is not
+ * part of this patch - TODO confirm and follow up.
+ */
+static void
+mbfl_filt_conv_html_dec_ctor(mbfl_convert_filter *filter TSRMLS_DC)
+{
+	filter->status = 0;
+	filter->cache = (int)mbfl_malloc(html_enc_buffer_size);
+}
+
+/* Releases the fragment buffer; anything still pending is dropped. */
+static void
+mbfl_filt_conv_html_dec_dtor(mbfl_convert_filter *filter TSRMLS_DC)
+{
+	filter->status = 0;
+	if (filter->cache)
+	{
+		mbfl_free((void*)filter->cache);
+	}
+	filter->cache = 0;
+}
+
+/*
+ * Gives up on the pending fragment: warns about it and passes its
+ * characters through undecoded.
+ */
+static int
+mbfl_filt_conv_html_dec_fail(mbfl_convert_filter *filter TSRMLS_DC)
+{
+	char *buffer = (char*)filter->cache;
+
+	buffer[filter->status] = 0;
+	php_error(E_WARNING, "%s() mbstring cannot decode '%s'", get_active_function_name(TSRMLS_C), buffer);
+	return mbfl_filt_conv_html_dec_flush(filter TSRMLS_CC);
+}
+
+/*
+ * Consumes one input character.  Text outside of entities is passed on
+ * as is; "&name;" and "&#NNN;" are replaced by their character code.  A
+ * fragment that turns out not to be a known entity is reported with
+ * E_WARNING and passed through unchanged.  Returns c.
+ */
+static int
+mbfl_filt_conv_html_dec(int c, mbfl_convert_filter *filter TSRMLS_DC)
+{
+	int pos, ent = 0, valid = 0;
+	const mbfl_html_entity *entity;
+	char *buffer = (char*)filter->cache;
+
+	if (buffer == NULL)
+	{
+		/* the ctor could not allocate a buffer: decode nothing */
+		CK((*filter->output_function)(c, filter->data TSRMLS_CC));
+		return c;
+	}
+	if (!filter->status)
+	{
+		if (c == '&')
+		{
+			filter->status = 1;
+			buffer[0] = '&';
+		}
+		else
+		{
+			CK((*filter->output_function)(c, filter->data TSRMLS_CC));
+		}
+		return c;
+	}
+	if (c == ';')
+	{
+		buffer[filter->status] = 0;
+		if (buffer[1] == '#')
+		{
+			/* numeric entity: one or more decimal digits */
+			valid = filter->status > 2;
+			for (pos = 2; valid && pos < filter->status; pos++)
+			{
+				if (buffer[pos] < '0' || buffer[pos] > '9')
+				{
+					valid = 0;
+					break;
+				}
+				ent = ent * 10 + (buffer[pos] - '0');
+				/* stop before ent can overflow; no character lies above 0x10ffff */
+				valid = ent <= 0x10ffff;
+			}
+		}
+		else
+		{
+			/* named entity */
+			for (entity = mbfl_html_entity_list; entity->name; entity++)
+			{
+				if (!strcmp(buffer+1, entity->name))
+				{
+					ent = entity->code;
+					valid = 1;
+					break;
+				}
+			}
+		}
+		if (valid)
+		{
+			CK((*filter->output_function)(ent, filter->data TSRMLS_CC));
+			filter->status = 0;
+		}
+		else
+		{
+			/* keep the ';' so the fragment is passed through complete */
+			buffer[filter->status++] = ';';
+			CK(mbfl_filt_conv_html_dec_fail(filter TSRMLS_CC));
+		}
+		return c;
+	}
+	/* only 7 bit values are handed to strchr(), which would narrow c to char */
+	if (c > 0 && c < 0x80 && strchr(html_entity_chars, c) && !(c == '#' && filter->status != 1))
+	{
+		buffer[filter->status++] = (char)c;
+		if (filter->status + 1 == html_enc_buffer_size)
+		{
+			/* the fragment has filled the buffer: give up on it */
+			CK(mbfl_filt_conv_html_dec_fail(filter TSRMLS_CC));
+		}
+		return c;
+	}
+	/* c cannot be part of an entity: release the fragment, then handle c itself */
+	CK(mbfl_filt_conv_html_dec_fail(filter TSRMLS_CC));
+	if (c == '&')
+	{
+		filter->status = 1;
+		buffer[0] = '&';
+	}
+	else
+	{
+		CK((*filter->output_function)(c, filter->data TSRMLS_CC));
+	}
+	return c;
+}
+
+/* Passes a pending, incomplete fragment through undecoded. */
+static int
+mbfl_filt_conv_html_dec_flush(mbfl_convert_filter *filter TSRMLS_DC)
+{
+	int status, pos;
+	char *buffer;
+
+	buffer = (char*)filter->cache;
+	status = filter->status;
+	/* reset first so a failing output cannot leave a stale fragment behind */
+	filter->status = 0;
+	/* the buffer itself stays allocated until the dtor runs */
+	for (pos = 0; pos < status; pos++)
+	{
+		CK((*filter->output_function)((unsigned char)buffer[pos], filter->data TSRMLS_CC));
+	}
+	return 0;
+}
+
 /*
  * any => Quoted-Printable
  */
diff --git a/ext/mbstring/mbfilter.h b/ext/mbstring/mbfilter.h
index 3e3c9dd11c..563a11ea89 100644
--- a/ext/mbstring/mbfilter.h
+++ b/ext/mbstring/mbfilter.h
@@ -128,6 +128,7 @@ enum mbfl_no_encoding {
 	mbfl_no_encoding_byte4le,
 	mbfl_no_encoding_base64,
 	mbfl_no_encoding_uuencode,
+	mbfl_no_encoding_html_ent,
 	mbfl_no_encoding_qprint,
 	mbfl_no_encoding_7bit,
 	mbfl_no_encoding_8bit,
@@ -222,6 +223,7 @@ typedef struct _mbfl_encoding {
 #define MBFL_ENCTYPE_MWC4BE 0x00000400
 #define MBFL_ENCTYPE_MWC4LE 0x00000800
 #define MBFL_ENCTYPE_SHFTCODE 0x00001000
+#define MBFL_ENCTYPE_HTML_ENT 0x00002000
 
 /* wchar plane, special charactor */
 #define MBFL_WCSPLANE_MASK 0xffff
@@ -574,4 +576,14 @@ mbfl_html_numeric_entity(mbfl_string *string, mbfl_string *result, int *convmap,
 mbfl_string *
 mbfl_ja_jp_hantozen(mbfl_string *string, mbfl_string *result, int mode TSRMLS_DC);
 
+/*
+ * HTML Entity table
+ */
+typedef struct _mbfl_html_entity {
+	const char *name;	/* entity name without the leading '&' and trailing ';' */
+	int code;	/* character code the entity stands for; -1 in the end marker */
+} mbfl_html_entity;
+
+extern const mbfl_html_entity mbfl_html_entity_list[];
+
 #endif /* MBFL_MBFILTER_H */
diff --git a/ext/mbstring/tests/020.phpt b/ext/mbstring/tests/020.phpt
new file mode 100644
index 0000000000..cc13d9e3ff
--- /dev/null
+++ b/ext/mbstring/tests/020.phpt
@@ -0,0 +1,33 @@
+--TEST--
+HTML input/output
+--SKIPIF--
+
+--INI--
+arg_separator.input="x"
+error_reporting=0
+mbstring.http_input=HTML-ENTITIES
+mbstring.internal_encoding=UTF8
+mbstring.http_output=HTML-ENTITIES
+mbstring.encoding_translation=On
+--FILE--
+
+'.mb_internal_encoding().'>'.mb_http_output();?>
+
+
+--EXPECT--
+HTML-ENTITIES>UTF-8>HTML-ENTITIES
+test='&&;&@AB€‚äöü€⟨⟩'
\ No newline at end of file