From 7aa43a8d834e386a1b38490644983c0c3587dcd5 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Gustavo=20Andr=C3=A9=20dos=20Santos=20Lopes?= Date: Sun, 10 Oct 2010 19:04:59 +0000 Subject: [PATCH] - Revamp of the decoding portion of html.c. - Dramatic improvements in the performance of html_entity_decode and htmlspecialchars_decode, as the string is now traversed only once. Speedups of 20 to 25 times with Windows release builds and a ~250-character string (for 2nd and subsequent calls). - Consistent behavior in html_entity_decode. For instance, the entity in "&&lt;" would be decoded, but not the one in "&&eacute;". Not anymore. The code path for "basic" and non-basic entities is now mostly shared. - Code of html_entity_decode and htmlspecialchars_decode is now shared. - [DOC] More consistent behavior of htmlspecialchars_decode. Instead of translating only &lt;, &gt;, &amp;, &quot;, &#039; and &#39;, now e.g. &#34;, &#x27;, &#0039;, &#x0027;, etc. are also decoded. - [DOC] Previous translation of unicode code points in numerical entities was seriously broken. When the code points for some character were not the same in unicode and the target encoding, the behavior could be an erroneous translation (e.g. 0x80-0xA0 in win-1252) or no translation at all. Added unicode translation tables for all single-byte encodings. Entities are not translated for multi-byte encodings, except for ASCII characters whose code points are shared. We could add the huge translation tables (several thousand elements) for those encodings in the future. - Fixed numerical entities being accepted when the text after # was merely something accepted by strtol. - Much more commented and well-structured code... - Tests for get_html_translation_table() are broken. I started fixing the tests, but then I realized it was completely hopeless because get_html_translation_table() is broken by not handling multi-byte characters correctly. 
--- ext/standard/basic_functions.c | 5 + ext/standard/basic_functions.h | 4 + ext/standard/html.c | 1133 ++++----- ext/standard/html_tables.h | 2080 +++++++++++++++++ .../get_html_translation_table_basic1.phpt | 8 +- .../strings/html_entity_decode_cp866.phpt | 533 +++++ .../html_entity_decode_iso8859-15.phpt | 405 ++++ .../strings/html_entity_decode_iso8859-5.phpt | 405 ++++ .../strings/html_entity_decode_koi8-r.phpt | 533 +++++ .../strings/html_entity_decode_macroman.phpt | 540 +++++ .../strings/html_entity_decode_win1251.phpt | 537 +++++ .../strings/html_entity_decode_win1252.phpt | 169 ++ .../tests/strings/htmlentities17.phpt | 3 - 13 files changed, 5690 insertions(+), 665 deletions(-) create mode 100644 ext/standard/html_tables.h create mode 100644 ext/standard/tests/strings/html_entity_decode_cp866.phpt create mode 100644 ext/standard/tests/strings/html_entity_decode_iso8859-15.phpt create mode 100644 ext/standard/tests/strings/html_entity_decode_iso8859-5.phpt create mode 100644 ext/standard/tests/strings/html_entity_decode_koi8-r.phpt create mode 100644 ext/standard/tests/strings/html_entity_decode_macroman.phpt create mode 100644 ext/standard/tests/strings/html_entity_decode_win1251.phpt create mode 100644 ext/standard/tests/strings/html_entity_decode_win1252.phpt diff --git a/ext/standard/basic_functions.c b/ext/standard/basic_functions.c index 96201955be..a70a5b222b 100644 --- a/ext/standard/basic_functions.c +++ b/ext/standard/basic_functions.c @@ -3432,6 +3432,7 @@ static void basic_globals_ctor(php_basic_globals *basic_globals_p TSRMLS_DC) /* BG(left) = -1; BG(user_tick_functions) = NULL; BG(user_filter_map) = NULL; + BG(inverse_ent_maps) = NULL; memset(&BG(serialize), 0, sizeof(BG(serialize))); memset(&BG(unserialize), 0, sizeof(BG(unserialize))); @@ -3454,6 +3455,10 @@ static void basic_globals_dtor(php_basic_globals *basic_globals_p TSRMLS_DC) /* zend_hash_destroy(BG(url_adapt_state_ex).tags); free(BG(url_adapt_state_ex).tags); } + if 
(BG(inverse_ent_maps)) { + zend_hash_destroy(BG(inverse_ent_maps)); + pefree(BG(inverse_ent_maps), 1); + } } /* }}} */ diff --git a/ext/standard/basic_functions.h b/ext/standard/basic_functions.h index 4498e6cf8f..edc5846e0a 100644 --- a/ext/standard/basic_functions.h +++ b/ext/standard/basic_functions.h @@ -220,6 +220,10 @@ typedef struct _php_basic_globals { HashTable *user_filter_map; + /* html.c */ + /* map entities to characters. Stores hash table pointers for each charset */ + HashTable *inverse_ent_maps; + /* file.c */ #if defined(_REENTRANT) && defined(HAVE_MBRLEN) && defined(HAVE_MBSTATE_T) mbstate_t mblen_state; diff --git a/ext/standard/html.c b/ext/standard/html.c index 7a14f6b0ad..0ad34e52c4 100644 --- a/ext/standard/html.c +++ b/ext/standard/html.c @@ -14,7 +14,8 @@ +----------------------------------------------------------------------+ | Authors: Rasmus Lerdorf | | Jaakko Hyvätti | - | Wez Furlong | + | Wez Furlong | + | Gustavo Lopes | +----------------------------------------------------------------------+ */ @@ -28,7 +29,11 @@ * http://www.unicode.org/Public/MAPPINGS/OBSOLETE/UNI2SGML.TXT * * http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#h-A2 - * + * + * From HTML 4.01 strict DTD: + * http://www.w3.org/TR/html4/HTMLlat1.ent + * http://www.w3.org/TR/html4/HTMLsymbol.ent + * http://www.w3.org/TR/html4/HTMLspecial.ent */ #include "php.h" @@ -37,7 +42,7 @@ #else #include #endif -#include "html.h" +#include "php_standard.h" #include "php_string.h" #include "SAPI.h" #if HAVE_LOCALE_H @@ -52,424 +57,8 @@ ZEND_EXTERN_MODULE_GLOBALS(mbstring) #endif -enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252, - cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, - cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r, - cs_cp1251, cs_8859_5, cs_cp866, cs_macroman - }; -typedef const char *const entity_table_t; - -/* codepage 1252 is a Windows extension to iso-8859-1. 
*/ -static entity_table_t ent_cp_1252[] = { - "euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger", - "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig", - NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo", - "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo", - "oelig", NULL, NULL, "Yuml" -}; - -static entity_table_t ent_iso_8859_1[] = { - "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", - "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg", - "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro", - "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14", - "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc", - "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave", - "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", - "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", - "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", - "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", - "atilde", "auml", "aring", "aelig", "ccedil", "egrave", - "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", - "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", - "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc", - "uuml", "yacute", "thorn", "yuml" -}; - -static entity_table_t ent_iso_8859_15[] = { - "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron", - "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg", - "macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */ - "micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm", - "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute", - "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave", - "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", - "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", - "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", - "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", - "atilde", "auml", "aring", "aelig", 
"ccedil", "egrave", - "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", - "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", - "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc", - "uuml", "yacute", "thorn", "yuml" -}; - -static entity_table_t ent_uni_338_402[] = { - /* 338 (0x0152) */ - "OElig", "oelig", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 352 (0x0160) */ - "Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 376 (0x0178) */ - "Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 400 (0x0190) */ - NULL, NULL, "fnof" -}; - -static entity_table_t ent_uni_spacing[] = { - /* 710 */ - "circ", - /* 711 - 730 */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 731 - 732 */ - NULL, "tilde" -}; - -static entity_table_t ent_uni_greek[] = { - /* 913 */ - "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta", - "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho", - NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega", - /* 938 - 944 are not mapped */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", - "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", - "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega", - /* 970 - 976 are not mapped */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - "thetasym", "upsih", - NULL, NULL, NULL, - "piv" -}; - -static entity_table_t ent_uni_punct[] = { - /* 8194 */ - "ensp", "emsp", NULL, NULL, NULL, NULL, NULL, - "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm", - NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL, - /* 8216 */ - "lsquo", "rsquo", "sbquo", 
NULL, "ldquo", "rdquo", "bdquo", NULL, - "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip", - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL, - /* 8242 */ - "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL, - NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL, - "frasl" -}; - -static entity_table_t ent_uni_euro[] = { - "euro" -}; - -static entity_table_t ent_uni_8465_8501[] = { - /* 8465 */ - "image", NULL, NULL, NULL, NULL, NULL, NULL, - /* 8472 */ - "weierp", NULL, NULL, NULL, - /* 8476 */ - "real", NULL, NULL, NULL, NULL, NULL, - /* 8482 */ - "trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8501 */ - "alefsym", -}; - -static entity_table_t ent_uni_8592_9002[] = { - /* 8592 (0x2190) */ - "larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8608 (0x21a0) */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8624 (0x21b0) */ - NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8640 (0x21c0) */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8656 (0x21d0) */ - "lArr", "uArr", "rArr", "dArr", "hArr", NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8672 (0x21e0) */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8704 (0x2200) */ - "forall", NULL, "part", "exist", NULL, "empty", NULL, "nabla", - "isin", "notin", NULL, "ni", NULL, NULL, NULL, "prod", - /* 8720 (0x2210) */ - NULL, "sum", "minus", NULL, NULL, NULL, NULL, "lowast", - NULL, NULL, "radic", NULL, NULL, "prop", "infin", NULL, - /* 8736 (0x2220) */ - "ang", NULL, NULL, NULL, NULL, NULL, 
NULL, "and", - "or", "cap", "cup", "int", NULL, NULL, NULL, NULL, - /* 8752 (0x2230) */ - NULL, NULL, NULL, NULL, "there4", NULL, NULL, NULL, - NULL, NULL, NULL, NULL, "sim", NULL, NULL, NULL, - /* 8768 (0x2240) */ - NULL, NULL, NULL, NULL, NULL, "cong", NULL, NULL, - "asymp", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8784 (0x2250) */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8800 (0x2260) */ - "ne", "equiv", NULL, NULL, "le", "ge", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8816 (0x2270) */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8832 (0x2280) */ - NULL, NULL, "sub", "sup", "nsub", NULL, "sube", "supe", - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8848 (0x2290) */ - NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes", - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8864 (0x22a0) */ - NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8880 (0x22b0) */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8896 (0x22c0) */ - NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8912 (0x22d0) */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8928 (0x22e0) */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8944 (0x22f0) */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8960 (0x2300) */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - "lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL, - /* 8976 (0x2310) */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8992 (0x2320) */ - NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, - NULL, "lang", "rang" -}; - -static entity_table_t ent_uni_9674[] = { - /* 9674 */ - "loz" -}; - -static entity_table_t ent_uni_9824_9830[] = { - /* 9824 */ - "spades", NULL, NULL, "clubs", NULL, "hearts", "diams" -}; - -static entity_table_t ent_koi8r[] = { - "#1105", /* "jo "*/ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, "#1025", /* "JO" */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - "#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092", - "#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084", - "#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090", - "#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096", - "#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041", - "#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048", - "#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055", - "#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042", - "#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063", - "#1066" -}; - -static entity_table_t ent_cp_1251[] = { - "#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger", - "Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036", - "#1035", "#1039", "#1106", "#8216", "#8217", "#8219", "#8220", - "bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250", - "#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118", - "#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy", - "#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn", - "#1030", "#1110", "#1169", "micro", "para", "middot", "#1105", - "#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111", - "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046", - "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053", - "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060", - "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067", - "#1068", "#1069", 
"#1070", "#1071", "#1072", "#1073", "#1074", - "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081", - "#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088", - "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095", - "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102", - "#1103" -}; - -static entity_table_t ent_iso_8859_5[] = { - "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062", - "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069", - "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076", - "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083", - "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090", - "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097", - "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1104", - "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111", - "#1112", "#1113", "#1114", "#1115", "#1116", "#1117", "#1118", - "#1119" -}; - -static entity_table_t ent_cp_866[] = { - - "#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566", - "#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552", - "#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560", - "#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608", - "#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090", - "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097", - "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025", - "#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118", - "#176", "#8729", "#183", "#8730", "#8470", "#164", "#9632", - "#160" -}; - -/* MacRoman has a couple of low-ascii chars that need mapping too */ -/* Vertical tab (ASCII 11) is often used to store line breaks inside */ -/* DB exports, this mapping changes it to a space */ -static entity_table_t ent_macroman[] = { - "sp", NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, "quot", 
NULL, - NULL, NULL, "amp", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, "lt", NULL, "gt", NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml", - "Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring", - "ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave", - "icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml", - "otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg", - "cent", "pound", "sect", "bull", "para", "szlig", "reg", - "copy", "trade", "acute", "uml", "ne", "AElig", "Oslash", - "infin", "plusmn", "le", "ge", "yen", "micro", "part", - "sum", "prod", "pi", "int", "ordf", "ordm", "Omega", - "aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof", - "asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave", - "Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo", - "rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml", - "frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger", - "middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute", - "Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute", - "Ocirc", "#xF8FF", "Ograve", "Uacute", "Ucirc", "Ugrave", "#305", - "circ", "tilde", "macr", "#728", "#729", "#730", "cedil", - "#733", "#731", "#711" -}; - -struct html_entity_map { - enum entity_charset charset; /* charset identifier */ - unsigned int basechar; /* char code at start of table */ - unsigned int endchar; /* last char code in the table */ - entity_table_t *table; /* the table 
of mappings */ -}; - -static const struct html_entity_map entity_map[] = { - { cs_cp1252, 0x80, 0x9f, ent_cp_1252 }, - { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 }, - { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 }, - { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 }, - { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 }, - { cs_utf_8, 338, 402, ent_uni_338_402 }, - { cs_utf_8, 710, 732, ent_uni_spacing }, - { cs_utf_8, 913, 982, ent_uni_greek }, - { cs_utf_8, 8194, 8260, ent_uni_punct }, - { cs_utf_8, 8364, 8364, ent_uni_euro }, - { cs_utf_8, 8465, 8501, ent_uni_8465_8501 }, - { cs_utf_8, 8592, 9002, ent_uni_8592_9002 }, - { cs_utf_8, 9674, 9674, ent_uni_9674 }, - { cs_utf_8, 9824, 9830, ent_uni_9824_9830 }, - { cs_big5, 0xa0, 0xff, ent_iso_8859_1 }, - { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 }, - { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 }, - { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 }, - { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 }, - { cs_koi8r, 0xa3, 0xff, ent_koi8r }, - { cs_cp1251, 0x80, 0xff, ent_cp_1251 }, - { cs_8859_5, 0xc0, 0xff, ent_iso_8859_5 }, - { cs_cp866, 0xc0, 0xff, ent_cp_866 }, - { cs_macroman, 0x0b, 0xff, ent_macroman }, - { cs_terminator } -}; - -static const struct { - const char *codeset; - enum entity_charset charset; -} charset_map[] = { - { "ISO-8859-1", cs_8859_1 }, - { "ISO8859-1", cs_8859_1 }, - { "ISO-8859-15", cs_8859_15 }, - { "ISO8859-15", cs_8859_15 }, - { "utf-8", cs_utf_8 }, - { "cp1252", cs_cp1252 }, - { "Windows-1252", cs_cp1252 }, - { "1252", cs_cp1252 }, - { "BIG5", cs_big5 }, - { "950", cs_big5 }, - { "GB2312", cs_gb2312 }, - { "936", cs_gb2312 }, - { "BIG5-HKSCS", cs_big5hkscs }, - { "Shift_JIS", cs_sjis }, - { "SJIS", cs_sjis }, - { "932", cs_sjis }, - { "EUCJP", cs_eucjp }, - { "EUC-JP", cs_eucjp }, - { "KOI8-R", cs_koi8r }, - { "koi8-ru", cs_koi8r }, - { "koi8r", cs_koi8r }, - { "cp1251", cs_cp1251 }, - { "Windows-1251", cs_cp1251 }, - { "win-1251", cs_cp1251 }, - { "iso8859-5", cs_8859_5 }, - { "iso-8859-5", cs_8859_5 }, - { "cp866", cs_cp866 }, - { 
"866", cs_cp866 }, - { "ibm866", cs_cp866 }, - { "MacRoman", cs_macroman }, - { NULL } -}; - -static const struct { - unsigned short charcode; - char *entity; - int entitylen; - int flags; -} basic_entities[] = { - { '"', """, 6, ENT_HTML_QUOTE_DOUBLE }, - { '\'', "'", 6, ENT_HTML_QUOTE_SINGLE }, - { '\'', "'", 5, ENT_HTML_QUOTE_SINGLE }, - { '<', "<", 4, 0 }, - { '>', ">", 4, 0 }, - { 0, NULL, 0, 0 } -}; - -struct basic_entities_dec { - unsigned short charcode; - char entity[8]; - int entitylen; -}; - +#include "html_tables.h" + #define MB_RETURN { \ *newpos = pos; \ mbseq[mbpos] = '\0'; \ @@ -871,6 +460,8 @@ size_t php_utf32_utf8(unsigned char *buf, int k) { size_t retval = 0; + /* assert(0x0 <= k <= 0x10FFFF); */ + if (k < 0x80) { buf[0] = k; retval = 1; @@ -883,226 +474,492 @@ size_t php_utf32_utf8(unsigned char *buf, int k) buf[1] = 0x80 | ((k >> 6) & 0x3f); buf[2] = 0x80 | (k & 0x3f); retval = 3; - } else if (k < 0x200000) { + } else { buf[0] = 0xf0 | (k >> 18); buf[1] = 0x80 | ((k >> 12) & 0x3f); buf[2] = 0x80 | ((k >> 6) & 0x3f); buf[3] = 0x80 | (k & 0x3f); retval = 4; - } else if (k < 0x4000000) { - buf[0] = 0xf8 | (k >> 24); - buf[1] = 0x80 | ((k >> 18) & 0x3f); - buf[2] = 0x80 | ((k >> 12) & 0x3f); - buf[3] = 0x80 | ((k >> 6) & 0x3f); - buf[4] = 0x80 | (k & 0x3f); - retval = 5; - } else { - buf[0] = 0xfc | (k >> 30); - buf[1] = 0x80 | ((k >> 24) & 0x3f); - buf[2] = 0x80 | ((k >> 18) & 0x3f); - buf[3] = 0x80 | ((k >> 12) & 0x3f); - buf[4] = 0x80 | ((k >> 6) & 0x3f); - buf[5] = 0x80 | (k & 0x3f); - retval = 6; } - buf[retval] = '\0'; + /* UTF-8 has been restricted to max 4 bytes since RFC 3629 */ return retval; } /* }}} */ -/* {{{ php_unescape_html_entities +/* {{{ unimap_bsearc_cmp + * Binary search of unicode code points in unicode <--> charset mapping. + * Returns the code point in the target charset (whose mapping table was given) or 0 if + * the unicode code point is not in the table. 
*/ -PHPAPI char *php_unescape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC) +static unsigned char unimap_bsearch(const unicode_mapping *table, unsigned code_key_a, size_t num) { - int retlen, j; - unsigned int k; - char *replaced, *ret, *p, *q, *lim, *next; - enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC); - unsigned char replacement[15]; - int replacement_len; + const unicode_mapping *l = table, + *h = &table[num-1], + *m; + unsigned short code_key; + + /* we have no mappings outside the BMP */ + if (code_key_a > 0xFFFFU) + return 0; + + code_key = (unsigned short) code_key_a; + + while (l <= h) { + m = l + (h - l) / 2; + if (code_key < m->un_code_point) + h = m - 1; + else if (code_key > m->un_code_point) + l = m + 1; + else + return m->cs_code; + } + return 0; +} +/* }}} */ - ret = estrndup(old, oldlen); - retlen = oldlen; - if (!retlen) { - goto empty_source; +/* {{{ map_from_unicode */ +static int map_from_unicode(unsigned code, enum entity_charset charset, unsigned *res) +{ + unsigned char found; + const unicode_mapping *table; + size_t table_size; + + switch (charset) { + case cs_8859_1: + /* identity mapping of code points to unicode */ + if (code > 0xFF) { + return FAILURE; + } + *res = code; + break; + + case cs_8859_5: + if (code <= 0xA0 || code == 0xAD /* soft hyphen */) { + *res = code; + } else if (code == 0x2116) { + *res = 0xF0; /* numero sign */ + } else if (code == 0xA7) { + *res = 0xFD; /* section sign */ + } else if (code >= 0x0401 && code <= 0x044F) { + if (code == 0x040D || code == 0x0450 || code == 0x045D) + return FAILURE; + *res = code - 0x360; + } else { + return FAILURE; + } + break; + + case cs_8859_15: + if (code < 0xA4 || (code > 0xBE && code <= 0xFF)) { + *res = code; + } else { /* between A4 and 0xBE */ + found = unimap_bsearch(unimap_iso885915, + code, sizeof(unimap_iso885915) / sizeof(*unimap_iso885915)); + if (found) + *res = found; + 
else + return FAILURE; + } + break; + + case cs_cp1252: + if (code <= 0x7F || (code >= 0xA0 && code <= 0xFF)) { + *res = code; + } else { + found = unimap_bsearch(unimap_win1252, + code, sizeof(unimap_win1252) / sizeof(*unimap_win1252)); + if (found) + *res = found; + else + return FAILURE; + } + break; + + case cs_macroman: + if (code == 0x7F) + return FAILURE; + table = unimap_macroman; + table_size = sizeof(unimap_macroman) / sizeof(*unimap_macroman); + goto table_over_7F; + case cs_cp1251: + table = unimap_win1251; + table_size = sizeof(unimap_win1251) / sizeof(*unimap_win1251); + goto table_over_7F; + case cs_koi8r: + table = unimap_koi8r; + table_size = sizeof(unimap_koi8r) / sizeof(*unimap_koi8r); + goto table_over_7F; + case cs_cp866: + table = unimap_cp866; + table_size = sizeof(unimap_cp866) / sizeof(*unimap_cp866); + +table_over_7F: + if (code <= 0x7F) { + *res = code; + } else { + found = unimap_bsearch(table, code, table_size); + if (found) + *res = found; + else + return FAILURE; + } + break; + + /* from here on, only map the possible characters in the ASCII range. + * to improve support here, it's a matter of building the unicode mappings. 
+ * See */ + case cs_sjis: + case cs_eucjp: + if (code >= 0x20 && code <= 0x7D) { + if (code == 0x5C) /* 0x5C is mapped to the yen symbol */ + return FAILURE; + *res = code; + } else { + return FAILURE; + } + break; + + case cs_big5: + case cs_big5hkscs: + case cs_gb2312: + if (code >= 0x20 && code <= 0x7D) { + *res = code; + } else { + return FAILURE; + } + break; + + default: + return FAILURE; } - - if (all) { - /* look for a match in the maps for this charset */ - for (j = 0; entity_map[j].charset != cs_terminator; j++) { - if (entity_map[j].charset != charset) - continue; - for (k = entity_map[j].basechar; k <= entity_map[j].endchar; k++) { - unsigned char entity[32]; - int entity_length = 0; + return SUCCESS; +} +/* }}} */ - if (entity_map[j].table[k - entity_map[j].basechar] == NULL) - continue; +/* {{{ process_numeric_entity + * Auxiliary function to traverse_for_entities. + * On input, *buf should point to the first character after # and on output, it's the last + * byte read, no matter if there was success or insuccess. + */ +static int process_numeric_entity(char **buf, unsigned *code_point, int all) +{ + long code_l; + int hexadecimal = (**buf == 'x' || **buf == 'X'); - entity_length = slprintf(entity, sizeof(entity), "&%s;", entity_map[j].table[k - entity_map[j].basechar]); - if (entity_length >= sizeof(entity)) { - continue; - } + if (hexadecimal) + (*buf)++; + + /* strtol allows whitespace and other stuff in the beginning + * we're not interested */ + if (hexadecimal && !isxdigit(**buf) || + !hexadecimal && !isdigit(**buf)) { + return FAILURE; + } - /* When we have MBCS entities in the tables above, this will need to handle it */ - replacement_len = 0; - switch (charset) { - case cs_8859_1: - case cs_cp1252: - case cs_8859_15: - case cs_cp1251: - case cs_8859_5: - case cs_cp866: - case cs_koi8r: - replacement[0] = k; - replacement[1] = '\0'; - replacement_len = 1; - break; + code_l = strtol(*buf, buf, hexadecimal ? 
16 : 10); - case cs_big5: - case cs_gb2312: - case cs_big5hkscs: - case cs_sjis: - case cs_eucjp: - /* we cannot properly handle those multibyte encodings - * with php_str_to_str. skip it. */ - continue; + if (**buf != ';') + return FAILURE; - case cs_utf_8: - replacement_len = php_utf32_utf8(replacement, k); - break; + /* many more are invalid, but that depends on whether it's HTML + * (and which version) or XML. Rejecting 0 is handy because that's + * the return of strtol if no character was read */ + if (code_l <= 0L || code_l > 0x10FFFFL) + return FAILURE; + + *code_point = (unsigned)code_l; - default: - php_error_docref(NULL TSRMLS_CC, E_WARNING, "cannot yet handle MBCS!"); - efree(ret); - return NULL; - } + if (!all) { + if (*code_point != '\'' && *code_point != '"') + return FAILURE; + } - if (php_memnstr(ret, entity, entity_length, ret+retlen)) { - replaced = php_str_to_str(ret, retlen, entity, entity_length, replacement, replacement_len, &retlen); - efree(ret); - ret = replaced; - } - } - } + return SUCCESS; +} +/* }}} */ + +/* {{{ process_named_entity */ +static int process_named_entity(char **buf, unsigned *code_unit_seq, HashTable *inv_map) +{ + size_t length; + char *start = *buf; + unsigned *stored_code; + + /* "&" is represented by a 0x26 in all supported encodings. That means + * the byte after represents a character or is the leading byte of an + * sequence of 8-bit code units. 
If in the ranges below, it represents + * necessarily a alpha character because none of the supported encodings + * has an overlap with ASCII in the leading byte (only on the second one) */ + while (**buf >= 'a' && **buf <= 'z' || + **buf >= 'A' && **buf <= 'Z' || + **buf >= '0' && **buf <= '9') { + (*buf)++; } - for (j = 0; basic_entities[j].charcode != 0; j++) { + if (**buf != ';') + return FAILURE; + + /* cast to size_t OK as the quantity is always non-negative */ + length = *buf - start; + if (length == 0 || length > 31) /* 31 is arbitrary */ + return FAILURE; + + if (zend_hash_find(inv_map, start, (uint)length, (void**)&stored_code) == FAILURE) + return FAILURE; + + *code_unit_seq = *stored_code; + + return SUCCESS; +} +/* }}} */ + +/* {{{ traverse_for_entities + * Auxiliary function to php_unescape_html_entities(). + * - The argument "all" determines if all numeric entities are decode or only those + * that correspond to quotes (depending on quote_style). Typically used with the inv_map + * stored under the key 0 in BG(inverse_ent_maps). + * - Using cs_terminator as charset is legal and has the effect of defaulting to UTF-8. Used + * when the encoding doesn't (or shouldn't...) matter. + */ +static void traverse_for_entities(char *ret, int *retlen_p, int all, int quote_style, HashTable *inv_map, enum entity_charset charset) +{ + int retlen; + char *p, *q, *lim; + + /* note: this function assumes the entities always take equal or more space + * than the characters they represent in whatever supported external encoding. + * The supported encoding that can generate the longest code unit sequences is + * UTF-8 (4 bytes). Theoretically, there could be entities with only 3 chars + * (e.g. &z;) that would map to outside-the-BMP unicode code points and hence + * needed 4 bytes and would overflow, but we have no such thing. 
*/ + + if (charset == cs_terminator) /* caller doesn't care; we choose one */ + charset = cs_utf_8; + + retlen = *retlen_p; - if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0) + lim = ret + retlen; /* terminator address */ + assert(*lim == '\0'); + + for (p = ret, q = ret; p < lim;) { + unsigned code; + char *next = NULL; + /* code is unicode code point or a set of 8-bit code units packed into + * an integer with the least significant bit being the last byte? */ + int unicode; + + /* Shift JIS, Big5 and HKSCS use multi-byte encodings where an + * ASCII range byte can be part of a multi-byte sequence. + * However, they start at 0x40, therefore if we find a 0x26 byte, + * we're sure it represents the '&' character. */ + + /* assumes there are no single-char entities */ + if (p[0] != '&' || (p + 3 >= lim)) { + *(q++) = *(p++); continue; + } + + /* now p[3] is surely valid and is no terminator */ + + /* numerical entity */ + if (p[1] == '#') { + next = &p[2]; + if (process_numeric_entity(&next, &code, all) == FAILURE) + goto invalid_code; + unicode = 1; + } else if (inv_map != NULL) { + next = &p[1]; + if (process_named_entity(&next, &code, inv_map) == FAILURE) + goto invalid_code; + unicode = 0; + } else { + goto invalid_code; + } - replacement[0] = (unsigned char)basic_entities[j].charcode; - replacement[1] = '\0'; + assert(*next == ';'); + + if (code == '\'' && !(quote_style & ENT_HTML_QUOTE_SINGLE) || + code == '"' && !(quote_style & ENT_HTML_QUOTE_DOUBLE)) + goto invalid_code; + + if (unicode && charset != cs_utf_8) { + /* replace unicode code point */ + if (map_from_unicode(code, charset, &code) == FAILURE) + goto invalid_code; /* not representable in target charset */ + } + + switch (charset) { + case cs_utf_8: + { + size_t written; + written = php_utf32_utf8((unsigned char*)q, code); + q += written; + /* Since we're writing in place, we hope we didn't write more than we read */ + assert(written <= (size_t)(next - p) + 1); + break; + } 
+ + case cs_8859_1: + case cs_cp1252: + case cs_8859_15: + case cs_koi8r: + case cs_cp1251: + case cs_8859_5: + case cs_cp866: + case cs_macroman: + /* single byte stuff */ + *(q++) = code; + break; + + case cs_big5: + case cs_big5hkscs: + case cs_sjis: + case cs_gb2312: + /* one or two bytes */ + *(q++) = (code & 0xFFU); + if (0xFF00U & code) { /* 2 */ + *(q++) = (code >> 8); + } + break; + + case cs_eucjp: + /* one to three bytes */ + *(q++) = code & 0xFFU; + if (0xFFFF00U & code) { /* 2 */ + *(q++) = ((code >> 8) & 0xFFU); + if (0xFF0000U & code) /* 3 */ + *(q++) = (code >> 16); + } + break; + + default: + /* for backwards compatilibity */ + goto invalid_code; + break; + } + + /* jump over the valid entity; may go beyond size of buffer; np */ + p = next + 1; + continue; - if (php_memnstr(ret, basic_entities[j].entity, basic_entities[j].entitylen, ret+retlen)) { - replaced = php_str_to_str(ret, retlen, basic_entities[j].entity, basic_entities[j].entitylen, replacement, 1, &retlen); - efree(ret); - ret = replaced; +invalid_code: + for (; p < next; p++) { + *(q++) = *p; } } + + *q = '\0'; + *retlen_p = (size_t)(q - ret); +} +/* }}} */ - /* replace numeric entities & "&" */ - lim = ret + retlen; - for (p = ret, q = ret; p < lim;) { - int code; +/* {{{ inv_ent_maps_dtor + * Hash table destructor for BG(inverse_ent_maps) + */ +static void inv_ent_maps_dtor(HashTable **ht) { + zend_hash_destroy(*ht); + pefree(*ht, 1); +} +/* }}} */ - if (p[0] == '&') { - if (p + 2 < lim) { - if (p[1] == '#') { - int invalid_code = 0; +/* {{{ unescape_inverse_map + * Auxiliary function to php_unescape_html_entities() + * charset can be cs_terminator for only basic entities. 
+ */ +static HashTable *unescape_inverse_map(enum entity_charset charset TSRMLS_DC) +{ + HashTable **inverse_map; - if (p[2] == 'x' || p[2] == 'X') { - code = strtol(p + 3, &next, 16); - } else { - code = strtol(p + 2, &next, 10); - } + /* we accept charset = cs_terminator (for specialchars) */ - if (code == '\'' && !(quote_style & ENT_HTML_QUOTE_SINGLE) || - code == '"' && !(quote_style & ENT_HTML_QUOTE_DOUBLE)) { - invalid_code = 1; - } + if (!BG(inverse_ent_maps)) { + BG(inverse_ent_maps) = pemalloc(sizeof *BG(inverse_ent_maps), 1); + zend_hash_init(BG(inverse_ent_maps), cs_numelems, NULL, (dtor_func_t)inv_ent_maps_dtor, 1); + } + if (zend_hash_index_find(BG(inverse_ent_maps), (ulong)charset, (void**)&inverse_map) == FAILURE) { + HashTable *ht = pemalloc(sizeof *ht, 1); + uint capacity = 0; + int j, t; - if (next != NULL && *next == ';' && !invalid_code) { - switch (charset) { - case cs_utf_8: - q += php_utf32_utf8(q, code); - break; - - case cs_8859_1: - case cs_8859_5: - case cs_8859_15: - if ((code >= 0x80 && code < 0xa0) || code > 0xff) { - invalid_code = 1; - } else { - *(q++) = code; - } - break; + /* determine upper bound for capacity of hashtable */ + for (j = 0; entity_map[j].charset != cs_terminator; j++) { + if (entity_map[j].charset == charset) + capacity += entity_map[j].endchar - entity_map[j].basechar + 1; + } - case cs_cp1252: - if (code > 0xff) { - invalid_code = 1; - } else { - *(q++) = code; - } - break; - - case cs_cp1251: - case cs_cp866: - case cs_big5: - case cs_big5hkscs: - case cs_sjis: - case cs_eucjp: - if (code >= 0x80) { - invalid_code = 1; - } else { - *(q++) = code; - } - break; + /* no destructor as we'll be storing ints */ + zend_hash_init(ht, capacity, NULL, NULL, 1); - case cs_gb2312: - if (code >= 0x81) { - invalid_code = 1; - } else { - *(q++) = code; - } - break; + /* store new hash table */ + t = zend_hash_index_update(BG(inverse_ent_maps), (ulong)charset, &ht, sizeof(ht), (void**)&inverse_map); + assert(t == SUCCESS); - 
default: - /* for backwards compatilibity */ - invalid_code = 1; - break; - } - if (invalid_code) { - for (; p <= next; p++) { - *(q++) = *p; - } - } - p = next + 1; - } else { - *(q++) = *(p++); - *(q++) = *(p++); - } - } else if (p + 4 < lim && - p[1] == 'a' && p[2] == 'm' &&p[3] == 'p' && - p[4] == ';') { - *(q++) = '&'; - p += 5; - } else { - *(q++) = *(p++); - *(q++) = *(p++); - } - } else { - *(q++) = *(p++); + /* build inverse map */ + for (j = 0; entity_map[j].charset != cs_terminator; j++) { + unsigned k; + + if (entity_map[j].charset != charset) + continue; + + for (k = entity_map[j].basechar; k <= entity_map[j].endchar; k++) { + unsigned table_offset = k - entity_map[j].basechar; + const char* entity_name = entity_map[j].table[table_offset]; + + if (entity_name == NULL || *entity_name == '#') + continue; + + t = zend_hash_update(ht, entity_name, strlen(entity_name), &k, sizeof(k), NULL); + assert(t == SUCCESS); } - } else { - *(q++) = *(p++); + } + + /* and add the basic entitites */ + for (j = 0; basic_entities_ex[j].charcode != 0; j++) { + const basic_entity_t *ent = &basic_entities_ex[j]; + unsigned k = ent->charcode; + + t = zend_hash_update(ht, &ent->entity[1] /* skip & */, + ent->entitylen - 2 /* skip & and ; */, &k, sizeof(k), NULL); + assert(t == SUCCESS); } } - *q = '\0'; - retlen = (size_t)(q - ret); + + return *inverse_map; +} + +/* {{{ php_unescape_html_entities + * The parameter "all" should be true to decode all possible entities, false to decode + * only the basic ones, i.e., those in basic_entities_ex + the numeric entities + * that correspond to quotes. 
+ */ +PHPAPI char *php_unescape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC) +{ + int retlen; + char *ret; + enum entity_charset charset; + HashTable *inverse_map = NULL; + + if (all) { + charset = determine_charset(hint_charset TSRMLS_CC); + } else { + charset = cs_terminator; + } + + ret = estrndup(old, oldlen); + retlen = oldlen; + if (retlen == 0) { + goto empty_source; + } + + /* charset == cs_terminator if !all */ + inverse_map = unescape_inverse_map(charset TSRMLS_CC); + + /* replace numeric entities */ + /* !all implies charset == cs_terminator && inverse_map == BG(inverse_ent_maps)[0] */ + traverse_for_entities(ret, &retlen, all, quote_style, inverse_map, charset); + empty_source: *newlen = retlen; return ret; @@ -1315,65 +1172,20 @@ PHP_FUNCTION(htmlspecialchars) Convert special HTML entities back to characters */ PHP_FUNCTION(htmlspecialchars_decode) { - char *str, *new_str, *e, *p; - int len, j, i, new_len; + char *str; + int str_len, len; long quote_style = ENT_COMPAT; - struct basic_entities_dec basic_entities_dec[8]; + char *replaced; - if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &len, &quote_style) == FAILURE) { + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &str_len, &quote_style) == FAILURE) { return; } - new_str = estrndup(str, len); - new_len = len; - e = new_str + new_len; - - if (!(p = memchr(new_str, '&', new_len))) { - RETURN_STRINGL(new_str, new_len, 0); - } - - for (j = 0, i = 0; basic_entities[i].charcode != 0; i++) { - if (basic_entities[i].flags && !(quote_style & basic_entities[i].flags)) { - continue; - } - basic_entities_dec[j].charcode = basic_entities[i].charcode; - memcpy(basic_entities_dec[j].entity, basic_entities[i].entity, basic_entities[i].entitylen + 1); - basic_entities_dec[j].entitylen = basic_entities[i].entitylen; - j++; + replaced = php_unescape_html_entities(str, str_len, &len, 0 /*!all*/, quote_style, NULL
TSRMLS_CC); + if (replaced) { + RETURN_STRINGL(replaced, len, 0); } - basic_entities_dec[j].charcode = '&'; - basic_entities_dec[j].entitylen = sizeof("&amp;") - 1; - memcpy(basic_entities_dec[j].entity, "&amp;", sizeof("&amp;")); - i = j + 1; - - do { - int l = e - p; - - for (j = 0; j < i; j++) { - if (basic_entities_dec[j].entitylen > l) { - continue; - } - if (!memcmp(p, basic_entities_dec[j].entity, basic_entities_dec[j].entitylen)) { - int e_len = basic_entities_dec[j].entitylen - 1; - - *p++ = (char) basic_entities_dec[j].charcode; - memmove(p, p + e_len, (e - p - e_len)); - e -= e_len; - goto done; - } - } - p++; - -done: - if (p >= e) { - break; - } - } while ((p = memchr(p, '&', (e - p)))); - - new_len = e - new_str; - - new_str[new_len] = '\0'; - RETURN_STRINGL(new_str, new_len, 0); + RETURN_FALSE; } /* }}} */ @@ -1391,7 +1203,7 @@ PHP_FUNCTION(html_entity_decode) return; } - replaced = php_unescape_html_entities(str, str_len, &len, 1, quote_style, hint_charset TSRMLS_CC); + replaced = php_unescape_html_entities(str, str_len, &len, 1 /*all*/, quote_style, hint_charset TSRMLS_CC); if (replaced) { RETURN_STRINGL(replaced, len, 0); } @@ -1446,15 +1258,20 @@ PHP_FUNCTION(get_html_translation_table) /* break thru */ case HTML_SPECIALCHARS: - for (j = 0; basic_entities[j].charcode != 0; j++) { + for (j = 0; basic_entities_ex[j].charcode != 0; j++) { + void *dummy; - if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0) + if (basic_entities_ex[j].flags && (quote_style & basic_entities_ex[j].flags) == 0) continue; - ind[0] = (unsigned char)basic_entities[j].charcode; - add_assoc_stringl(return_value, ind, basic_entities[j].entity, basic_entities[j].entitylen, 1); + ind[0] = (unsigned char)basic_entities_ex[j].charcode; + if (zend_hash_find(Z_ARRVAL_P(return_value), ind, sizeof(ind), &dummy) == FAILURE) { + /* in case of the single quote, which is repeated, the first one wins, + * so don't replace the existint mapping */ +
add_assoc_stringl(return_value, ind, basic_entities_ex[j].entity, + basic_entities_ex[j].entitylen, 1); + } } - add_assoc_stringl(return_value, "&", "&amp;", sizeof("&amp;") - 1, 1); break; } diff --git a/ext/standard/html_tables.h b/ext/standard/html_tables.h new file mode 100644 index 0000000000..d3a638b695 --- /dev/null +++ b/ext/standard/html_tables.h @@ -0,0 +1,2080 @@ +/* + +----------------------------------------------------------------------+ + | PHP Version 5 | + +----------------------------------------------------------------------+ + | Copyright (c) 1997-2010 The PHP Group | + +----------------------------------------------------------------------+ + | This source file is subject to version 3.01 of the PHP license, | + | that is bundled with this package in the file LICENSE, and is | + | available through the world-wide-web at the following url: | + | http://www.php.net/license/3_01.txt | + | If you did not receive a copy of the PHP license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@php.net so we can mail you a copy immediately. | + +----------------------------------------------------------------------+ + | Author: Rasmus Lerdorf | + +----------------------------------------------------------------------+ +*/ + +/* $Id: html.h 293036 2010-01-03 09:23:27Z sebastian $ */ + +#ifndef HTML_TABLES_H +#define HTML_TABLES_H + +/* cs_terminator is overloaded in the following fashion: + * - It terminates the list entity maps. + * - In BG(inverse_ent_maps), it's the key of the inverse map that stores + * only the basic entities. + * - When passed to traverse_for_entities (or via php_unescape_entities with !all), + * we don't care about the encoding (UTF-8 is chosen, but it should be used + * when it doesn't matter).
+ */ +enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252, + cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, + cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r, + cs_cp1251, cs_8859_5, cs_cp866, cs_macroman, + cs_numelems /* used to count the number of charsets */ + }; +typedef const char *const entity_table_t; + +/* codepage 1252 is a Windows extension to iso-8859-1. */ +static entity_table_t ent_cp_1252[] = { + "euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger", + "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig", + NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo", + "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo", + "oelig", NULL, NULL, "Yuml" +}; + +static entity_table_t ent_iso_8859_1[] = { + "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", + "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg", + "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro", + "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14", + "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc", + "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave", + "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", + "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", + "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", + "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", + "atilde", "auml", "aring", "aelig", "ccedil", "egrave", + "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", + "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", + "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc", + "uuml", "yacute", "thorn", "yuml" +}; + +static entity_table_t ent_iso_8859_15[] = { + "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron", + "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg", + "macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */ + "micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm", + "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", 
"Aacute", + "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave", + "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", + "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", + "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", + "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", + "atilde", "auml", "aring", "aelig", "ccedil", "egrave", + "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", + "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", + "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc", + "uuml", "yacute", "thorn", "yuml" +}; + +static entity_table_t ent_uni_338_402[] = { + /* 338 (0x0152) */ + "OElig", "oelig", NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 352 (0x0160) */ + "Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 376 (0x0178) */ + "Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 400 (0x0190) */ + NULL, NULL, "fnof" +}; + +static entity_table_t ent_uni_spacing[] = { + /* 710 */ + "circ", + /* 711 - 730 */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 731 - 732 */ + NULL, "tilde" +}; + +static entity_table_t ent_uni_greek[] = { + /* 913 */ + "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta", + "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho", + NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega", + /* 938 - 944 are not mapped */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", + "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", + "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega", + /* 970 - 976 are not mapped 
*/ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "thetasym", "upsih", + NULL, NULL, NULL, + "piv" +}; + +static entity_table_t ent_uni_punct[] = { + /* 8194 */ + "ensp", "emsp", NULL, NULL, NULL, NULL, NULL, + "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm", + NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL, + /* 8216 */ + "lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo", NULL, + "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip", + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL, + /* 8242 */ + "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL, + NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL, + "frasl" +}; + +static entity_table_t ent_uni_euro[] = { + "euro" +}; + +static entity_table_t ent_uni_8465_8501[] = { + /* 8465 */ + "image", NULL, NULL, NULL, NULL, NULL, NULL, + /* 8472 */ + "weierp", NULL, NULL, NULL, + /* 8476 */ + "real", NULL, NULL, NULL, NULL, NULL, + /* 8482 */ + "trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8501 */ + "alefsym", +}; + +static entity_table_t ent_uni_8592_9002[] = { + /* 8592 (0x2190) */ + "larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8608 (0x21a0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8624 (0x21b0) */ + NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8640 (0x21c0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8656 (0x21d0) */ + "lArr", "uArr", "rArr", "dArr", "hArr", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8672 (0x21e0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, + /* 8704 (0x2200) */ + "forall", NULL, "part", "exist", NULL, "empty", NULL, "nabla", + "isin", "notin", NULL, "ni", NULL, NULL, NULL, "prod", + /* 8720 (0x2210) */ + NULL, "sum", "minus", NULL, NULL, NULL, NULL, "lowast", + NULL, NULL, "radic", NULL, NULL, "prop", "infin", NULL, + /* 8736 (0x2220) */ + "ang", NULL, NULL, NULL, NULL, NULL, NULL, "and", + "or", "cap", "cup", "int", NULL, NULL, NULL, NULL, + /* 8752 (0x2230) */ + NULL, NULL, NULL, NULL, "there4", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, "sim", NULL, NULL, NULL, + /* 8768 (0x2240) */ + NULL, NULL, NULL, NULL, NULL, "cong", NULL, NULL, + "asymp", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8784 (0x2250) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8800 (0x2260) */ + "ne", "equiv", NULL, NULL, "le", "ge", NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8816 (0x2270) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8832 (0x2280) */ + NULL, NULL, "sub", "sup", "nsub", NULL, "sube", "supe", + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8848 (0x2290) */ + NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes", + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8864 (0x22a0) */ + NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8880 (0x22b0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8896 (0x22c0) */ + NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8912 (0x22d0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8928 (0x22e0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8944 (0x22f0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + 
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8960 (0x2300) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL, + /* 8976 (0x2310) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8992 (0x2320) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, "lang", "rang" +}; + +static entity_table_t ent_uni_9674[] = { + /* 9674 */ + "loz" +}; + +static entity_table_t ent_uni_9824_9830[] = { + /* 9824 */ + "spades", NULL, NULL, "clubs", NULL, "hearts", "diams" +}; + +static entity_table_t ent_koi8r[] = { + "#1105", /* "jo "*/ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, "#1025", /* "JO" */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092", + "#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084", + "#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090", + "#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096", + "#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041", + "#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048", + "#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055", + "#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042", + "#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063", + "#1066" +}; + +static entity_table_t ent_cp_1251[] = { + "#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger", + "Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036", + "#1035", "#1039", "#1106", "#8216", "#8217", "#8219", "#8220", + "bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250", + "#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118", + "#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy", + "#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn", + "#1030", "#1110", "#1169", "micro", "para", 
"middot", "#1105", + "#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111", + "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046", + "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053", + "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060", + "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067", + "#1068", "#1069", "#1070", "#1071", "#1072", "#1073", "#1074", + "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081", + "#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088", + "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095", + "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102", + "#1103" +}; + +static entity_table_t ent_iso_8859_5[] = { + "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062", + "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069", + "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076", + "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083", + "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090", + "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097", + "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1104", + "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111", + "#1112", "#1113", "#1114", "#1115", "#1116", "#1117", "#1118", + "#1119" +}; + +static entity_table_t ent_cp_866[] = { + + "#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566", + "#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552", + "#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560", + "#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608", + "#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090", + "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097", + "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025", + "#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118", + "#176", "#8729", "#183", "#8730", "#8470", "#164", "#9632", + "#160" +}; + +/* MacRoman has a couple of 
low-ascii chars that need mapping too */ +/* Vertical tab (ASCII 11) is often used to store line breaks inside */ +/* DB exports, this mapping changes it to a space */ +static entity_table_t ent_macroman[] = { + "sp", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, "quot", NULL, + NULL, NULL, "amp", NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "lt", NULL, "gt", NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml", + "Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring", + "ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave", + "icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml", + "otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg", + "cent", "pound", "sect", "bull", "para", "szlig", "reg", + "copy", "trade", "acute", "uml", "ne", "AElig", "Oslash", + "infin", "plusmn", "le", "ge", "yen", "micro", "part", + "sum", "prod", "pi", "int", "ordf", "ordm", "Omega", + "aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof", + "asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave", + "Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo", + "rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml", + "frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger", + "middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute", + "Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute", + "Ocirc", "#xF8FF", "Ograve", 
"Uacute", "Ucirc", "Ugrave", "#305", + "circ", "tilde", "macr", "#728", "#729", "#730", "cedil", + "#733", "#731", "#711" +}; + +struct html_entity_map { + enum entity_charset charset; /* charset identifier */ + unsigned int basechar; /* char code at start of table */ + unsigned int endchar; /* last char code in the table */ + entity_table_t *table; /* the table of mappings */ +}; + +static const struct html_entity_map entity_map[] = { + { cs_cp1252, 0x80, 0x9f, ent_cp_1252 }, + { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 }, + { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_utf_8, 338, 402, ent_uni_338_402 }, + { cs_utf_8, 710, 732, ent_uni_spacing }, + { cs_utf_8, 913, 982, ent_uni_greek }, + { cs_utf_8, 8194, 8260, ent_uni_punct }, + { cs_utf_8, 8364, 8364, ent_uni_euro }, + { cs_utf_8, 8465, 8501, ent_uni_8465_8501 }, + { cs_utf_8, 8592, 9002, ent_uni_8592_9002 }, + { cs_utf_8, 9674, 9674, ent_uni_9674 }, + { cs_utf_8, 9824, 9830, ent_uni_9824_9830 }, + { cs_big5, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_koi8r, 0xa3, 0xff, ent_koi8r }, + { cs_cp1251, 0x80, 0xff, ent_cp_1251 }, + { cs_8859_5, 0xc0, 0xff, ent_iso_8859_5 }, + { cs_cp866, 0xc0, 0xff, ent_cp_866 }, + { cs_macroman, 0x0b, 0xff, ent_macroman }, + { cs_terminator } +}; + +static const struct { + const char *codeset; + enum entity_charset charset; +} charset_map[] = { + { "ISO-8859-1", cs_8859_1 }, + { "ISO8859-1", cs_8859_1 }, + { "ISO-8859-15", cs_8859_15 }, + { "ISO8859-15", cs_8859_15 }, + { "utf-8", cs_utf_8 }, + { "cp1252", cs_cp1252 }, + { "Windows-1252", cs_cp1252 }, + { "1252", cs_cp1252 }, + { "BIG5", cs_big5 }, + { "950", cs_big5 }, + { "GB2312", cs_gb2312 }, + { "936", cs_gb2312 }, + { "BIG5-HKSCS", cs_big5hkscs }, + { "Shift_JIS", 
cs_sjis }, + { "SJIS", cs_sjis }, + { "932", cs_sjis }, + { "EUCJP", cs_eucjp }, + { "EUC-JP", cs_eucjp }, + { "KOI8-R", cs_koi8r }, + { "koi8-ru", cs_koi8r }, + { "koi8r", cs_koi8r }, + { "cp1251", cs_cp1251 }, + { "Windows-1251", cs_cp1251 }, + { "win-1251", cs_cp1251 }, + { "iso8859-5", cs_8859_5 }, + { "iso-8859-5", cs_8859_5 }, + { "cp866", cs_cp866 }, + { "866", cs_cp866 }, + { "ibm866", cs_cp866 }, + { "MacRoman", cs_macroman }, + { NULL } +}; + +typedef struct { + unsigned short charcode; + char *entity; + int entitylen; + int flags; +} basic_entity_t; + +static const basic_entity_t basic_entities_ex[] = { + { '&', "&amp;", 5, 0 }, + { '"', "&quot;", 6, ENT_HTML_QUOTE_DOUBLE }, + /* PHP traditionally encodes ' as &#039;, not &apos;, so leave this entry here */ + { '\'', "&#039;", 6, ENT_HTML_QUOTE_SINGLE }, + { '\'', "&#x27;", 6, ENT_HTML_QUOTE_SINGLE }, + { '<', "&lt;", 4, 0 }, + { '>', "&gt;", 4, 0 }, + { 0, NULL, 0, 0 } +}; + +/* In some cases, we need to give special treatment to &, so we + * use this instead */ +static const basic_entity_t *basic_entities = &basic_entities_ex[1]; + +typedef struct { + unsigned short un_code_point; /* we don't need bigger */ + unsigned char cs_code; /* currently, we only have maps to single-byte encodings */ +} unicode_mapping; + +static const unicode_mapping unimap_iso885915[] = { + { 0xA5, 0xA5 }, /* yen sign */ + { 0xA7, 0xA7 }, /* section sign */ + { 0xA9, 0xA9 }, /* copyright sign */ + { 0xAA, 0xAA }, /* feminine ordinal indicator */ + { 0xAB, 0xAB }, /* left-pointing double angle quotation mark */ + { 0xAC, 0xAC }, /* not sign */ + { 0xAD, 0xAD }, /* soft hyphen */ + { 0xAE, 0xAE }, /* registered sign */ + { 0xAF, 0xAF }, /* macron */ + { 0xB0, 0xB0 }, /* degree sign */ + { 0xB1, 0xB1 }, /* plus-minus sign */ + { 0xB2, 0xB2 }, /* superscript two */ + { 0xB3, 0xB3 }, /* superscript three */ + { 0xB5, 0xB5 }, /* micro sign */ + { 0xB6, 0xB6 }, /* pilcrow sign */ + { 0xB7, 0xB7 }, /* middle dot */ + { 0xB9, 0xB9 }, /* superscript one */ + { 0xBA, 0xBA
}, /* masculine ordinal indicator */ + { 0xBB, 0xBB }, /* right-pointing double angle quotation mark */ + { 0x152, 0xBC }, /* latin capital ligature oe */ + { 0x153, 0xBD }, /* latin small ligature oe */ + { 0x160, 0xA6 }, /* latin capital letter s with caron */ + { 0x161, 0xA8 }, /* latin small letter s with caron */ + { 0x178, 0xBE }, /* latin capital letter y with diaeresis */ + { 0x17D, 0xB4 }, /* latin capital letter z with caron */ + { 0x17E, 0xB8 }, /* latin small letter z with caron */ + { 0x20AC, 0xA4 }, /* euro sign */ +}; + +static const unicode_mapping unimap_win1252[] = { + { 0x152, 0x8C }, /* latin capital ligature oe */ + { 0x153, 0x9C }, /* latin small ligature oe */ + { 0x160, 0x8A }, /* latin capital letter s with caron */ + { 0x161, 0x9A }, /* latin small letter s with caron */ + { 0x178, 0x9F }, /* latin capital letter y with diaeresis */ + { 0x17D, 0x8E }, /* latin capital letter z with caron */ + { 0x17E, 0x9E }, /* latin small letter z with caron */ + { 0x192, 0x83 }, /* latin small letter f with hook */ + { 0x2C6, 0x88 }, /* modifier letter circumflex accent */ + { 0x2DC, 0x98 }, /* small tilde */ + { 0x2013, 0x96 }, /* en dash */ + { 0x2014, 0x97 }, /* em dash */ + { 0x2018, 0x91 }, /* left single quotation mark */ + { 0x2019, 0x92 }, /* right single quotation mark */ + { 0x201A, 0x82 }, /* single low-9 quotation mark */ + { 0x201C, 0x93 }, /* left double quotation mark */ + { 0x201D, 0x94 }, /* right double quotation mark */ + { 0x201E, 0x84 }, /* double low-9 quotation mark */ + { 0x2020, 0x86 }, /* dagger */ + { 0x2021, 0x87 }, /* double dagger */ + { 0x2022, 0x95 }, /* bullet */ + { 0x2026, 0x85 }, /* horizontal ellipsis */ + { 0x2030, 0x89 }, /* per mille sign */ + { 0x2039, 0x8B }, /* single left-pointing angle quotation mark */ + { 0x203A, 0x9B }, /* single right-pointing angle quotation mark */ + { 0x20AC, 0x80 }, /* euro sign */ + { 0x2122, 0x99 }, /* trade mark sign */ +}; + +static const unicode_mapping unimap_win1251[] = { + { 
0xA0, 0xA0 }, /* no-break space */ + { 0xA4, 0xA4 }, /* currency sign */ + { 0xA6, 0xA6 }, /* broken bar */ + { 0xA7, 0xA7 }, /* section sign */ + { 0xA9, 0xA9 }, /* copyright sign */ + { 0xAB, 0xAB }, /* left-pointing double angle quotation mark */ + { 0xAC, 0xAC }, /* not sign */ + { 0xAD, 0xAD }, /* soft hyphen */ + { 0xAE, 0xAE }, /* registered sign */ + { 0xB0, 0xB0 }, /* degree sign */ + { 0xB1, 0xB1 }, /* plus-minus sign */ + { 0xB5, 0xB5 }, /* micro sign */ + { 0xB6, 0xB6 }, /* pilcrow sign */ + { 0xB7, 0xB7 }, /* middle dot */ + { 0xBB, 0xBB }, /* right-pointing double angle quotation mark */ + { 0x401, 0xA8 }, /* cyrillic capital letter io */ + { 0x402, 0x80 }, /* cyrillic capital letter dje */ + { 0x403, 0x81 }, /* cyrillic capital letter gje */ + { 0x404, 0xAA }, /* cyrillic capital letter ukrainian ie */ + { 0x405, 0xBD }, /* cyrillic capital letter dze */ + { 0x406, 0xB2 }, /* cyrillic capital letter byelorussian-ukrainian i */ + { 0x407, 0xAF }, /* cyrillic capital letter yi */ + { 0x408, 0xA3 }, /* cyrillic capital letter je */ + { 0x409, 0x8A }, /* cyrillic capital letter lje */ + { 0x40A, 0x8C }, /* cyrillic capital letter nje */ + { 0x40B, 0x8E }, /* cyrillic capital letter tshe */ + { 0x40C, 0x8D }, /* cyrillic capital letter kje */ + { 0x40E, 0xA1 }, /* cyrillic capital letter short u */ + { 0x40F, 0x8F }, /* cyrillic capital letter dzhe */ + { 0x410, 0xC0 }, /* cyrillic capital letter a */ + { 0x411, 0xC1 }, /* cyrillic capital letter be */ + { 0x412, 0xC2 }, /* cyrillic capital letter ve */ + { 0x413, 0xC3 }, /* cyrillic capital letter ghe */ + { 0x414, 0xC4 }, /* cyrillic capital letter de */ + { 0x415, 0xC5 }, /* cyrillic capital letter ie */ + { 0x416, 0xC6 }, /* cyrillic capital letter zhe */ + { 0x417, 0xC7 }, /* cyrillic capital letter ze */ + { 0x418, 0xC8 }, /* cyrillic capital letter i */ + { 0x419, 0xC9 }, /* cyrillic capital letter short i */ + { 0x41A, 0xCA }, /* cyrillic capital letter ka */ + { 0x41B, 0xCB }, /* cyrillic capital 
letter el */ + { 0x41C, 0xCC }, /* cyrillic capital letter em */ + { 0x41D, 0xCD }, /* cyrillic capital letter en */ + { 0x41E, 0xCE }, /* cyrillic capital letter o */ + { 0x41F, 0xCF }, /* cyrillic capital letter pe */ + { 0x420, 0xD0 }, /* cyrillic capital letter er */ + { 0x421, 0xD1 }, /* cyrillic capital letter es */ + { 0x422, 0xD2 }, /* cyrillic capital letter te */ + { 0x423, 0xD3 }, /* cyrillic capital letter u */ + { 0x424, 0xD4 }, /* cyrillic capital letter ef */ + { 0x425, 0xD5 }, /* cyrillic capital letter ha */ + { 0x426, 0xD6 }, /* cyrillic capital letter tse */ + { 0x427, 0xD7 }, /* cyrillic capital letter che */ + { 0x428, 0xD8 }, /* cyrillic capital letter sha */ + { 0x429, 0xD9 }, /* cyrillic capital letter shcha */ + { 0x42A, 0xDA }, /* cyrillic capital letter hard sign */ + { 0x42B, 0xDB }, /* cyrillic capital letter yeru */ + { 0x42C, 0xDC }, /* cyrillic capital letter soft sign */ + { 0x42D, 0xDD }, /* cyrillic capital letter e */ + { 0x42E, 0xDE }, /* cyrillic capital letter yu */ + { 0x42F, 0xDF }, /* cyrillic capital letter ya */ + { 0x430, 0xE0 }, /* cyrillic small letter a */ + { 0x431, 0xE1 }, /* cyrillic small letter be */ + { 0x432, 0xE2 }, /* cyrillic small letter ve */ + { 0x433, 0xE3 }, /* cyrillic small letter ghe */ + { 0x434, 0xE4 }, /* cyrillic small letter de */ + { 0x435, 0xE5 }, /* cyrillic small letter ie */ + { 0x436, 0xE6 }, /* cyrillic small letter zhe */ + { 0x437, 0xE7 }, /* cyrillic small letter ze */ + { 0x438, 0xE8 }, /* cyrillic small letter i */ + { 0x439, 0xE9 }, /* cyrillic small letter short i */ + { 0x43A, 0xEA }, /* cyrillic small letter ka */ + { 0x43B, 0xEB }, /* cyrillic small letter el */ + { 0x43C, 0xEC }, /* cyrillic small letter em */ + { 0x43D, 0xED }, /* cyrillic small letter en */ + { 0x43E, 0xEE }, /* cyrillic small letter o */ + { 0x43F, 0xEF }, /* cyrillic small letter pe */ + { 0x440, 0xF0 }, /* cyrillic small letter er */ + { 0x441, 0xF1 }, /* cyrillic small letter es */ + { 0x442, 0xF2 }, /* 
cyrillic small letter te */ + { 0x443, 0xF3 }, /* cyrillic small letter u */ + { 0x444, 0xF4 }, /* cyrillic small letter ef */ + { 0x445, 0xF5 }, /* cyrillic small letter ha */ + { 0x446, 0xF6 }, /* cyrillic small letter tse */ + { 0x447, 0xF7 }, /* cyrillic small letter che */ + { 0x448, 0xF8 }, /* cyrillic small letter sha */ + { 0x449, 0xF9 }, /* cyrillic small letter shcha */ + { 0x44A, 0xFA }, /* cyrillic small letter hard sign */ + { 0x44B, 0xFB }, /* cyrillic small letter yeru */ + { 0x44C, 0xFC }, /* cyrillic small letter soft sign */ + { 0x44D, 0xFD }, /* cyrillic small letter e */ + { 0x44E, 0xFE }, /* cyrillic small letter yu */ + { 0x44F, 0xFF }, /* cyrillic small letter ya */ + { 0x451, 0xB8 }, /* cyrillic small letter io */ + { 0x452, 0x90 }, /* cyrillic small letter dje */ + { 0x453, 0x83 }, /* cyrillic small letter gje */ + { 0x454, 0xBA }, /* cyrillic small letter ukrainian ie */ + { 0x455, 0xBE }, /* cyrillic small letter dze */ + { 0x456, 0xB3 }, /* cyrillic small letter byelorussian-ukrainian i */ + { 0x457, 0xBF }, /* cyrillic small letter yi */ + { 0x458, 0xBC }, /* cyrillic small letter je */ + { 0x459, 0x9A }, /* cyrillic small letter lje */ + { 0x45A, 0x9C }, /* cyrillic small letter nje */ + { 0x45B, 0x9E }, /* cyrillic small letter tshe */ + { 0x45C, 0x9D }, /* cyrillic small letter kje */ + { 0x45E, 0xA2 }, /* cyrillic small letter short u */ + { 0x45F, 0x9F }, /* cyrillic small letter dzhe */ + { 0x490, 0xA5 }, /* cyrillic capital letter ghe with upturn */ + { 0x491, 0xB4 }, /* cyrillic small letter ghe with upturn */ + { 0x2013, 0x96 }, /* en dash */ + { 0x2014, 0x97 }, /* em dash */ + { 0x2018, 0x91 }, /* left single quotation mark */ + { 0x2019, 0x92 }, /* right single quotation mark */ + { 0x201A, 0x82 }, /* single low-9 quotation mark */ + { 0x201C, 0x93 }, /* left double quotation mark */ + { 0x201D, 0x94 }, /* right double quotation mark */ + { 0x201E, 0x84 }, /* double low-9 quotation mark */ + { 0x2020, 0x86 }, /* dagger */ + { 
0x2021, 0x87 }, /* double dagger */ + { 0x2022, 0x95 }, /* bullet */ + { 0x2026, 0x85 }, /* horizontal ellipsis */ + { 0x2030, 0x89 }, /* per mille sign */ + { 0x2039, 0x8B }, /* single left-pointing angle quotation mark */ + { 0x203A, 0x9B }, /* single right-pointing angle quotation mark */ + { 0x20AC, 0x88 }, /* euro sign */ + { 0x2116, 0xB9 }, /* numero sign */ + { 0x2122, 0x99 }, /* trade mark sign */ +}; + +static const unicode_mapping unimap_koi8r[] = { + { 0xA0, 0x9A }, /* no-break space */ + { 0xA9, 0xBF }, /* copyright sign */ + { 0xB0, 0x9C }, /* degree sign */ + { 0xB2, 0x9D }, /* superscript two */ + { 0xB7, 0x9E }, /* middle dot */ + { 0xF7, 0x9F }, /* division sign */ + { 0x401, 0xB3 }, /* cyrillic capital letter io */ + { 0x410, 0xE1 }, /* cyrillic capital letter a */ + { 0x411, 0xE2 }, /* cyrillic capital letter be */ + { 0x412, 0xF7 }, /* cyrillic capital letter ve */ + { 0x413, 0xE7 }, /* cyrillic capital letter ghe */ + { 0x414, 0xE4 }, /* cyrillic capital letter de */ + { 0x415, 0xE5 }, /* cyrillic capital letter ie */ + { 0x416, 0xF6 }, /* cyrillic capital letter zhe */ + { 0x417, 0xFA }, /* cyrillic capital letter ze */ + { 0x418, 0xE9 }, /* cyrillic capital letter i */ + { 0x419, 0xEA }, /* cyrillic capital letter short i */ + { 0x41A, 0xEB }, /* cyrillic capital letter ka */ + { 0x41B, 0xEC }, /* cyrillic capital letter el */ + { 0x41C, 0xED }, /* cyrillic capital letter em */ + { 0x41D, 0xEE }, /* cyrillic capital letter en */ + { 0x41E, 0xEF }, /* cyrillic capital letter o */ + { 0x41F, 0xF0 }, /* cyrillic capital letter pe */ + { 0x420, 0xF2 }, /* cyrillic capital letter er */ + { 0x421, 0xF3 }, /* cyrillic capital letter es */ + { 0x422, 0xF4 }, /* cyrillic capital letter te */ + { 0x423, 0xF5 }, /* cyrillic capital letter u */ + { 0x424, 0xE6 }, /* cyrillic capital letter ef */ + { 0x425, 0xE8 }, /* cyrillic capital letter ha */ + { 0x426, 0xE3 }, /* cyrillic capital letter tse */ + { 0x427, 0xFE }, /* cyrillic capital letter che */ + { 
0x428, 0xFB }, /* cyrillic capital letter sha */ + { 0x429, 0xFD }, /* cyrillic capital letter shcha */ + { 0x42A, 0xFF }, /* cyrillic capital letter hard sign */ + { 0x42B, 0xF9 }, /* cyrillic capital letter yeru */ + { 0x42C, 0xF8 }, /* cyrillic capital letter soft sign */ + { 0x42D, 0xFC }, /* cyrillic capital letter e */ + { 0x42E, 0xE0 }, /* cyrillic capital letter yu */ + { 0x42F, 0xF1 }, /* cyrillic capital letter ya */ + { 0x430, 0xC1 }, /* cyrillic small letter a */ + { 0x431, 0xC2 }, /* cyrillic small letter be */ + { 0x432, 0xD7 }, /* cyrillic small letter ve */ + { 0x433, 0xC7 }, /* cyrillic small letter ghe */ + { 0x434, 0xC4 }, /* cyrillic small letter de */ + { 0x435, 0xC5 }, /* cyrillic small letter ie */ + { 0x436, 0xD6 }, /* cyrillic small letter zhe */ + { 0x437, 0xDA }, /* cyrillic small letter ze */ + { 0x438, 0xC9 }, /* cyrillic small letter i */ + { 0x439, 0xCA }, /* cyrillic small letter short i */ + { 0x43A, 0xCB }, /* cyrillic small letter ka */ + { 0x43B, 0xCC }, /* cyrillic small letter el */ + { 0x43C, 0xCD }, /* cyrillic small letter em */ + { 0x43D, 0xCE }, /* cyrillic small letter en */ + { 0x43E, 0xCF }, /* cyrillic small letter o */ + { 0x43F, 0xD0 }, /* cyrillic small letter pe */ + { 0x440, 0xD2 }, /* cyrillic small letter er */ + { 0x441, 0xD3 }, /* cyrillic small letter es */ + { 0x442, 0xD4 }, /* cyrillic small letter te */ + { 0x443, 0xD5 }, /* cyrillic small letter u */ + { 0x444, 0xC6 }, /* cyrillic small letter ef */ + { 0x445, 0xC8 }, /* cyrillic small letter ha */ + { 0x446, 0xC3 }, /* cyrillic small letter tse */ + { 0x447, 0xDE }, /* cyrillic small letter che */ + { 0x448, 0xDB }, /* cyrillic small letter sha */ + { 0x449, 0xDD }, /* cyrillic small letter shcha */ + { 0x44A, 0xDF }, /* cyrillic small letter hard sign */ + { 0x44B, 0xD9 }, /* cyrillic small letter yeru */ + { 0x44C, 0xD8 }, /* cyrillic small letter soft sign */ + { 0x44D, 0xDC }, /* cyrillic small letter e */ + { 0x44E, 0xC0 }, /* cyrillic small letter 
yu */ + { 0x44F, 0xD1 }, /* cyrillic small letter ya */ + { 0x451, 0xA3 }, /* cyrillic small letter io */ + { 0x2219, 0x95 }, /* bullet operator */ + { 0x221A, 0x96 }, /* square root */ + { 0x2248, 0x97 }, /* almost equal to */ + { 0x2264, 0x98 }, /* less-than or equal to */ + { 0x2265, 0x99 }, /* greater-than or equal to */ + { 0x2320, 0x93 }, /* top half integral */ + { 0x2321, 0x9B }, /* bottom half integral */ + { 0x2500, 0x80 }, /* box drawings light horizontal */ + { 0x2502, 0x81 }, /* box drawings light vertical */ + { 0x250C, 0x82 }, /* box drawings light down and right */ + { 0x2510, 0x83 }, /* box drawings light down and left */ + { 0x2514, 0x84 }, /* box drawings light up and right */ + { 0x2518, 0x85 }, /* box drawings light up and left */ + { 0x251C, 0x86 }, /* box drawings light vertical and right */ + { 0x2524, 0x87 }, /* box drawings light vertical and left */ + { 0x252C, 0x88 }, /* box drawings light down and horizontal */ + { 0x2534, 0x89 }, /* box drawings light up and horizontal */ + { 0x253C, 0x8A }, /* box drawings light vertical and horizontal */ + { 0x2550, 0xA0 }, /* box drawings double horizontal */ + { 0x2551, 0xA1 }, /* box drawings double vertical */ + { 0x2552, 0xA2 }, /* box drawings down single and right double */ + { 0x2553, 0xA4 }, /* box drawings down double and right single */ + { 0x2554, 0xA5 }, /* box drawings double down and right */ + { 0x2555, 0xA6 }, /* box drawings down single and left double */ + { 0x2556, 0xA7 }, /* box drawings down double and left single */ + { 0x2557, 0xA8 }, /* box drawings double down and left */ + { 0x2558, 0xA9 }, /* box drawings up single and right double */ + { 0x2559, 0xAA }, /* box drawings up double and right single */ + { 0x255A, 0xAB }, /* box drawings double up and right */ + { 0x255B, 0xAC }, /* box drawings up single and left double */ + { 0x255C, 0xAD }, /* box drawings up double and left single */ + { 0x255D, 0xAE }, /* box drawings double up and left */ + { 0x255E, 0xAF }, /* box 
drawings vertical single and right double */ + { 0x255F, 0xB0 }, /* box drawings vertical double and right single */ + { 0x2560, 0xB1 }, /* box drawings double vertical and right */ + { 0x2561, 0xB2 }, /* box drawings vertical single and left double */ + { 0x2562, 0xB4 }, /* box drawings vertical double and left single */ + { 0x2563, 0xB5 }, /* box drawings double vertical and left */ + { 0x2564, 0xB6 }, /* box drawings down single and horizontal double */ + { 0x2565, 0xB7 }, /* box drawings down double and horizontal single */ + { 0x2566, 0xB8 }, /* box drawings double down and horizontal */ + { 0x2567, 0xB9 }, /* box drawings up single and horizontal double */ + { 0x2568, 0xBA }, /* box drawings up double and horizontal single */ + { 0x2569, 0xBB }, /* box drawings double up and horizontal */ + { 0x256A, 0xBC }, /* box drawings vertical single and horizontal double */ + { 0x256B, 0xBD }, /* box drawings vertical double and horizontal single */ + { 0x256C, 0xBE }, /* box drawings double vertical and horizontal */ + { 0x2580, 0x8B }, /* upper half block */ + { 0x2584, 0x8C }, /* lower half block */ + { 0x2588, 0x8D }, /* full block */ + { 0x258C, 0x8E }, /* left half block */ + { 0x2590, 0x8F }, /* right half block */ + { 0x2591, 0x90 }, /* light shade */ + { 0x2592, 0x91 }, /* medium shade */ + { 0x2593, 0x92 }, /* dark shade */ + { 0x25A0, 0x94 }, /* black square */ +}; + +static const unicode_mapping unimap_cp866[] = { + { 0xA0, 0xFF }, /* no-break space */ + { 0xA4, 0xFD }, /* currency sign */ + { 0xB0, 0xF8 }, /* degree sign */ + { 0xB7, 0xFA }, /* middle dot */ + { 0x401, 0xF0 }, /* cyrillic capital letter io */ + { 0x404, 0xF2 }, /* cyrillic capital letter ukrainian ie */ + { 0x407, 0xF4 }, /* cyrillic capital letter yi */ + { 0x40E, 0xF6 }, /* cyrillic capital letter short u */ + { 0x410, 0x80 }, /* cyrillic capital letter a */ + { 0x411, 0x81 }, /* cyrillic capital letter be */ + { 0x412, 0x82 }, /* cyrillic capital letter ve */ + { 0x413, 0x83 }, /* 
cyrillic capital letter ghe */ + { 0x414, 0x84 }, /* cyrillic capital letter de */ + { 0x415, 0x85 }, /* cyrillic capital letter ie */ + { 0x416, 0x86 }, /* cyrillic capital letter zhe */ + { 0x417, 0x87 }, /* cyrillic capital letter ze */ + { 0x418, 0x88 }, /* cyrillic capital letter i */ + { 0x419, 0x89 }, /* cyrillic capital letter short i */ + { 0x41A, 0x8A }, /* cyrillic capital letter ka */ + { 0x41B, 0x8B }, /* cyrillic capital letter el */ + { 0x41C, 0x8C }, /* cyrillic capital letter em */ + { 0x41D, 0x8D }, /* cyrillic capital letter en */ + { 0x41E, 0x8E }, /* cyrillic capital letter o */ + { 0x41F, 0x8F }, /* cyrillic capital letter pe */ + { 0x420, 0x90 }, /* cyrillic capital letter er */ + { 0x421, 0x91 }, /* cyrillic capital letter es */ + { 0x422, 0x92 }, /* cyrillic capital letter te */ + { 0x423, 0x93 }, /* cyrillic capital letter u */ + { 0x424, 0x94 }, /* cyrillic capital letter ef */ + { 0x425, 0x95 }, /* cyrillic capital letter ha */ + { 0x426, 0x96 }, /* cyrillic capital letter tse */ + { 0x427, 0x97 }, /* cyrillic capital letter che */ + { 0x428, 0x98 }, /* cyrillic capital letter sha */ + { 0x429, 0x99 }, /* cyrillic capital letter shcha */ + { 0x42A, 0x9A }, /* cyrillic capital letter hard sign */ + { 0x42B, 0x9B }, /* cyrillic capital letter yeru */ + { 0x42C, 0x9C }, /* cyrillic capital letter soft sign */ + { 0x42D, 0x9D }, /* cyrillic capital letter e */ + { 0x42E, 0x9E }, /* cyrillic capital letter yu */ + { 0x42F, 0x9F }, /* cyrillic capital letter ya */ + { 0x430, 0xA0 }, /* cyrillic small letter a */ + { 0x431, 0xA1 }, /* cyrillic small letter be */ + { 0x432, 0xA2 }, /* cyrillic small letter ve */ + { 0x433, 0xA3 }, /* cyrillic small letter ghe */ + { 0x434, 0xA4 }, /* cyrillic small letter de */ + { 0x435, 0xA5 }, /* cyrillic small letter ie */ + { 0x436, 0xA6 }, /* cyrillic small letter zhe */ + { 0x437, 0xA7 }, /* cyrillic small letter ze */ + { 0x438, 0xA8 }, /* cyrillic small letter i */ + { 0x439, 0xA9 }, /* cyrillic small letter short i */ + { 0x43A, 0xAA }, /* cyrillic 
small letter ka */ + { 0x43B, 0xAB }, /* cyrillic small letter el */ + { 0x43C, 0xAC }, /* cyrillic small letter em */ + { 0x43D, 0xAD }, /* cyrillic small letter en */ + { 0x43E, 0xAE }, /* cyrillic small letter o */ + { 0x43F, 0xAF }, /* cyrillic small letter pe */ + { 0x440, 0xE0 }, /* cyrillic small letter er */ + { 0x441, 0xE1 }, /* cyrillic small letter es */ + { 0x442, 0xE2 }, /* cyrillic small letter te */ + { 0x443, 0xE3 }, /* cyrillic small letter u */ + { 0x444, 0xE4 }, /* cyrillic small letter ef */ + { 0x445, 0xE5 }, /* cyrillic small letter ha */ + { 0x446, 0xE6 }, /* cyrillic small letter tse */ + { 0x447, 0xE7 }, /* cyrillic small letter che */ + { 0x448, 0xE8 }, /* cyrillic small letter sha */ + { 0x449, 0xE9 }, /* cyrillic small letter shcha */ + { 0x44A, 0xEA }, /* cyrillic small letter hard sign */ + { 0x44B, 0xEB }, /* cyrillic small letter yeru */ + { 0x44C, 0xEC }, /* cyrillic small letter soft sign */ + { 0x44D, 0xED }, /* cyrillic small letter e */ + { 0x44E, 0xEE }, /* cyrillic small letter yu */ + { 0x44F, 0xEF }, /* cyrillic small letter ya */ + { 0x451, 0xF1 }, /* cyrillic small letter io */ + { 0x454, 0xF3 }, /* cyrillic small letter ukrainian ie */ + { 0x457, 0xF5 }, /* cyrillic small letter yi */ + { 0x45E, 0xF7 }, /* cyrillic small letter short u */ + { 0x2116, 0xFC }, /* numero sign */ + { 0x2219, 0xF9 }, /* bullet operator */ + { 0x221A, 0xFB }, /* square root */ + { 0x2500, 0xC4 }, /* box drawings light horizontal */ + { 0x2502, 0xB3 }, /* box drawings light vertical */ + { 0x250C, 0xDA }, /* box drawings light down and right */ + { 0x2510, 0xBF }, /* box drawings light down and left */ + { 0x2514, 0xC0 }, /* box drawings light up and right */ + { 0x2518, 0xD9 }, /* box drawings light up and left */ + { 0x251C, 0xC3 }, /* box drawings light vertical and right */ + { 0x2524, 0xB4 }, /* box drawings light vertical and left */ + { 0x252C, 0xC2 }, /* box drawings light down and horizontal */ + { 0x2534, 0xC1 }, /* box drawings light 
up and horizontal */ + { 0x253C, 0xC5 }, /* box drawings light vertical and horizontal */ + { 0x2550, 0xCD }, /* box drawings double horizontal */ + { 0x2551, 0xBA }, /* box drawings double vertical */ + { 0x2552, 0xD5 }, /* box drawings down single and right double */ + { 0x2553, 0xD6 }, /* box drawings down double and right single */ + { 0x2554, 0xC9 }, /* box drawings double down and right */ + { 0x2555, 0xB8 }, /* box drawings down single and left double */ + { 0x2556, 0xB7 }, /* box drawings down double and left single */ + { 0x2557, 0xBB }, /* box drawings double down and left */ + { 0x2558, 0xD4 }, /* box drawings up single and right double */ + { 0x2559, 0xD3 }, /* box drawings up double and right single */ + { 0x255A, 0xC8 }, /* box drawings double up and right */ + { 0x255B, 0xBE }, /* box drawings up single and left double */ + { 0x255C, 0xBD }, /* box drawings up double and left single */ + { 0x255D, 0xBC }, /* box drawings double up and left */ + { 0x255E, 0xC6 }, /* box drawings vertical single and right double */ + { 0x255F, 0xC7 }, /* box drawings vertical double and right single */ + { 0x2560, 0xCC }, /* box drawings double vertical and right */ + { 0x2561, 0xB5 }, /* box drawings vertical single and left double */ + { 0x2562, 0xB6 }, /* box drawings vertical double and left single */ + { 0x2563, 0xB9 }, /* box drawings double vertical and left */ + { 0x2564, 0xD1 }, /* box drawings down single and horizontal double */ + { 0x2565, 0xD2 }, /* box drawings down double and horizontal single */ + { 0x2566, 0xCB }, /* box drawings double down and horizontal */ + { 0x2567, 0xCF }, /* box drawings up single and horizontal double */ + { 0x2568, 0xD0 }, /* box drawings up double and horizontal single */ + { 0x2569, 0xCA }, /* box drawings double up and horizontal */ + { 0x256A, 0xD8 }, /* box drawings vertical single and horizontal double */ + { 0x256B, 0xD7 }, /* box drawings vertical double and horizontal single */ + { 0x256C, 0xCE }, /* box drawings 
double vertical and horizontal */ + { 0x2580, 0xDF }, /* upper half block */ + { 0x2584, 0xDC }, /* lower half block */ + { 0x2588, 0xDB }, /* full block */ + { 0x258C, 0xDD }, /* left half block */ + { 0x2590, 0xDE }, /* right half block */ + { 0x2591, 0xB0 }, /* light shade */ + { 0x2592, 0xB1 }, /* medium shade */ + { 0x2593, 0xB2 }, /* dark shade */ + { 0x25A0, 0xFE }, /* black square */ +}; + +static const unicode_mapping unimap_macroman[] = { + { 0xA0, 0xCA }, /* no-break space */ + { 0xA1, 0xC1 }, /* inverted exclamation mark */ + { 0xA2, 0xA2 }, /* cent sign */ + { 0xA3, 0xA3 }, /* pound sign */ + { 0xA5, 0xB4 }, /* yen sign */ + { 0xA7, 0xA4 }, /* section sign */ + { 0xA8, 0xAC }, /* diaeresis */ + { 0xA9, 0xA9 }, /* copyright sign */ + { 0xAA, 0xBB }, /* feminine ordinal indicator */ + { 0xAB, 0xC7 }, /* left-pointing double angle quotation mark */ + { 0xAC, 0xC2 }, /* not sign */ + { 0xAE, 0xA8 }, /* registered sign */ + { 0xAF, 0xF8 }, /* macron */ + { 0xB0, 0xA1 }, /* degree sign */ + { 0xB1, 0xB1 }, /* plus-minus sign */ + { 0xB4, 0xAB }, /* acute accent */ + { 0xB5, 0xB5 }, /* micro sign */ + { 0xB6, 0xA6 }, /* pilcrow sign */ + { 0xB7, 0xE1 }, /* middle dot */ + { 0xB8, 0xFC }, /* cedilla */ + { 0xBA, 0xBC }, /* masculine ordinal indicator */ + { 0xBB, 0xC8 }, /* right-pointing double angle quotation mark */ + { 0xBF, 0xC0 }, /* inverted question mark */ + { 0xC0, 0xCB }, /* latin capital letter a with grave */ + { 0xC1, 0xE7 }, /* latin capital letter a with acute */ + { 0xC2, 0xE5 }, /* latin capital letter a with circumflex */ + { 0xC3, 0xCC }, /* latin capital letter a with tilde */ + { 0xC4, 0x80 }, /* latin capital letter a with diaeresis */ + { 0xC5, 0x81 }, /* latin capital letter a with ring above */ + { 0xC6, 0xAE }, /* latin capital letter ae */ + { 0xC7, 0x82 }, /* latin capital letter c with cedilla */ + { 0xC8, 0xE9 }, /* latin capital letter e with grave */ + { 0xC9, 0x83 }, /* latin capital letter e with acute */ + { 0xCA, 0xE6 }, /* 
latin capital letter e with circumflex */ + { 0xCB, 0xE8 }, /* latin capital letter e with diaeresis */ + { 0xCC, 0xED }, /* latin capital letter i with grave */ + { 0xCD, 0xEA }, /* latin capital letter i with acute */ + { 0xCE, 0xEB }, /* latin capital letter i with circumflex */ + { 0xCF, 0xEC }, /* latin capital letter i with diaeresis */ + { 0xD1, 0x84 }, /* latin capital letter n with tilde */ + { 0xD2, 0xF1 }, /* latin capital letter o with grave */ + { 0xD3, 0xEE }, /* latin capital letter o with acute */ + { 0xD4, 0xEF }, /* latin capital letter o with circumflex */ + { 0xD5, 0xCD }, /* latin capital letter o with tilde */ + { 0xD6, 0x85 }, /* latin capital letter o with diaeresis */ + { 0xD8, 0xAF }, /* latin capital letter o with stroke */ + { 0xD9, 0xF4 }, /* latin capital letter u with grave */ + { 0xDA, 0xF2 }, /* latin capital letter u with acute */ + { 0xDB, 0xF3 }, /* latin capital letter u with circumflex */ + { 0xDC, 0x86 }, /* latin capital letter u with diaeresis */ + { 0xDF, 0xA7 }, /* latin small letter sharp s */ + { 0xE0, 0x88 }, /* latin small letter a with grave */ + { 0xE1, 0x87 }, /* latin small letter a with acute */ + { 0xE2, 0x89 }, /* latin small letter a with circumflex */ + { 0xE3, 0x8B }, /* latin small letter a with tilde */ + { 0xE4, 0x8A }, /* latin small letter a with diaeresis */ + { 0xE5, 0x8C }, /* latin small letter a with ring above */ + { 0xE6, 0xBE }, /* latin small letter ae */ + { 0xE7, 0x8D }, /* latin small letter c with cedilla */ + { 0xE8, 0x8F }, /* latin small letter e with grave */ + { 0xE9, 0x8E }, /* latin small letter e with acute */ + { 0xEA, 0x90 }, /* latin small letter e with circumflex */ + { 0xEB, 0x91 }, /* latin small letter e with diaeresis */ + { 0xEC, 0x93 }, /* latin small letter i with grave */ + { 0xED, 0x92 }, /* latin small letter i with acute */ + { 0xEE, 0x94 }, /* latin small letter i with circumflex */ + { 0xEF, 0x95 }, /* latin small letter i with diaeresis */ + { 0xF1, 0x96 }, /* latin 
small letter n with tilde */ + { 0xF2, 0x98 }, /* latin small letter o with grave */ + { 0xF3, 0x97 }, /* latin small letter o with acute */ + { 0xF4, 0x99 }, /* latin small letter o with circumflex */ + { 0xF5, 0x9B }, /* latin small letter o with tilde */ + { 0xF6, 0x9A }, /* latin small letter o with diaeresis */ + { 0xF7, 0xD6 }, /* division sign */ + { 0xF8, 0xBF }, /* latin small letter o with stroke */ + { 0xF9, 0x9D }, /* latin small letter u with grave */ + { 0xFA, 0x9C }, /* latin small letter u with acute */ + { 0xFB, 0x9E }, /* latin small letter u with circumflex */ + { 0xFC, 0x9F }, /* latin small letter u with diaeresis */ + { 0xFF, 0xD8 }, /* latin small letter y with diaeresis */ + { 0x131, 0xF5 }, /* latin small letter dotless i */ + { 0x152, 0xCE }, /* latin capital ligature oe */ + { 0x153, 0xCF }, /* latin small ligature oe */ + { 0x178, 0xD9 }, /* latin capital letter y with diaeresis */ + { 0x192, 0xC4 }, /* latin small letter f with hook */ + { 0x2C6, 0xF6 }, /* modifier letter circumflex accent */ + { 0x2C7, 0xFF }, /* caron */ + { 0x2D8, 0xF9 }, /* breve */ + { 0x2D9, 0xFA }, /* dot above */ + { 0x2DA, 0xFB }, /* ring above */ + { 0x2DB, 0xFE }, /* ogonek */ + { 0x2DC, 0xF7 }, /* small tilde */ + { 0x2DD, 0xFD }, /* double acute accent */ + { 0x3A9, 0xBD }, /* greek capital letter omega */ + { 0x3C0, 0xB9 }, /* greek small letter pi */ + { 0x2013, 0xD0 }, /* en dash */ + { 0x2014, 0xD1 }, /* em dash */ + { 0x2018, 0xD4 }, /* left single quotation mark */ + { 0x2019, 0xD5 }, /* right single quotation mark */ + { 0x201A, 0xE2 }, /* single low-9 quotation mark */ + { 0x201C, 0xD2 }, /* left double quotation mark */ + { 0x201D, 0xD3 }, /* right double quotation mark */ + { 0x201E, 0xE3 }, /* double low-9 quotation mark */ + { 0x2020, 0xA0 }, /* dagger */ + { 0x2021, 0xE0 }, /* double dagger */ + { 0x2022, 0xA5 }, /* bullet */ + { 0x2026, 0xC9 }, /* horizontal ellipsis */ + { 0x2030, 0xE4 }, /* per mille sign */ + { 0x2039, 0xDC }, /* single 
left-pointing angle quotation mark */ + { 0x203A, 0xDD }, /* single right-pointing angle quotation mark */ + { 0x2044, 0xDA }, /* fraction slash */ + { 0x20AC, 0xDB }, /* euro sign */ + { 0x2122, 0xAA }, /* trade mark sign */ + { 0x2202, 0xB6 }, /* partial differential */ + { 0x2206, 0xC6 }, /* increment */ + { 0x220F, 0xB8 }, /* n-ary product */ + { 0x2211, 0xB7 }, /* n-ary summation */ + { 0x221A, 0xC3 }, /* square root */ + { 0x221E, 0xB0 }, /* infinity */ + { 0x222B, 0xBA }, /* integral */ + { 0x2248, 0xC5 }, /* almost equal to */ + { 0x2260, 0xAD }, /* not equal to */ + { 0x2264, 0xB2 }, /* less-than or equal to */ + { 0x2265, 0xB3 }, /* greater-than or equal to */ + { 0x25CA, 0xD7 }, /* lozenge */ + { 0xF8FF, 0xF0 }, /* apple logo */ + { 0xFB01, 0xDE }, /* latin small ligature fi */ + { 0xFB02, 0xDF }, /* latin small ligature fl */ +}; + +#endif /* HTML_TABLES_H */ +/* + +----------------------------------------------------------------------+ + | PHP Version 5 | + +----------------------------------------------------------------------+ + | Copyright (c) 1997-2010 The PHP Group | + +----------------------------------------------------------------------+ + | This source file is subject to version 3.01 of the PHP license, | + | that is bundled with this package in the file LICENSE, and is | + | available through the world-wide-web at the following url: | + | http://www.php.net/license/3_01.txt | + | If you did not receive a copy of the PHP license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@php.net so we can mail you a copy immediately. 
| + +----------------------------------------------------------------------+ + | Author: Rasmus Lerdorf | + +----------------------------------------------------------------------+ +*/ + +/* $Id: html.h 293036 2010-01-03 09:23:27Z sebastian $ */ + +#ifndef HTML_TABLES_H +#define HTML_TABLES_H + +/* cs_terminator is overloaded in the following fashion: + * - It terminates the list of entity maps. + * - In BG(inverse_ent_maps), it's the key of the inverse map that stores + * only the basic entities. + * - When passed to traverse_for_entities (or via php_unescape_entities with !all), + * we don't care about the encoding (UTF-8 is chosen, but this value is meant + * for the cases where the encoding doesn't matter). + */ +enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252, + cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, + cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r, + cs_cp1251, cs_8859_5, cs_cp866, cs_macroman, + cs_numelems /* used to count the number of charsets */ + }; +typedef const char *const entity_table_t; + +/* codepage 1252 is a Windows extension to iso-8859-1. 
*/ +static entity_table_t ent_cp_1252[] = { + "euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger", + "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig", + NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo", + "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo", + "oelig", NULL, NULL, "Yuml" +}; + +static entity_table_t ent_iso_8859_1[] = { + "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", + "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg", + "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro", + "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14", + "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc", + "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave", + "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", + "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", + "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", + "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", + "atilde", "auml", "aring", "aelig", "ccedil", "egrave", + "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", + "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", + "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc", + "uuml", "yacute", "thorn", "yuml" +}; + +static entity_table_t ent_iso_8859_15[] = { + "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron", + "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg", + "macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */ + "micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm", + "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute", + "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave", + "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", + "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", + "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", + "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", + "atilde", "auml", "aring", "aelig", 
"ccedil", "egrave", + "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", + "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", + "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc", + "uuml", "yacute", "thorn", "yuml" +}; + +static entity_table_t ent_uni_338_402[] = { + /* 338 (0x0152) */ + "OElig", "oelig", NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 352 (0x0160) */ + "Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 376 (0x0178) */ + "Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 400 (0x0190) */ + NULL, NULL, "fnof" +}; + +static entity_table_t ent_uni_spacing[] = { + /* 710 */ + "circ", + /* 711 - 730 */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 731 - 732 */ + NULL, "tilde" +}; + +static entity_table_t ent_uni_greek[] = { + /* 913 */ + "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta", + "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho", + NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega", + /* 938 - 944 are not mapped */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", + "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", + "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega", + /* 970 - 976 are not mapped */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "thetasym", "upsih", + NULL, NULL, NULL, + "piv" +}; + +static entity_table_t ent_uni_punct[] = { + /* 8194 */ + "ensp", "emsp", NULL, NULL, NULL, NULL, NULL, + "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm", + NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL, + /* 8216 */ + "lsquo", "rsquo", "sbquo", 
NULL, "ldquo", "rdquo", "bdquo", NULL,
+	"dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip",
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL,
+	/* 8242 */
+	"prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL,
+	NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL,
+	"frasl"
+};
+
+static entity_table_t ent_uni_euro[] = { /* single-slot table; entity_map lists it for code point 8364 */
+	"euro"
+};
+
+static entity_table_t ent_uni_8465_8501[] = { /* entity names for code points 8465..8501; markers give the code point of the next named slot */
+	/* 8465 */
+	"image", NULL, NULL, NULL, NULL, NULL, NULL,
+	/* 8472 */
+	"weierp", NULL, NULL, NULL,
+	/* 8476 */
+	"real", NULL, NULL, NULL, NULL, NULL,
+	/* 8482 */
+	"trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	/* 8501 */
+	"alefsym",
+};
+
+static entity_table_t ent_uni_8592_9002[] = { /* entity names for code points 8592..9002, laid out 8 slots per row with a marker every 16 code points */
+	/* 8592 (0x2190) */
+	"larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	/* 8608 (0x21a0) */
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	/* 8624 (0x21b0) */
+	NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	/* 8640 (0x21c0) */
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	/* 8656 (0x21d0) */
+	"lArr", "uArr", "rArr", "dArr", "hArr", NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	/* 8672 (0x21e0) */
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	/* 8704 (0x2200) */
+	"forall", NULL, "part", "exist", NULL, "empty", NULL, "nabla",
+	"isin", "notin", NULL, "ni", NULL, NULL, NULL, "prod",
+	/* 8720 (0x2210) */
+	NULL, "sum", "minus", NULL, NULL, NULL, NULL, "lowast",
+	NULL, NULL, "radic", NULL, NULL, "prop", "infin", NULL,
+	/* 8736 (0x2220) */
+	"ang", NULL, NULL, NULL, NULL, NULL,
NULL, "and", + "or", "cap", "cup", "int", NULL, NULL, NULL, NULL, + /* 8752 (0x2230) */ + NULL, NULL, NULL, NULL, "there4", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, "sim", NULL, NULL, NULL, + /* 8768 (0x2240) */ + NULL, NULL, NULL, NULL, NULL, "cong", NULL, NULL, + "asymp", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8784 (0x2250) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8800 (0x2260) */ + "ne", "equiv", NULL, NULL, "le", "ge", NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8816 (0x2270) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8832 (0x2280) */ + NULL, NULL, "sub", "sup", "nsub", NULL, "sube", "supe", + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8848 (0x2290) */ + NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes", + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8864 (0x22a0) */ + NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8880 (0x22b0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8896 (0x22c0) */ + NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8912 (0x22d0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8928 (0x22e0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8944 (0x22f0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8960 (0x2300) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL, + /* 8976 (0x2310) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8992 (0x2320) */ + NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL,
+	NULL, "lang", "rang"
+};
+
+static entity_table_t ent_uni_9674[] = {
+	/* 9674 */
+	"loz"
+};
+
+static entity_table_t ent_uni_9824_9830[] = {
+	/* 9824 */
+	"spades", NULL, NULL, "clubs", NULL, "hearts", "diams"
+};
+
+/* KOI8-R: one slot per char code 0xa3..0xff (range as listed in entity_map),
+ * written as numeric character references */
+static entity_table_t ent_koi8r[] = {
+	"#1105", /* "jo "*/
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, "#1025", /* "JO" */
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	"#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092",
+	"#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084",
+	"#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090",
+	"#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096",
+	"#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041",
+	"#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048",
+	"#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055",
+	"#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042",
+	"#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063",
+	"#1066"
+};
+
+/* Windows-1251: one slot per char code 0x80..0xff (range as listed in entity_map).
+ * Slots 0x93/0x94 are the left/right double quotation marks U+201C/U+201D,
+ * i.e. #8220/#8221, matching unimap_win1251 (they used to read #8219/#8220). */
+static entity_table_t ent_cp_1251[] = {
+	"#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger",
+	"Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036",
+	"#1035", "#1039", "#1106", "#8216", "#8217", "#8220", "#8221",
+	"bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250",
+	"#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118",
+	"#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy",
+	"#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn",
+	"#1030", "#1110", "#1169", "micro", "para", "middot", "#1105",
+	"#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111",
+	"#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046",
+	"#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053",
+	"#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060",
+	"#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067",
+	"#1068", "#1069", "#1070", "#1071", "#1072", "#1073", "#1074",
+	"#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081",
+	"#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088",
+	"#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095",
+	"#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102",
+	"#1103"
+};
+
+/* ISO-8859-5: one slot per char code 0xc0..0xff (range as listed in entity_map).
+ * The run is sequential Cyrillic except for two code points of the charset:
+ * 0xf0 is the numero sign (U+2116, #8470) and 0xfd is the section sign (sect);
+ * these slots used to continue the sequence as #1104 and #1117. */
+static entity_table_t ent_iso_8859_5[] = {
+	"#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062",
+	"#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069",
+	"#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076",
+	"#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083",
+	"#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090",
+	"#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
+	"#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#8470",
+	"#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111",
+	"#1112", "#1113", "#1114", "#1115", "#1116", "sect", "#1118",
+	"#1119"
+};
+
+/* CP866: one slot per char code 0xc0..0xff (range as listed in entity_map) */
+static entity_table_t ent_cp_866[] = {
+
+	"#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566",
+	"#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552",
+	"#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560",
+	"#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608",
+	"#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090",
+	"#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
+	"#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025",
+	"#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118",
+	"#176", "#8729", "#183", "#8730", "#8470", "#164", "#9632",
+	"#160"
+};
+
+/* MacRoman has a couple of low-ascii chars that need mapping too */
+/* Vertical tab (ASCII 11) is often used to store line breaks inside */
+/* DB exports, this mapping changes it to a space */
+static entity_table_t ent_macroman[] = {
+	"sp", NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, "quot",
NULL, + NULL, NULL, "amp", NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "lt", NULL, "gt", NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml", + "Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring", + "ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave", + "icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml", + "otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg", + "cent", "pound", "sect", "bull", "para", "szlig", "reg", + "copy", "trade", "acute", "uml", "ne", "AElig", "Oslash", + "infin", "plusmn", "le", "ge", "yen", "micro", "part", + "sum", "prod", "pi", "int", "ordf", "ordm", "Omega", + "aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof", + "asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave", + "Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo", + "rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml", + "frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger", + "middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute", + "Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute", + "Ocirc", "#xF8FF", "Ograve", "Uacute", "Ucirc", "Ugrave", "#305", + "circ", "tilde", "macr", "#728", "#729", "#730", "cedil", + "#733", "#731", "#711" +}; + +struct html_entity_map { + enum entity_charset charset; /* charset identifier */ + unsigned int basechar; /* char code at start of table */ + unsigned int endchar; /* last char code in the table */ + entity_table_t *table; /* the table 
of mappings */ +}; + +static const struct html_entity_map entity_map[] = { + { cs_cp1252, 0x80, 0x9f, ent_cp_1252 }, + { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 }, + { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_utf_8, 338, 402, ent_uni_338_402 }, + { cs_utf_8, 710, 732, ent_uni_spacing }, + { cs_utf_8, 913, 982, ent_uni_greek }, + { cs_utf_8, 8194, 8260, ent_uni_punct }, + { cs_utf_8, 8364, 8364, ent_uni_euro }, + { cs_utf_8, 8465, 8501, ent_uni_8465_8501 }, + { cs_utf_8, 8592, 9002, ent_uni_8592_9002 }, + { cs_utf_8, 9674, 9674, ent_uni_9674 }, + { cs_utf_8, 9824, 9830, ent_uni_9824_9830 }, + { cs_big5, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_koi8r, 0xa3, 0xff, ent_koi8r }, + { cs_cp1251, 0x80, 0xff, ent_cp_1251 }, + { cs_8859_5, 0xc0, 0xff, ent_iso_8859_5 }, + { cs_cp866, 0xc0, 0xff, ent_cp_866 }, + { cs_macroman, 0x0b, 0xff, ent_macroman }, + { cs_terminator } +}; + +static const struct { + const char *codeset; + enum entity_charset charset; +} charset_map[] = { + { "ISO-8859-1", cs_8859_1 }, + { "ISO8859-1", cs_8859_1 }, + { "ISO-8859-15", cs_8859_15 }, + { "ISO8859-15", cs_8859_15 }, + { "utf-8", cs_utf_8 }, + { "cp1252", cs_cp1252 }, + { "Windows-1252", cs_cp1252 }, + { "1252", cs_cp1252 }, + { "BIG5", cs_big5 }, + { "950", cs_big5 }, + { "GB2312", cs_gb2312 }, + { "936", cs_gb2312 }, + { "BIG5-HKSCS", cs_big5hkscs }, + { "Shift_JIS", cs_sjis }, + { "SJIS", cs_sjis }, + { "932", cs_sjis }, + { "EUCJP", cs_eucjp }, + { "EUC-JP", cs_eucjp }, + { "KOI8-R", cs_koi8r }, + { "koi8-ru", cs_koi8r }, + { "koi8r", cs_koi8r }, + { "cp1251", cs_cp1251 }, + { "Windows-1251", cs_cp1251 }, + { "win-1251", cs_cp1251 }, + { "iso8859-5", cs_8859_5 }, + { "iso-8859-5", cs_8859_5 }, + { "cp866", cs_cp866 }, + { 
"866", cs_cp866 }, + { "ibm866", cs_cp866 }, + { "MacRoman", cs_macroman }, + { NULL } +}; + +typedef struct { + unsigned short charcode; + char *entity; + int entitylen; + int flags; +} basic_entity_t; + +static const basic_entity_t basic_entities_ex[] = { + { '&', "&", 5, 0 }, + { '"', """, 6, ENT_HTML_QUOTE_DOUBLE }, + /* PHP traditionally encodes ' as ', not ', so leave this entry here */ + { '\'', "'", 6, ENT_HTML_QUOTE_SINGLE }, + { '\'', "'", 6, ENT_HTML_QUOTE_SINGLE }, + { '<', "<", 4, 0 }, + { '>', ">", 4, 0 }, + { 0, NULL, 0, 0 } +}; + +/* In some cases, we need to give special treatment to &, so we + * use this instead */ +static const basic_entity_t *basic_entities = &basic_entities_ex[1]; + +typedef struct { + unsigned short un_code_point; /* we don't need bigger */ + unsigned char cs_code; /* currently, we only have maps to single-byte encodings */ +} unicode_mapping; + +static const unicode_mapping unimap_iso885915[] = { + { 0xA5, 0xA5 }, /* yen sign */ + { 0xA7, 0xA7 }, /* section sign */ + { 0xA9, 0xA9 }, /* copyright sign */ + { 0xAA, 0xAA }, /* feminine ordinal indicator */ + { 0xAB, 0xAB }, /* left-pointing double angle quotation mark */ + { 0xAC, 0xAC }, /* not sign */ + { 0xAD, 0xAD }, /* soft hyphen */ + { 0xAE, 0xAE }, /* registered sign */ + { 0xAF, 0xAF }, /* macron */ + { 0xB0, 0xB0 }, /* degree sign */ + { 0xB1, 0xB1 }, /* plus-minus sign */ + { 0xB2, 0xB2 }, /* superscript two */ + { 0xB3, 0xB3 }, /* superscript three */ + { 0xB5, 0xB5 }, /* micro sign */ + { 0xB6, 0xB6 }, /* pilcrow sign */ + { 0xB7, 0xB7 }, /* middle dot */ + { 0xB9, 0xB9 }, /* superscript one */ + { 0xBA, 0xBA }, /* masculine ordinal indicator */ + { 0xBB, 0xBB }, /* right-pointing double angle quotation mark */ + { 0x152, 0xBC }, /* latin capital ligature oe */ + { 0x153, 0xBD }, /* latin small ligature oe */ + { 0x160, 0xA6 }, /* latin capital letter s with caron */ + { 0x161, 0xA8 }, /* latin small letter s with caron */ + { 0x178, 0xBE }, /* latin capital letter 
y with diaeresis */
+	{ 0x17D, 0xB4 }, /* latin capital letter z with caron */
+	{ 0x17E, 0xB8 }, /* latin small letter z with caron */
+	{ 0x20AC, 0xA4 }, /* euro sign */
+};
+
+static const unicode_mapping unimap_win1252[] = { /* Unicode code point -> Windows-1252 byte (all targets lie in 0x80..0x9F); rows ascend by code point */
+	{ 0x152, 0x8C }, /* latin capital ligature oe */
+	{ 0x153, 0x9C }, /* latin small ligature oe */
+	{ 0x160, 0x8A }, /* latin capital letter s with caron */
+	{ 0x161, 0x9A }, /* latin small letter s with caron */
+	{ 0x178, 0x9F }, /* latin capital letter y with diaeresis */
+	{ 0x17D, 0x8E }, /* latin capital letter z with caron */
+	{ 0x17E, 0x9E }, /* latin small letter z with caron */
+	{ 0x192, 0x83 }, /* latin small letter f with hook */
+	{ 0x2C6, 0x88 }, /* modifier letter circumflex accent */
+	{ 0x2DC, 0x98 }, /* small tilde */
+	{ 0x2013, 0x96 }, /* en dash */
+	{ 0x2014, 0x97 }, /* em dash */
+	{ 0x2018, 0x91 }, /* left single quotation mark */
+	{ 0x2019, 0x92 }, /* right single quotation mark */
+	{ 0x201A, 0x82 }, /* single low-9 quotation mark */
+	{ 0x201C, 0x93 }, /* left double quotation mark */
+	{ 0x201D, 0x94 }, /* right double quotation mark */
+	{ 0x201E, 0x84 }, /* double low-9 quotation mark */
+	{ 0x2020, 0x86 }, /* dagger */
+	{ 0x2021, 0x87 }, /* double dagger */
+	{ 0x2022, 0x95 }, /* bullet */
+	{ 0x2026, 0x85 }, /* horizontal ellipsis */
+	{ 0x2030, 0x89 }, /* per mille sign */
+	{ 0x2039, 0x8B }, /* single left-pointing angle quotation mark */
+	{ 0x203A, 0x9B }, /* single right-pointing angle quotation mark */
+	{ 0x20AC, 0x80 }, /* euro sign */
+	{ 0x2122, 0x99 }, /* trade mark sign */
+};
+
+static const unicode_mapping unimap_win1251[] = { /* Unicode code point -> Windows-1251 byte; rows ascend by code point */
+	{ 0xA0, 0xA0 }, /* no-break space */
+	{ 0xA4, 0xA4 }, /* currency sign */
+	{ 0xA6, 0xA6 }, /* broken bar */
+	{ 0xA7, 0xA7 }, /* section sign */
+	{ 0xA9, 0xA9 }, /* copyright sign */
+	{ 0xAB, 0xAB }, /* left-pointing double angle quotation mark */
+	{ 0xAC, 0xAC }, /* not sign */
+	{ 0xAD, 0xAD }, /* soft hyphen */
+	{ 0xAE, 0xAE }, /* registered sign */
+	{
0xB0, 0xB0 }, /* degree sign */ + { 0xB1, 0xB1 }, /* plus-minus sign */ + { 0xB5, 0xB5 }, /* micro sign */ + { 0xB6, 0xB6 }, /* pilcrow sign */ + { 0xB7, 0xB7 }, /* middle dot */ + { 0xBB, 0xBB }, /* right-pointing double angle quotation mark */ + { 0x401, 0xA8 }, /* cyrillic capital letter io */ + { 0x402, 0x80 }, /* cyrillic capital letter dje */ + { 0x403, 0x81 }, /* cyrillic capital letter gje */ + { 0x404, 0xAA }, /* cyrillic capital letter ukrainian ie */ + { 0x405, 0xBD }, /* cyrillic capital letter dze */ + { 0x406, 0xB2 }, /* cyrillic capital letter byelorussian-ukrainian i */ + { 0x407, 0xAF }, /* cyrillic capital letter yi */ + { 0x408, 0xA3 }, /* cyrillic capital letter je */ + { 0x409, 0x8A }, /* cyrillic capital letter lje */ + { 0x40A, 0x8C }, /* cyrillic capital letter nje */ + { 0x40B, 0x8E }, /* cyrillic capital letter tshe */ + { 0x40C, 0x8D }, /* cyrillic capital letter kje */ + { 0x40E, 0xA1 }, /* cyrillic capital letter short u */ + { 0x40F, 0x8F }, /* cyrillic capital letter dzhe */ + { 0x410, 0xC0 }, /* cyrillic capital letter a */ + { 0x411, 0xC1 }, /* cyrillic capital letter be */ + { 0x412, 0xC2 }, /* cyrillic capital letter ve */ + { 0x413, 0xC3 }, /* cyrillic capital letter ghe */ + { 0x414, 0xC4 }, /* cyrillic capital letter de */ + { 0x415, 0xC5 }, /* cyrillic capital letter ie */ + { 0x416, 0xC6 }, /* cyrillic capital letter zhe */ + { 0x417, 0xC7 }, /* cyrillic capital letter ze */ + { 0x418, 0xC8 }, /* cyrillic capital letter i */ + { 0x419, 0xC9 }, /* cyrillic capital letter short i */ + { 0x41A, 0xCA }, /* cyrillic capital letter ka */ + { 0x41B, 0xCB }, /* cyrillic capital letter el */ + { 0x41C, 0xCC }, /* cyrillic capital letter em */ + { 0x41D, 0xCD }, /* cyrillic capital letter en */ + { 0x41E, 0xCE }, /* cyrillic capital letter o */ + { 0x41F, 0xCF }, /* cyrillic capital letter pe */ + { 0x420, 0xD0 }, /* cyrillic capital letter er */ + { 0x421, 0xD1 }, /* cyrillic capital letter es */ + { 0x422, 0xD2 }, /* cyrillic capital 
letter te */ + { 0x423, 0xD3 }, /* cyrillic capital letter u */ + { 0x424, 0xD4 }, /* cyrillic capital letter ef */ + { 0x425, 0xD5 }, /* cyrillic capital letter ha */ + { 0x426, 0xD6 }, /* cyrillic capital letter tse */ + { 0x427, 0xD7 }, /* cyrillic capital letter che */ + { 0x428, 0xD8 }, /* cyrillic capital letter sha */ + { 0x429, 0xD9 }, /* cyrillic capital letter shcha */ + { 0x42A, 0xDA }, /* cyrillic capital letter hard sign */ + { 0x42B, 0xDB }, /* cyrillic capital letter yeru */ + { 0x42C, 0xDC }, /* cyrillic capital letter soft sign */ + { 0x42D, 0xDD }, /* cyrillic capital letter e */ + { 0x42E, 0xDE }, /* cyrillic capital letter yu */ + { 0x42F, 0xDF }, /* cyrillic capital letter ya */ + { 0x430, 0xE0 }, /* cyrillic small letter a */ + { 0x431, 0xE1 }, /* cyrillic small letter be */ + { 0x432, 0xE2 }, /* cyrillic small letter ve */ + { 0x433, 0xE3 }, /* cyrillic small letter ghe */ + { 0x434, 0xE4 }, /* cyrillic small letter de */ + { 0x435, 0xE5 }, /* cyrillic small letter ie */ + { 0x436, 0xE6 }, /* cyrillic small letter zhe */ + { 0x437, 0xE7 }, /* cyrillic small letter ze */ + { 0x438, 0xE8 }, /* cyrillic small letter i */ + { 0x439, 0xE9 }, /* cyrillic small letter short i */ + { 0x43A, 0xEA }, /* cyrillic small letter ka */ + { 0x43B, 0xEB }, /* cyrillic small letter el */ + { 0x43C, 0xEC }, /* cyrillic small letter em */ + { 0x43D, 0xED }, /* cyrillic small letter en */ + { 0x43E, 0xEE }, /* cyrillic small letter o */ + { 0x43F, 0xEF }, /* cyrillic small letter pe */ + { 0x440, 0xF0 }, /* cyrillic small letter er */ + { 0x441, 0xF1 }, /* cyrillic small letter es */ + { 0x442, 0xF2 }, /* cyrillic small letter te */ + { 0x443, 0xF3 }, /* cyrillic small letter u */ + { 0x444, 0xF4 }, /* cyrillic small letter ef */ + { 0x445, 0xF5 }, /* cyrillic small letter ha */ + { 0x446, 0xF6 }, /* cyrillic small letter tse */ + { 0x447, 0xF7 }, /* cyrillic small letter che */ + { 0x448, 0xF8 }, /* cyrillic small letter sha */ + { 0x449, 0xF9 }, /* cyrillic 
small letter shcha */ + { 0x44A, 0xFA }, /* cyrillic small letter hard sign */ + { 0x44B, 0xFB }, /* cyrillic small letter yeru */ + { 0x44C, 0xFC }, /* cyrillic small letter soft sign */ + { 0x44D, 0xFD }, /* cyrillic small letter e */ + { 0x44E, 0xFE }, /* cyrillic small letter yu */ + { 0x44F, 0xFF }, /* cyrillic small letter ya */ + { 0x451, 0xB8 }, /* cyrillic small letter io */ + { 0x452, 0x90 }, /* cyrillic small letter dje */ + { 0x453, 0x83 }, /* cyrillic small letter gje */ + { 0x454, 0xBA }, /* cyrillic small letter ukrainian ie */ + { 0x455, 0xBE }, /* cyrillic small letter dze */ + { 0x456, 0xB3 }, /* cyrillic small letter byelorussian-ukrainian i */ + { 0x457, 0xBF }, /* cyrillic small letter yi */ + { 0x458, 0xBC }, /* cyrillic small letter je */ + { 0x459, 0x9A }, /* cyrillic small letter lje */ + { 0x45A, 0x9C }, /* cyrillic small letter nje */ + { 0x45B, 0x9E }, /* cyrillic small letter tshe */ + { 0x45C, 0x9D }, /* cyrillic small letter kje */ + { 0x45E, 0xA2 }, /* cyrillic small letter short u */ + { 0x45F, 0x9F }, /* cyrillic small letter dzhe */ + { 0x490, 0xA5 }, /* cyrillic capital letter ghe with upturn */ + { 0x491, 0xB4 }, /* cyrillic small letter ghe with upturn */ + { 0x2013, 0x96 }, /* en dash */ + { 0x2014, 0x97 }, /* em dash */ + { 0x2018, 0x91 }, /* left single quotation mark */ + { 0x2019, 0x92 }, /* right single quotation mark */ + { 0x201A, 0x82 }, /* single low-9 quotation mark */ + { 0x201C, 0x93 }, /* left double quotation mark */ + { 0x201D, 0x94 }, /* right double quotation mark */ + { 0x201E, 0x84 }, /* double low-9 quotation mark */ + { 0x2020, 0x86 }, /* dagger */ + { 0x2021, 0x87 }, /* double dagger */ + { 0x2022, 0x95 }, /* bullet */ + { 0x2026, 0x85 }, /* horizontal ellipsis */ + { 0x2030, 0x89 }, /* per mille sign */ + { 0x2039, 0x8B }, /* single left-pointing angle quotation mark */ + { 0x203A, 0x9B }, /* single right-pointing angle quotation mark */ + { 0x20AC, 0x88 }, /* euro sign */ + { 0x2116, 0xB9 }, /* numero 
sign */
+	{ 0x2122, 0x99 }, /* trade mark sign */
+};
+
+static const unicode_mapping unimap_koi8r[] = { /* Unicode code point -> KOI8-R byte; rows ascend by code point, so the byte column is not monotonic */
+	{ 0xA0, 0x9A }, /* no-break space */
+	{ 0xA9, 0xBF }, /* copyright sign */
+	{ 0xB0, 0x9C }, /* degree sign */
+	{ 0xB2, 0x9D }, /* superscript two */
+	{ 0xB7, 0x9E }, /* middle dot */
+	{ 0xF7, 0x9F }, /* division sign */
+	{ 0x401, 0xB3 }, /* cyrillic capital letter io */
+	{ 0x410, 0xE1 }, /* cyrillic capital letter a */
+	{ 0x411, 0xE2 }, /* cyrillic capital letter be */
+	{ 0x412, 0xF7 }, /* cyrillic capital letter ve */
+	{ 0x413, 0xE7 }, /* cyrillic capital letter ghe */
+	{ 0x414, 0xE4 }, /* cyrillic capital letter de */
+	{ 0x415, 0xE5 }, /* cyrillic capital letter ie */
+	{ 0x416, 0xF6 }, /* cyrillic capital letter zhe */
+	{ 0x417, 0xFA }, /* cyrillic capital letter ze */
+	{ 0x418, 0xE9 }, /* cyrillic capital letter i */
+	{ 0x419, 0xEA }, /* cyrillic capital letter short i */
+	{ 0x41A, 0xEB }, /* cyrillic capital letter ka */
+	{ 0x41B, 0xEC }, /* cyrillic capital letter el */
+	{ 0x41C, 0xED }, /* cyrillic capital letter em */
+	{ 0x41D, 0xEE }, /* cyrillic capital letter en */
+	{ 0x41E, 0xEF }, /* cyrillic capital letter o */
+	{ 0x41F, 0xF0 }, /* cyrillic capital letter pe */
+	{ 0x420, 0xF2 }, /* cyrillic capital letter er */
+	{ 0x421, 0xF3 }, /* cyrillic capital letter es */
+	{ 0x422, 0xF4 }, /* cyrillic capital letter te */
+	{ 0x423, 0xF5 }, /* cyrillic capital letter u */
+	{ 0x424, 0xE6 }, /* cyrillic capital letter ef */
+	{ 0x425, 0xE8 }, /* cyrillic capital letter ha */
+	{ 0x426, 0xE3 }, /* cyrillic capital letter tse */
+	{ 0x427, 0xFE }, /* cyrillic capital letter che */
+	{ 0x428, 0xFB }, /* cyrillic capital letter sha */
+	{ 0x429, 0xFD }, /* cyrillic capital letter shcha */
+	{ 0x42A, 0xFF }, /* cyrillic capital letter hard sign */
+	{ 0x42B, 0xF9 }, /* cyrillic capital letter yeru */
+	{ 0x42C, 0xF8 }, /* cyrillic capital letter soft sign */
+	{ 0x42D, 0xFC }, /* cyrillic capital letter e */
+	{ 0x42E, 0xE0 }, /* cyrillic
capital letter yu */ + { 0x42F, 0xF1 }, /* cyrillic capital letter ya */ + { 0x430, 0xC1 }, /* cyrillic small letter a */ + { 0x431, 0xC2 }, /* cyrillic small letter be */ + { 0x432, 0xD7 }, /* cyrillic small letter ve */ + { 0x433, 0xC7 }, /* cyrillic small letter ghe */ + { 0x434, 0xC4 }, /* cyrillic small letter de */ + { 0x435, 0xC5 }, /* cyrillic small letter ie */ + { 0x436, 0xD6 }, /* cyrillic small letter zhe */ + { 0x437, 0xDA }, /* cyrillic small letter ze */ + { 0x438, 0xC9 }, /* cyrillic small letter i */ + { 0x439, 0xCA }, /* cyrillic small letter short i */ + { 0x43A, 0xCB }, /* cyrillic small letter ka */ + { 0x43B, 0xCC }, /* cyrillic small letter el */ + { 0x43C, 0xCD }, /* cyrillic small letter em */ + { 0x43D, 0xCE }, /* cyrillic small letter en */ + { 0x43E, 0xCF }, /* cyrillic small letter o */ + { 0x43F, 0xD0 }, /* cyrillic small letter pe */ + { 0x440, 0xD2 }, /* cyrillic small letter er */ + { 0x441, 0xD3 }, /* cyrillic small letter es */ + { 0x442, 0xD4 }, /* cyrillic small letter te */ + { 0x443, 0xD5 }, /* cyrillic small letter u */ + { 0x444, 0xC6 }, /* cyrillic small letter ef */ + { 0x445, 0xC8 }, /* cyrillic small letter ha */ + { 0x446, 0xC3 }, /* cyrillic small letter tse */ + { 0x447, 0xDE }, /* cyrillic small letter che */ + { 0x448, 0xDB }, /* cyrillic small letter sha */ + { 0x449, 0xDD }, /* cyrillic small letter shcha */ + { 0x44A, 0xDF }, /* cyrillic small letter hard sign */ + { 0x44B, 0xD9 }, /* cyrillic small letter yeru */ + { 0x44C, 0xD8 }, /* cyrillic small letter soft sign */ + { 0x44D, 0xDC }, /* cyrillic small letter e */ + { 0x44E, 0xC0 }, /* cyrillic small letter yu */ + { 0x44F, 0xD1 }, /* cyrillic small letter ya */ + { 0x451, 0xA3 }, /* cyrillic small letter io */ + { 0x2219, 0x95 }, /* bullet operator */ + { 0x221A, 0x96 }, /* square root */ + { 0x2248, 0x97 }, /* almost equal to */ + { 0x2264, 0x98 }, /* less-than or equal to */ + { 0x2265, 0x99 }, /* greater-than or equal to */ + { 0x2320, 0x93 }, /* top half 
integral */ + { 0x2321, 0x9B }, /* bottom half integral */ + { 0x2500, 0x80 }, /* box drawings light horizontal */ + { 0x2502, 0x81 }, /* box drawings light vertical */ + { 0x250C, 0x82 }, /* box drawings light down and right */ + { 0x2510, 0x83 }, /* box drawings light down and left */ + { 0x2514, 0x84 }, /* box drawings light up and right */ + { 0x2518, 0x85 }, /* box drawings light up and left */ + { 0x251C, 0x86 }, /* box drawings light vertical and right */ + { 0x2524, 0x87 }, /* box drawings light vertical and left */ + { 0x252C, 0x88 }, /* box drawings light down and horizontal */ + { 0x2534, 0x89 }, /* box drawings light up and horizontal */ + { 0x253C, 0x8A }, /* box drawings light vertical and horizontal */ + { 0x2550, 0xA0 }, /* box drawings double horizontal */ + { 0x2551, 0xA1 }, /* box drawings double vertical */ + { 0x2552, 0xA2 }, /* box drawings down single and right double */ + { 0x2553, 0xA4 }, /* box drawings down double and right single */ + { 0x2554, 0xA5 }, /* box drawings double down and right */ + { 0x2555, 0xA6 }, /* box drawings down single and left double */ + { 0x2556, 0xA7 }, /* box drawings down double and left single */ + { 0x2557, 0xA8 }, /* box drawings double down and left */ + { 0x2558, 0xA9 }, /* box drawings up single and right double */ + { 0x2559, 0xAA }, /* box drawings up double and right single */ + { 0x255A, 0xAB }, /* box drawings double up and right */ + { 0x255B, 0xAC }, /* box drawings up single and left double */ + { 0x255C, 0xAD }, /* box drawings up double and left single */ + { 0x255D, 0xAE }, /* box drawings double up and left */ + { 0x255E, 0xAF }, /* box drawings vertical single and right double */ + { 0x255F, 0xB0 }, /* box drawings vertical double and right single */ + { 0x2560, 0xB1 }, /* box drawings double vertical and right */ + { 0x2561, 0xB2 }, /* box drawings vertical single and left double */ + { 0x2562, 0xB4 }, /* box drawings vertical double and left single */ + { 0x2563, 0xB5 }, /* box drawings 
double vertical and left */ + { 0x2564, 0xB6 }, /* box drawings down single and horizontal double */ + { 0x2565, 0xB7 }, /* box drawings down double and horizontal single */ + { 0x2566, 0xB8 }, /* box drawings double down and horizontal */ + { 0x2567, 0xB9 }, /* box drawings up single and horizontal double */ + { 0x2568, 0xBA }, /* box drawings up double and horizontal single */ + { 0x2569, 0xBB }, /* box drawings double up and horizontal */ + { 0x256A, 0xBC }, /* box drawings vertical single and horizontal double */ + { 0x256B, 0xBD }, /* box drawings vertical double and horizontal single */ + { 0x256C, 0xBE }, /* box drawings double vertical and horizontal */ + { 0x2580, 0x8B }, /* upper half block */ + { 0x2584, 0x8C }, /* lower half block */ + { 0x2588, 0x8D }, /* full block */ + { 0x258C, 0x8E }, /* left half block */ + { 0x2590, 0x8F }, /* right half block */ + { 0x2591, 0x90 }, /* light shade */ + { 0x2592, 0x91 }, /* medium shade */ + { 0x2593, 0x92 }, /* dark shade */ + { 0x25A0, 0x94 }, /* black square */ +}; + +static const unicode_mapping unimap_cp866[] = { + { 0xA0, 0xFF }, /* no-break space */ + { 0xA4, 0xFD }, /* currency sign */ + { 0xB0, 0xF8 }, /* degree sign */ + { 0xB7, 0xFA }, /* middle dot */ + { 0x401, 0xF0 }, /* cyrillic capital letter io */ + { 0x404, 0xF2 }, /* cyrillic capital letter ukrainian ie */ + { 0x407, 0xF4 }, /* cyrillic capital letter yi */ + { 0x40E, 0xF6 }, /* cyrillic capital letter short u */ + { 0x410, 0x80 }, /* cyrillic capital letter a */ + { 0x411, 0x81 }, /* cyrillic capital letter be */ + { 0x412, 0x82 }, /* cyrillic capital letter ve */ + { 0x413, 0x83 }, /* cyrillic capital letter ghe */ + { 0x414, 0x84 }, /* cyrillic capital letter de */ + { 0x415, 0x85 }, /* cyrillic capital letter ie */ + { 0x416, 0x86 }, /* cyrillic capital letter zhe */ + { 0x417, 0x87 }, /* cyrillic capital letter ze */ + { 0x418, 0x88 }, /* cyrillic capital letter i */ + { 0x419, 0x89 }, /* cyrillic capital letter short i */ + { 0x41A, 0x8A 
}, /* cyrillic capital letter ka */ + { 0x41B, 0x8B }, /* cyrillic capital letter el */ + { 0x41C, 0x8C }, /* cyrillic capital letter em */ + { 0x41D, 0x8D }, /* cyrillic capital letter en */ + { 0x41E, 0x8E }, /* cyrillic capital letter o */ + { 0x41F, 0x8F }, /* cyrillic capital letter pe */ + { 0x420, 0x90 }, /* cyrillic capital letter er */ + { 0x421, 0x91 }, /* cyrillic capital letter es */ + { 0x422, 0x92 }, /* cyrillic capital letter te */ + { 0x423, 0x93 }, /* cyrillic capital letter u */ + { 0x424, 0x94 }, /* cyrillic capital letter ef */ + { 0x425, 0x95 }, /* cyrillic capital letter ha */ + { 0x426, 0x96 }, /* cyrillic capital letter tse */ + { 0x427, 0x97 }, /* cyrillic capital letter che */ + { 0x428, 0x98 }, /* cyrillic capital letter sha */ + { 0x429, 0x99 }, /* cyrillic capital letter shcha */ + { 0x42A, 0x9A }, /* cyrillic capital letter hard sign */ + { 0x42B, 0x9B }, /* cyrillic capital letter yeru */ + { 0x42C, 0x9C }, /* cyrillic capital letter soft sign */ + { 0x42D, 0x9D }, /* cyrillic capital letter e */ + { 0x42E, 0x9E }, /* cyrillic capital letter yu */ + { 0x42F, 0x9F }, /* cyrillic capital letter ya */ + { 0x430, 0xA0 }, /* cyrillic small letter a */ + { 0x431, 0xA1 }, /* cyrillic small letter be */ + { 0x432, 0xA2 }, /* cyrillic small letter ve */ + { 0x433, 0xA3 }, /* cyrillic small letter ghe */ + { 0x434, 0xA4 }, /* cyrillic small letter de */ + { 0x435, 0xA5 }, /* cyrillic small letter ie */ + { 0x436, 0xA6 }, /* cyrillic small letter zhe */ + { 0x437, 0xA7 }, /* cyrillic small letter ze */ + { 0x438, 0xA8 }, /* cyrillic small letter i */ + { 0x439, 0xA9 }, /* cyrillic small letter short i */ + { 0x43A, 0xAA }, /* cyrillic small letter ka */ + { 0x43B, 0xAB }, /* cyrillic small letter el */ + { 0x43C, 0xAC }, /* cyrillic small letter em */ + { 0x43D, 0xAD }, /* cyrillic small letter en */ + { 0x43E, 0xAE }, /* cyrillic small letter o */ + { 0x43F, 0xAF }, /* cyrillic small letter pe */ + { 0x440, 0xE0 }, /* cyrillic small letter er */ + { 0x441, 0xE1 }, /* cyrillic small letter es */ 
+ { 0x442, 0xE2 }, /* cyrillic small letter te */ + { 0x443, 0xE3 }, /* cyrillic small letter u */ + { 0x444, 0xE4 }, /* cyrillic small letter ef */ + { 0x445, 0xE5 }, /* cyrillic small letter ha */ + { 0x446, 0xE6 }, /* cyrillic small letter tse */ + { 0x447, 0xE7 }, /* cyrillic small letter che */ + { 0x448, 0xE8 }, /* cyrillic small letter sha */ + { 0x449, 0xE9 }, /* cyrillic small letter shcha */ + { 0x44A, 0xEA }, /* cyrillic small letter hard sign */ + { 0x44B, 0xEB }, /* cyrillic small letter yeru */ + { 0x44C, 0xEC }, /* cyrillic small letter soft sign */ + { 0x44D, 0xED }, /* cyrillic small letter e */ + { 0x44E, 0xEE }, /* cyrillic small letter yu */ + { 0x44F, 0xEF }, /* cyrillic small letter ya */ + { 0x451, 0xF1 }, /* cyrillic small letter io */ + { 0x454, 0xF3 }, /* cyrillic small letter ukrainian ie */ + { 0x457, 0xF5 }, /* cyrillic small letter yi */ + { 0x45E, 0xF7 }, /* cyrillic small letter short u */ + { 0x2116, 0xFC }, /* numero sign */ + { 0x2219, 0xF9 }, /* bullet operator */ + { 0x221A, 0xFB }, /* square root */ + { 0x2500, 0xC4 }, /* box drawings light horizontal */ + { 0x2502, 0xB3 }, /* box drawings light vertical */ + { 0x250C, 0xDA }, /* box drawings light down and right */ + { 0x2510, 0xBF }, /* box drawings light down and left */ + { 0x2514, 0xC0 }, /* box drawings light up and right */ + { 0x2518, 0xD9 }, /* box drawings light up and left */ + { 0x251C, 0xC3 }, /* box drawings light vertical and right */ + { 0x2524, 0xB4 }, /* box drawings light vertical and left */ + { 0x252C, 0xC2 }, /* box drawings light down and horizontal */ + { 0x2534, 0xC1 }, /* box drawings light up and horizontal */ + { 0x253C, 0xC5 }, /* box drawings light vertical and horizontal */ + { 0x2550, 0xCD }, /* box drawings double horizontal */ + { 0x2551, 0xBA }, /* box drawings double vertical */ + { 0x2552, 0xD5 }, /* box drawings down single and right double */ + { 0x2553, 0xD6 }, /* box drawings down double and right single */ + { 0x2554, 0xC9 }, /* box 
drawings double down and right */ + { 0x2555, 0xB8 }, /* box drawings down single and left double */ + { 0x2556, 0xB7 }, /* box drawings down double and left single */ + { 0x2557, 0xBB }, /* box drawings double down and left */ + { 0x2558, 0xD4 }, /* box drawings up single and right double */ + { 0x2559, 0xD3 }, /* box drawings up double and right single */ + { 0x255A, 0xC8 }, /* box drawings double up and right */ + { 0x255B, 0xBE }, /* box drawings up single and left double */ + { 0x255C, 0xBD }, /* box drawings up double and left single */ + { 0x255D, 0xBC }, /* box drawings double up and left */ + { 0x255E, 0xC6 }, /* box drawings vertical single and right double */ + { 0x255F, 0xC7 }, /* box drawings vertical double and right single */ + { 0x2560, 0xCC }, /* box drawings double vertical and right */ + { 0x2561, 0xB5 }, /* box drawings vertical single and left double */ + { 0x2562, 0xB6 }, /* box drawings vertical double and left single */ + { 0x2563, 0xB9 }, /* box drawings double vertical and left */ + { 0x2564, 0xD1 }, /* box drawings down single and horizontal double */ + { 0x2565, 0xD2 }, /* box drawings down double and horizontal single */ + { 0x2566, 0xCB }, /* box drawings double down and horizontal */ + { 0x2567, 0xCF }, /* box drawings up single and horizontal double */ + { 0x2568, 0xD0 }, /* box drawings up double and horizontal single */ + { 0x2569, 0xCA }, /* box drawings double up and horizontal */ + { 0x256A, 0xD8 }, /* box drawings vertical single and horizontal double */ + { 0x256B, 0xD7 }, /* box drawings vertical double and horizontal single */ + { 0x256C, 0xCE }, /* box drawings double vertical and horizontal */ + { 0x2580, 0xDF }, /* upper half block */ + { 0x2584, 0xDC }, /* lower half block */ + { 0x2588, 0xDB }, /* full block */ + { 0x258C, 0xDD }, /* left half block */ + { 0x2590, 0xDE }, /* right half block */ + { 0x2591, 0xB0 }, /* light shade */ + { 0x2592, 0xB1 }, /* medium shade */ + { 0x2593, 0xB2 }, /* dark shade */ + { 0x25A0, 
0xFE }, /* black square */ +}; + +static const unicode_mapping unimap_macroman[] = { + { 0xA0, 0xCA }, /* no-break space */ + { 0xA1, 0xC1 }, /* inverted exclamation mark */ + { 0xA2, 0xA2 }, /* cent sign */ + { 0xA3, 0xA3 }, /* pound sign */ + { 0xA5, 0xB4 }, /* yen sign */ + { 0xA7, 0xA4 }, /* section sign */ + { 0xA8, 0xAC }, /* diaeresis */ + { 0xA9, 0xA9 }, /* copyright sign */ + { 0xAA, 0xBB }, /* feminine ordinal indicator */ + { 0xAB, 0xC7 }, /* left-pointing double angle quotation mark */ + { 0xAC, 0xC2 }, /* not sign */ + { 0xAE, 0xA8 }, /* registered sign */ + { 0xAF, 0xF8 }, /* macron */ + { 0xB0, 0xA1 }, /* degree sign */ + { 0xB1, 0xB1 }, /* plus-minus sign */ + { 0xB4, 0xAB }, /* acute accent */ + { 0xB5, 0xB5 }, /* micro sign */ + { 0xB6, 0xA6 }, /* pilcrow sign */ + { 0xB7, 0xE1 }, /* middle dot */ + { 0xB8, 0xFC }, /* cedilla */ + { 0xBA, 0xBC }, /* masculine ordinal indicator */ + { 0xBB, 0xC8 }, /* right-pointing double angle quotation mark */ + { 0xBF, 0xC0 }, /* inverted question mark */ + { 0xC0, 0xCB }, /* latin capital letter a with grave */ + { 0xC1, 0xE7 }, /* latin capital letter a with acute */ + { 0xC2, 0xE5 }, /* latin capital letter a with circumflex */ + { 0xC3, 0xCC }, /* latin capital letter a with tilde */ + { 0xC4, 0x80 }, /* latin capital letter a with diaeresis */ + { 0xC5, 0x81 }, /* latin capital letter a with ring above */ + { 0xC6, 0xAE }, /* latin capital letter ae */ + { 0xC7, 0x82 }, /* latin capital letter c with cedilla */ + { 0xC8, 0xE9 }, /* latin capital letter e with grave */ + { 0xC9, 0x83 }, /* latin capital letter e with acute */ + { 0xCA, 0xE6 }, /* latin capital letter e with circumflex */ + { 0xCB, 0xE8 }, /* latin capital letter e with diaeresis */ + { 0xCC, 0xED }, /* latin capital letter i with grave */ + { 0xCD, 0xEA }, /* latin capital letter i with acute */ + { 0xCE, 0xEB }, /* latin capital letter i with circumflex */ + { 0xCF, 0xEC }, /* latin capital letter i with diaeresis */ + { 0xD1, 0x84 }, /* 
latin capital letter n with tilde */ + { 0xD2, 0xF1 }, /* latin capital letter o with grave */ + { 0xD3, 0xEE }, /* latin capital letter o with acute */ + { 0xD4, 0xEF }, /* latin capital letter o with circumflex */ + { 0xD5, 0xCD }, /* latin capital letter o with tilde */ + { 0xD6, 0x85 }, /* latin capital letter o with diaeresis */ + { 0xD8, 0xAF }, /* latin capital letter o with stroke */ + { 0xD9, 0xF4 }, /* latin capital letter u with grave */ + { 0xDA, 0xF2 }, /* latin capital letter u with acute */ + { 0xDB, 0xF3 }, /* latin capital letter u with circumflex */ + { 0xDC, 0x86 }, /* latin capital letter u with diaeresis */ + { 0xDF, 0xA7 }, /* latin small letter sharp s */ + { 0xE0, 0x88 }, /* latin small letter a with grave */ + { 0xE1, 0x87 }, /* latin small letter a with acute */ + { 0xE2, 0x89 }, /* latin small letter a with circumflex */ + { 0xE3, 0x8B }, /* latin small letter a with tilde */ + { 0xE4, 0x8A }, /* latin small letter a with diaeresis */ + { 0xE5, 0x8C }, /* latin small letter a with ring above */ + { 0xE6, 0xBE }, /* latin small letter ae */ + { 0xE7, 0x8D }, /* latin small letter c with cedilla */ + { 0xE8, 0x8F }, /* latin small letter e with grave */ + { 0xE9, 0x8E }, /* latin small letter e with acute */ + { 0xEA, 0x90 }, /* latin small letter e with circumflex */ + { 0xEB, 0x91 }, /* latin small letter e with diaeresis */ + { 0xEC, 0x93 }, /* latin small letter i with grave */ + { 0xED, 0x92 }, /* latin small letter i with acute */ + { 0xEE, 0x94 }, /* latin small letter i with circumflex */ + { 0xEF, 0x95 }, /* latin small letter i with diaeresis */ + { 0xF1, 0x96 }, /* latin small letter n with tilde */ + { 0xF2, 0x98 }, /* latin small letter o with grave */ + { 0xF3, 0x97 }, /* latin small letter o with acute */ + { 0xF4, 0x99 }, /* latin small letter o with circumflex */ + { 0xF5, 0x9B }, /* latin small letter o with tilde */ + { 0xF6, 0x9A }, /* latin small letter o with diaeresis */ + { 0xF7, 0xD6 }, /* division sign */ + { 0xF8, 
0xBF }, /* latin small letter o with stroke */ + { 0xF9, 0x9D }, /* latin small letter u with grave */ + { 0xFA, 0x9C }, /* latin small letter u with acute */ + { 0xFB, 0x9E }, /* latin small letter u with circumflex */ + { 0xFC, 0x9F }, /* latin small letter u with diaeresis */ + { 0xFF, 0xD8 }, /* latin small letter y with diaeresis */ + { 0x131, 0xF5 }, /* latin small letter dotless i */ + { 0x152, 0xCE }, /* latin capital ligature oe */ + { 0x153, 0xCF }, /* latin small ligature oe */ + { 0x178, 0xD9 }, /* latin capital letter y with diaeresis */ + { 0x192, 0xC4 }, /* latin small letter f with hook */ + { 0x2C6, 0xF6 }, /* modifier letter circumflex accent */ + { 0x2C7, 0xFF }, /* caron */ + { 0x2D8, 0xF9 }, /* breve */ + { 0x2D9, 0xFA }, /* dot above */ + { 0x2DA, 0xFB }, /* ring above */ + { 0x2DB, 0xFE }, /* ogonek */ + { 0x2DC, 0xF7 }, /* small tilde */ + { 0x2DD, 0xFD }, /* double acute accent */ + { 0x3A9, 0xBD }, /* greek capital letter omega */ + { 0x3C0, 0xB9 }, /* greek small letter pi */ + { 0x2013, 0xD0 }, /* en dash */ + { 0x2014, 0xD1 }, /* em dash */ + { 0x2018, 0xD4 }, /* left single quotation mark */ + { 0x2019, 0xD5 }, /* right single quotation mark */ + { 0x201A, 0xE2 }, /* single low-9 quotation mark */ + { 0x201C, 0xD2 }, /* left double quotation mark */ + { 0x201D, 0xD3 }, /* right double quotation mark */ + { 0x201E, 0xE3 }, /* double low-9 quotation mark */ + { 0x2020, 0xA0 }, /* dagger */ + { 0x2021, 0xE0 }, /* double dagger */ + { 0x2022, 0xA5 }, /* bullet */ + { 0x2026, 0xC9 }, /* horizontal ellipsis */ + { 0x2030, 0xE4 }, /* per mille sign */ + { 0x2039, 0xDC }, /* single left-pointing angle quotation mark */ + { 0x203A, 0xDD }, /* single right-pointing angle quotation mark */ + { 0x2044, 0xDA }, /* fraction slash */ + { 0x20AC, 0xDB }, /* euro sign */ + { 0x2122, 0xAA }, /* trade mark sign */ + { 0x2202, 0xB6 }, /* partial differential */ + { 0x2206, 0xC6 }, /* increment */ + { 0x220F, 0xB8 }, /* n-ary product */ + { 0x2211, 0xB7 }, 
/* n-ary summation */ + { 0x221A, 0xC3 }, /* square root */ + { 0x221E, 0xB0 }, /* infinity */ + { 0x222B, 0xBA }, /* integral */ + { 0x2248, 0xC5 }, /* almost equal to */ + { 0x2260, 0xAD }, /* not equal to */ + { 0x2264, 0xB2 }, /* less-than or equal to */ + { 0x2265, 0xB3 }, /* greater-than or equal to */ + { 0x25CA, 0xD7 }, /* lozenge */ + { 0xF8FF, 0xF0 }, /* apple logo */ + { 0xFB01, 0xDE }, /* latin small ligature fi */ + { 0xFB02, 0xDF }, /* latin small ligature fl */ +}; + +#endif /* HTML_TABLES_H */ diff --git a/ext/standard/tests/strings/get_html_translation_table_basic1.phpt b/ext/standard/tests/strings/get_html_translation_table_basic1.phpt index c09388335b..8b6c9afdaa 100644 --- a/ext/standard/tests/strings/get_html_translation_table_basic1.phpt +++ b/ext/standard/tests/strings/get_html_translation_table_basic1.phpt @@ -43,14 +43,14 @@ echo "Done\n"; *** Testing get_html_translation_table() : basic functionality *** -- with default arguments -- array(4) { + ["&"]=> + string(5) "&" ["""]=> string(6) """ ["<"]=> string(4) "<" [">"]=> string(4) ">" - ["&"]=> - string(5) "&" } -- with table = HTML_ENTITIES -- array(171) { @@ -400,13 +400,13 @@ array(171) { } -- with table = HTML_SPECIALCHARS -- array(4) { + ["&"]=> + string(5) "&" ["""]=> string(6) """ ["<"]=> string(4) "<" [">"]=> string(4) ">" - ["&"]=> - string(5) "&" } Done diff --git a/ext/standard/tests/strings/html_entity_decode_cp866.phpt b/ext/standard/tests/strings/html_entity_decode_cp866.phpt new file mode 100644 index 0000000000..94b23b6660 --- /dev/null +++ b/ext/standard/tests/strings/html_entity_decode_cp866.phpt @@ -0,0 +1,533 @@ +--TEST-- +Translation of HTML entities for encoding CP866 +--FILE-- + array(0x80, "CYRILLIC CAPITAL LETTER A"), +0x0411 => array(0x81, "CYRILLIC CAPITAL LETTER BE"), +0x0412 => array(0x82, "CYRILLIC CAPITAL LETTER VE"), +0x0413 => array(0x83, "CYRILLIC CAPITAL LETTER GHE"), +0x0414 => array(0x84, "CYRILLIC CAPITAL LETTER DE"), +0x0415 => array(0x85, "CYRILLIC 
CAPITAL LETTER IE"), +0x0416 => array(0x86, "CYRILLIC CAPITAL LETTER ZHE"), +0x0417 => array(0x87, "CYRILLIC CAPITAL LETTER ZE"), +0x0418 => array(0x88, "CYRILLIC CAPITAL LETTER I"), +0x0419 => array(0x89, "CYRILLIC CAPITAL LETTER SHORT I"), +0x041a => array(0x8a, "CYRILLIC CAPITAL LETTER KA"), +0x041b => array(0x8b, "CYRILLIC CAPITAL LETTER EL"), +0x041c => array(0x8c, "CYRILLIC CAPITAL LETTER EM"), +0x041d => array(0x8d, "CYRILLIC CAPITAL LETTER EN"), +0x041e => array(0x8e, "CYRILLIC CAPITAL LETTER O"), +0x041f => array(0x8f, "CYRILLIC CAPITAL LETTER PE"), +0x0420 => array(0x90, "CYRILLIC CAPITAL LETTER ER"), +0x0421 => array(0x91, "CYRILLIC CAPITAL LETTER ES"), +0x0422 => array(0x92, "CYRILLIC CAPITAL LETTER TE"), +0x0423 => array(0x93, "CYRILLIC CAPITAL LETTER U"), +0x0424 => array(0x94, "CYRILLIC CAPITAL LETTER EF"), +0x0425 => array(0x95, "CYRILLIC CAPITAL LETTER HA"), +0x0426 => array(0x96, "CYRILLIC CAPITAL LETTER TSE"), +0x0427 => array(0x97, "CYRILLIC CAPITAL LETTER CHE"), +0x0428 => array(0x98, "CYRILLIC CAPITAL LETTER SHA"), +0x0429 => array(0x99, "CYRILLIC CAPITAL LETTER SHCHA"), +0x042a => array(0x9a, "CYRILLIC CAPITAL LETTER HARD SIGN"), +0x042b => array(0x9b, "CYRILLIC CAPITAL LETTER YERU"), +0x042c => array(0x9c, "CYRILLIC CAPITAL LETTER SOFT SIGN"), +0x042d => array(0x9d, "CYRILLIC CAPITAL LETTER E"), +0x042e => array(0x9e, "CYRILLIC CAPITAL LETTER YU"), +0x042f => array(0x9f, "CYRILLIC CAPITAL LETTER YA"), +0x0430 => array(0xa0, "CYRILLIC SMALL LETTER A"), +0x0431 => array(0xa1, "CYRILLIC SMALL LETTER BE"), +0x0432 => array(0xa2, "CYRILLIC SMALL LETTER VE"), +0x0433 => array(0xa3, "CYRILLIC SMALL LETTER GHE"), +0x0434 => array(0xa4, "CYRILLIC SMALL LETTER DE"), +0x0435 => array(0xa5, "CYRILLIC SMALL LETTER IE"), +0x0436 => array(0xa6, "CYRILLIC SMALL LETTER ZHE"), +0x0437 => array(0xa7, "CYRILLIC SMALL LETTER ZE"), +0x0438 => array(0xa8, "CYRILLIC SMALL LETTER I"), +0x0439 => array(0xa9, "CYRILLIC SMALL LETTER SHORT I"), +0x043a => array(0xaa, 
"CYRILLIC SMALL LETTER KA"), +0x043b => array(0xab, "CYRILLIC SMALL LETTER EL"), +0x043c => array(0xac, "CYRILLIC SMALL LETTER EM"), +0x043d => array(0xad, "CYRILLIC SMALL LETTER EN"), +0x043e => array(0xae, "CYRILLIC SMALL LETTER O"), +0x043f => array(0xaf, "CYRILLIC SMALL LETTER PE"), +0x2591 => array(0xb0, "LIGHT SHADE"), +0x2592 => array(0xb1, "MEDIUM SHADE"), +0x2593 => array(0xb2, "DARK SHADE"), +0x2502 => array(0xb3, "BOX DRAWINGS LIGHT VERTICAL"), +0x2524 => array(0xb4, "BOX DRAWINGS LIGHT VERTICAL AND LEFT"), +0x2561 => array(0xb5, "BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE"), +0x2562 => array(0xb6, "BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE"), +0x2556 => array(0xb7, "BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE"), +0x2555 => array(0xb8, "BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE"), +0x2563 => array(0xb9, "BOX DRAWINGS DOUBLE VERTICAL AND LEFT"), +0x2551 => array(0xba, "BOX DRAWINGS DOUBLE VERTICAL"), +0x2557 => array(0xbb, "BOX DRAWINGS DOUBLE DOWN AND LEFT"), +0x255d => array(0xbc, "BOX DRAWINGS DOUBLE UP AND LEFT"), +0x255c => array(0xbd, "BOX DRAWINGS UP DOUBLE AND LEFT SINGLE"), +0x255b => array(0xbe, "BOX DRAWINGS UP SINGLE AND LEFT DOUBLE"), +0x2510 => array(0xbf, "BOX DRAWINGS LIGHT DOWN AND LEFT"), +0x2514 => array(0xc0, "BOX DRAWINGS LIGHT UP AND RIGHT"), +0x2534 => array(0xc1, "BOX DRAWINGS LIGHT UP AND HORIZONTAL"), +0x252c => array(0xc2, "BOX DRAWINGS LIGHT DOWN AND HORIZONTAL"), +0x251c => array(0xc3, "BOX DRAWINGS LIGHT VERTICAL AND RIGHT"), +0x2500 => array(0xc4, "BOX DRAWINGS LIGHT HORIZONTAL"), +0x253c => array(0xc5, "BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL"), +0x255e => array(0xc6, "BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE"), +0x255f => array(0xc7, "BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE"), +0x255a => array(0xc8, "BOX DRAWINGS DOUBLE UP AND RIGHT"), +0x2554 => array(0xc9, "BOX DRAWINGS DOUBLE DOWN AND RIGHT"), +0x2569 => array(0xca, "BOX DRAWINGS DOUBLE UP AND HORIZONTAL"), +0x2566 => array(0xcb, "BOX DRAWINGS DOUBLE DOWN 
AND HORIZONTAL"), +0x2560 => array(0xcc, "BOX DRAWINGS DOUBLE VERTICAL AND RIGHT"), +0x2550 => array(0xcd, "BOX DRAWINGS DOUBLE HORIZONTAL"), +0x256c => array(0xce, "BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL"), +0x2567 => array(0xcf, "BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE"), +0x2568 => array(0xd0, "BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE"), +0x2564 => array(0xd1, "BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE"), +0x2565 => array(0xd2, "BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE"), +0x2559 => array(0xd3, "BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE"), +0x2558 => array(0xd4, "BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE"), +0x2552 => array(0xd5, "BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE"), +0x2553 => array(0xd6, "BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE"), +0x256b => array(0xd7, "BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE"), +0x256a => array(0xd8, "BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE"), +0x2518 => array(0xd9, "BOX DRAWINGS LIGHT UP AND LEFT"), +0x250c => array(0xda, "BOX DRAWINGS LIGHT DOWN AND RIGHT"), +0x2588 => array(0xdb, "FULL BLOCK"), +0x2584 => array(0xdc, "LOWER HALF BLOCK"), +0x258c => array(0xdd, "LEFT HALF BLOCK"), +0x2590 => array(0xde, "RIGHT HALF BLOCK"), +0x2580 => array(0xdf, "UPPER HALF BLOCK"), +0x0440 => array(0xe0, "CYRILLIC SMALL LETTER ER"), +0x0441 => array(0xe1, "CYRILLIC SMALL LETTER ES"), +0x0442 => array(0xe2, "CYRILLIC SMALL LETTER TE"), +0x0443 => array(0xe3, "CYRILLIC SMALL LETTER U"), +0x0444 => array(0xe4, "CYRILLIC SMALL LETTER EF"), +0x0445 => array(0xe5, "CYRILLIC SMALL LETTER HA"), +0x0446 => array(0xe6, "CYRILLIC SMALL LETTER TSE"), +0x0447 => array(0xe7, "CYRILLIC SMALL LETTER CHE"), +0x0448 => array(0xe8, "CYRILLIC SMALL LETTER SHA"), +0x0449 => array(0xe9, "CYRILLIC SMALL LETTER SHCHA"), +0x044a => array(0xea, "CYRILLIC SMALL LETTER HARD SIGN"), +0x044b => array(0xeb, "CYRILLIC SMALL LETTER YERU"), +0x044c => array(0xec, "CYRILLIC SMALL LETTER SOFT SIGN"), +0x044d => array(0xed, "CYRILLIC 
SMALL LETTER E"), +0x044e => array(0xee, "CYRILLIC SMALL LETTER YU"), +0x044f => array(0xef, "CYRILLIC SMALL LETTER YA"), +0x0401 => array(0xf0, "CYRILLIC CAPITAL LETTER IO"), +0x0451 => array(0xf1, "CYRILLIC SMALL LETTER IO"), +0x0404 => array(0xf2, "CYRILLIC CAPITAL LETTER UKRAINIAN IE"), +0x0454 => array(0xf3, "CYRILLIC SMALL LETTER UKRAINIAN IE"), +0x0407 => array(0xf4, "CYRILLIC CAPITAL LETTER YI"), +0x0457 => array(0xf5, "CYRILLIC SMALL LETTER YI"), +0x040e => array(0xf6, "CYRILLIC CAPITAL LETTER SHORT U"), +0x045e => array(0xf7, "CYRILLIC SMALL LETTER SHORT U"), +0x00b0 => array(0xf8, "DEGREE SIGN"), +0x2219 => array(0xf9, "BULLET OPERATOR"), +0x00b7 => array(0xfa, "MIDDLE DOT"), +0x221a => array(0xfb, "SQUARE ROOT"), +0x2116 => array(0xfc, "NUMERO SIGN"), +0x00a4 => array(0xfd, "CURRENCY SIGN"), +0x25a0 => array(0xfe, "BLACK SQUARE"), +0x00a0 => array(0xff, "NO-BREAK SPACE"), +); + +foreach ($arr as $u => $v) { + $ent = sprintf("&#x%X;", $u); + $res = html_entity_decode($ent, ENT_QUOTES, 'CP866'); + $d = unpack("H*", $res); + echo sprintf("%s: %s => %s\n", $v[1], $ent, $d[1]); + + $ent = sprintf("&#x%X;", $v[0]); + $res = html_entity_decode($ent, ENT_QUOTES, 'CP866'); + if ($res[0] != "&" || $res[1] != "#") + $res = unpack("H*", $res)[1]; + echo sprintf("%s => %s\n\n", $ent, $res); +} +--EXPECT-- +CYRILLIC CAPITAL LETTER A: А => 80 +€ => € + +CYRILLIC CAPITAL LETTER BE: Б => 81 + =>  + +CYRILLIC CAPITAL LETTER VE: В => 82 +‚ => ‚ + +CYRILLIC CAPITAL LETTER GHE: Г => 83 +ƒ => ƒ + +CYRILLIC CAPITAL LETTER DE: Д => 84 +„ => „ + +CYRILLIC CAPITAL LETTER IE: Е => 85 +… => … + +CYRILLIC CAPITAL LETTER ZHE: Ж => 86 +† => † + +CYRILLIC CAPITAL LETTER ZE: З => 87 +‡ => ‡ + +CYRILLIC CAPITAL LETTER I: И => 88 +ˆ => ˆ + +CYRILLIC CAPITAL LETTER SHORT I: Й => 89 +‰ => ‰ + +CYRILLIC CAPITAL LETTER KA: К => 8a +Š => Š + +CYRILLIC CAPITAL LETTER EL: Л => 8b +‹ => ‹ + +CYRILLIC CAPITAL LETTER EM: М => 8c +Œ => Œ + +CYRILLIC CAPITAL LETTER EN: Н => 8d + =>  + +CYRILLIC 
CAPITAL LETTER O: О => 8e +Ž => Ž + +CYRILLIC CAPITAL LETTER PE: П => 8f + =>  + +CYRILLIC CAPITAL LETTER ER: Р => 90 + =>  + +CYRILLIC CAPITAL LETTER ES: С => 91 +‘ => ‘ + +CYRILLIC CAPITAL LETTER TE: Т => 92 +’ => ’ + +CYRILLIC CAPITAL LETTER U: У => 93 +“ => “ + +CYRILLIC CAPITAL LETTER EF: Ф => 94 +” => ” + +CYRILLIC CAPITAL LETTER HA: Х => 95 +• => • + +CYRILLIC CAPITAL LETTER TSE: Ц => 96 +– => – + +CYRILLIC CAPITAL LETTER CHE: Ч => 97 +— => — + +CYRILLIC CAPITAL LETTER SHA: Ш => 98 +˜ => ˜ + +CYRILLIC CAPITAL LETTER SHCHA: Щ => 99 +™ => ™ + +CYRILLIC CAPITAL LETTER HARD SIGN: Ъ => 9a +š => š + +CYRILLIC CAPITAL LETTER YERU: Ы => 9b +› => › + +CYRILLIC CAPITAL LETTER SOFT SIGN: Ь => 9c +œ => œ + +CYRILLIC CAPITAL LETTER E: Э => 9d + =>  + +CYRILLIC CAPITAL LETTER YU: Ю => 9f +ž => ž + +CYRILLIC CAPITAL LETTER YA: Я => 2623783432463b +Ÿ => Ÿ + +CYRILLIC SMALL LETTER A: а => a0 +  => ff + +CYRILLIC SMALL LETTER BE: б => a1 +¡ => ¡ + +CYRILLIC SMALL LETTER VE: в => a2 +¢ => ¢ + +CYRILLIC SMALL LETTER GHE: г => a3 +£ => £ + +CYRILLIC SMALL LETTER DE: д => a4 +¤ => fd + +CYRILLIC SMALL LETTER IE: е => a5 +¥ => ¥ + +CYRILLIC SMALL LETTER ZHE: ж => a6 +¦ => ¦ + +CYRILLIC SMALL LETTER ZE: з => a7 +§ => § + +CYRILLIC SMALL LETTER I: и => a8 +¨ => ¨ + +CYRILLIC SMALL LETTER SHORT I: й => a9 +© => © + +CYRILLIC SMALL LETTER KA: к => aa +ª => ª + +CYRILLIC SMALL LETTER EL: л => ab +« => « + +CYRILLIC SMALL LETTER EM: м => ac +¬ => ¬ + +CYRILLIC SMALL LETTER EN: н => ad +­ => ­ + +CYRILLIC SMALL LETTER O: о => ae +® => ® + +CYRILLIC SMALL LETTER PE: п => af +¯ => ¯ + +LIGHT SHADE: ░ => b0 +° => f8 + +MEDIUM SHADE: ▒ => b1 +± => ± + +DARK SHADE: ▓ => b2 +² => ² + +BOX DRAWINGS LIGHT VERTICAL: │ => b3 +³ => ³ + +BOX DRAWINGS LIGHT VERTICAL AND LEFT: ┤ => b4 +´ => ´ + +BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE: ╡ => b5 +µ => µ + +BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE: ╢ => b6 +¶ => ¶ + +BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE: ╖ => b7 +· => fa + +BOX 
DRAWINGS DOWN SINGLE AND LEFT DOUBLE: ╕ => b8 +¸ => ¸ + +BOX DRAWINGS DOUBLE VERTICAL AND LEFT: ╣ => b9 +¹ => ¹ + +BOX DRAWINGS DOUBLE VERTICAL: ║ => ba +º => º + +BOX DRAWINGS DOUBLE DOWN AND LEFT: ╗ => bb +» => » + +BOX DRAWINGS DOUBLE UP AND LEFT: ╝ => bc +¼ => ¼ + +BOX DRAWINGS UP DOUBLE AND LEFT SINGLE: ╜ => bd +½ => ½ + +BOX DRAWINGS UP SINGLE AND LEFT DOUBLE: ╛ => be +¾ => ¾ + +BOX DRAWINGS LIGHT DOWN AND LEFT: ┐ => bf +¿ => ¿ + +BOX DRAWINGS LIGHT UP AND RIGHT: └ => c0 +À => À + +BOX DRAWINGS LIGHT UP AND HORIZONTAL: ┴ => c1 +Á => Á + +BOX DRAWINGS LIGHT DOWN AND HORIZONTAL: ┬ => c2 + =>  + +BOX DRAWINGS LIGHT VERTICAL AND RIGHT: ├ => c3 +à => à + +BOX DRAWINGS LIGHT HORIZONTAL: ─ => c4 +Ä => Ä + +BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL: ┼ => c5 +Å => Å + +BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE: ╞ => c6 +Æ => Æ + +BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE: ╟ => c7 +Ç => Ç + +BOX DRAWINGS DOUBLE UP AND RIGHT: ╚ => c8 +È => È + +BOX DRAWINGS DOUBLE DOWN AND RIGHT: ╔ => c9 +É => É + +BOX DRAWINGS DOUBLE UP AND HORIZONTAL: ╩ => ca +Ê => Ê + +BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL: ╦ => cb +Ë => Ë + +BOX DRAWINGS DOUBLE VERTICAL AND RIGHT: ╠ => cc +Ì => Ì + +BOX DRAWINGS DOUBLE HORIZONTAL: ═ => cd +Í => Í + +BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL: ╬ => ce +Î => Î + +BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE: ╧ => cf +Ï => Ï + +BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE: ╨ => d0 +Ð => Ð + +BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE: ╤ => d1 +Ñ => Ñ + +BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE: ╥ => d2 +Ò => Ò + +BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE: ╙ => d3 +Ó => Ó + +BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE: ╘ => d4 +Ô => Ô + +BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE: ╒ => d5 +Õ => Õ + +BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE: ╓ => d6 +Ö => Ö + +BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE: ╫ => d7 +× => × + +BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE: ╪ => d8 +Ø => Ø + +BOX DRAWINGS LIGHT UP AND LEFT: ┘ => d9 
+Ù => Ù + +BOX DRAWINGS LIGHT DOWN AND RIGHT: ┌ => da +Ú => Ú + +FULL BLOCK: █ => db +Û => Û + +LOWER HALF BLOCK: ▄ => dc +Ü => Ü + +LEFT HALF BLOCK: ▌ => dd +Ý => Ý + +RIGHT HALF BLOCK: ▐ => de +Þ => Þ + +UPPER HALF BLOCK: ▀ => df +ß => ß + +CYRILLIC SMALL LETTER ER: р => e0 +à => à + +CYRILLIC SMALL LETTER ES: с => e1 +á => á + +CYRILLIC SMALL LETTER TE: т => e2 +â => â + +CYRILLIC SMALL LETTER U: у => e3 +ã => ã + +CYRILLIC SMALL LETTER EF: ф => e4 +ä => ä + +CYRILLIC SMALL LETTER HA: х => e5 +å => å + +CYRILLIC SMALL LETTER TSE: ц => e6 +æ => æ + +CYRILLIC SMALL LETTER CHE: ч => e7 +ç => ç + +CYRILLIC SMALL LETTER SHA: ш => e8 +è => è + +CYRILLIC SMALL LETTER SHCHA: щ => e9 +é => é + +CYRILLIC SMALL LETTER HARD SIGN: ъ => ea +ê => ê + +CYRILLIC SMALL LETTER YERU: ы => eb +ë => ë + +CYRILLIC SMALL LETTER SOFT SIGN: ь => ec +ì => ì + +CYRILLIC SMALL LETTER E: э => ed +í => í + +CYRILLIC SMALL LETTER YU: ю => ee +î => î + +CYRILLIC SMALL LETTER YA: я => ef +ï => ï + +CYRILLIC CAPITAL LETTER IO: Ё => f0 +ð => ð + +CYRILLIC SMALL LETTER IO: ё => f1 +ñ => ñ + +CYRILLIC CAPITAL LETTER UKRAINIAN IE: Є => f2 +ò => ò + +CYRILLIC SMALL LETTER UKRAINIAN IE: є => f3 +ó => ó + +CYRILLIC CAPITAL LETTER YI: Ї => f4 +ô => ô + +CYRILLIC SMALL LETTER YI: ї => f5 +õ => õ + +CYRILLIC CAPITAL LETTER SHORT U: Ў => f6 +ö => ö + +CYRILLIC SMALL LETTER SHORT U: ў => f7 +÷ => ÷ + +DEGREE SIGN: ° => f8 +ø => ø + +BULLET OPERATOR: ∙ => f9 +ù => ù + +MIDDLE DOT: · => fa +ú => ú + +SQUARE ROOT: √ => fb +û => û + +NUMERO SIGN: № => fc +ü => ü + +CURRENCY SIGN: ¤ => fd +ý => ý + +BLACK SQUARE: ■ => fe +þ => þ + +NO-BREAK SPACE:   => ff +ÿ => ÿ + + diff --git a/ext/standard/tests/strings/html_entity_decode_iso8859-15.phpt b/ext/standard/tests/strings/html_entity_decode_iso8859-15.phpt new file mode 100644 index 0000000000..a3be8f3668 --- /dev/null +++ b/ext/standard/tests/strings/html_entity_decode_iso8859-15.phpt @@ -0,0 +1,405 @@ +--TEST-- +Translation of HTML entities for encoding 
ISO-8859-15 +--FILE-- + array(0xA0, "NO-BREAK SPACE"), +0x00A1 => array(0xA1, "INVERTED EXCLAMATION MARK"), +0x00A2 => array(0xA2, "CENT SIGN"), +0x00A3 => array(0xA3, "POUND SIGN"), +0x20AC => array(0xA4, "EURO SIGN"), +0x00A5 => array(0xA5, "YEN SIGN"), +0x0160 => array(0xA6, "LATIN CAPITAL LETTER S WITH CARON"), +0x00A7 => array(0xA7, "SECTION SIGN"), +0x0161 => array(0xA8, "LATIN SMALL LETTER S WITH CARON"), +0x00A9 => array(0xA9, "COPYRIGHT SIGN"), +0x00AA => array(0xAA, "FEMININE ORDINAL INDICATOR"), +0x00AB => array(0xAB, "LEFT-POINTING DOUBLE ANGLE QUOTATION MARK"), +0x00AC => array(0xAC, "NOT SIGN"), +0x00AD => array(0xAD, "SOFT HYPHEN"), +0x00AE => array(0xAE, "REGISTERED SIGN"), +0x00AF => array(0xAF, "MACRON"), +0x00B0 => array(0xB0, "DEGREE SIGN"), +0x00B1 => array(0xB1, "PLUS-MINUS SIGN"), +0x00B2 => array(0xB2, "SUPERSCRIPT TWO"), +0x00B3 => array(0xB3, "SUPERSCRIPT THREE"), +0x017D => array(0xB4, "LATIN CAPITAL LETTER Z WITH CARON"), +0x00B5 => array(0xB5, "MICRO SIGN"), +0x00B6 => array(0xB6, "PILCROW SIGN"), +0x00B7 => array(0xB7, "MIDDLE DOT"), +0x017E => array(0xB8, "LATIN SMALL LETTER Z WITH CARON"), +0x00B9 => array(0xB9, "SUPERSCRIPT ONE"), +0x00BA => array(0xBA, "MASCULINE ORDINAL INDICATOR"), +0x00BB => array(0xBB, "RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK"), +0x0152 => array(0xBC, "LATIN CAPITAL LIGATURE OE"), +0x0153 => array(0xBD, "LATIN SMALL LIGATURE OE"), +0x0178 => array(0xBE, "LATIN CAPITAL LETTER Y WITH DIAERESIS"), +0x00BF => array(0xBF, "INVERTED QUESTION MARK"), +0x00C0 => array(0xC0, "LATIN CAPITAL LETTER A WITH GRAVE"), +0x00C1 => array(0xC1, "LATIN CAPITAL LETTER A WITH ACUTE"), +0x00C2 => array(0xC2, "LATIN CAPITAL LETTER A WITH CIRCUMFLEX"), +0x00C3 => array(0xC3, "LATIN CAPITAL LETTER A WITH TILDE"), +0x00C4 => array(0xC4, "LATIN CAPITAL LETTER A WITH DIAERESIS"), +0x00C5 => array(0xC5, "LATIN CAPITAL LETTER A WITH RING ABOVE"), +0x00C6 => array(0xC6, "LATIN CAPITAL LETTER AE"), +0x00C7 => array(0xC7, "LATIN CAPITAL 
LETTER C WITH CEDILLA"), +0x00C8 => array(0xC8, "LATIN CAPITAL LETTER E WITH GRAVE"), +0x00C9 => array(0xC9, "LATIN CAPITAL LETTER E WITH ACUTE"), +0x00CA => array(0xCA, "LATIN CAPITAL LETTER E WITH CIRCUMFLEX"), +0x00CB => array(0xCB, "LATIN CAPITAL LETTER E WITH DIAERESIS"), +0x00CC => array(0xCC, "LATIN CAPITAL LETTER I WITH GRAVE"), +0x00CD => array(0xCD, "LATIN CAPITAL LETTER I WITH ACUTE"), +0x00CE => array(0xCE, "LATIN CAPITAL LETTER I WITH CIRCUMFLEX"), +0x00CF => array(0xCF, "LATIN CAPITAL LETTER I WITH DIAERESIS"), +0x00D0 => array(0xD0, "LATIN CAPITAL LETTER ETH"), +0x00D1 => array(0xD1, "LATIN CAPITAL LETTER N WITH TILDE"), +0x00D2 => array(0xD2, "LATIN CAPITAL LETTER O WITH GRAVE"), +0x00D3 => array(0xD3, "LATIN CAPITAL LETTER O WITH ACUTE"), +0x00D4 => array(0xD4, "LATIN CAPITAL LETTER O WITH CIRCUMFLEX"), +0x00D5 => array(0xD5, "LATIN CAPITAL LETTER O WITH TILDE"), +0x00D6 => array(0xD6, "LATIN CAPITAL LETTER O WITH DIAERESIS"), +0x00D7 => array(0xD7, "MULTIPLICATION SIGN"), +0x00D8 => array(0xD8, "LATIN CAPITAL LETTER O WITH STROKE"), +0x00D9 => array(0xD9, "LATIN CAPITAL LETTER U WITH GRAVE"), +0x00DA => array(0xDA, "LATIN CAPITAL LETTER U WITH ACUTE"), +0x00DB => array(0xDB, "LATIN CAPITAL LETTER U WITH CIRCUMFLEX"), +0x00DC => array(0xDC, "LATIN CAPITAL LETTER U WITH DIAERESIS"), +0x00DD => array(0xDD, "LATIN CAPITAL LETTER Y WITH ACUTE"), +0x00DE => array(0xDE, "LATIN CAPITAL LETTER THORN"), +0x00DF => array(0xDF, "LATIN SMALL LETTER SHARP S"), +0x00E0 => array(0xE0, "LATIN SMALL LETTER A WITH GRAVE"), +0x00E1 => array(0xE1, "LATIN SMALL LETTER A WITH ACUTE"), +0x00E2 => array(0xE2, "LATIN SMALL LETTER A WITH CIRCUMFLEX"), +0x00E3 => array(0xE3, "LATIN SMALL LETTER A WITH TILDE"), +0x00E4 => array(0xE4, "LATIN SMALL LETTER A WITH DIAERESIS"), +0x00E5 => array(0xE5, "LATIN SMALL LETTER A WITH RING ABOVE"), +0x00E6 => array(0xE6, "LATIN SMALL LETTER AE"), +0x00E7 => array(0xE7, "LATIN SMALL LETTER C WITH CEDILLA"), +0x00E8 => array(0xE8, "LATIN 
SMALL LETTER E WITH GRAVE"), +0x00E9 => array(0xE9, "LATIN SMALL LETTER E WITH ACUTE"), +0x00EA => array(0xEA, "LATIN SMALL LETTER E WITH CIRCUMFLEX"), +0x00EB => array(0xEB, "LATIN SMALL LETTER E WITH DIAERESIS"), +0x00EC => array(0xEC, "LATIN SMALL LETTER I WITH GRAVE"), +0x00ED => array(0xED, "LATIN SMALL LETTER I WITH ACUTE"), +0x00EE => array(0xEE, "LATIN SMALL LETTER I WITH CIRCUMFLEX"), +0x00EF => array(0xEF, "LATIN SMALL LETTER I WITH DIAERESIS"), +0x00F0 => array(0xF0, "LATIN SMALL LETTER ETH"), +0x00F1 => array(0xF1, "LATIN SMALL LETTER N WITH TILDE"), +0x00F2 => array(0xF2, "LATIN SMALL LETTER O WITH GRAVE"), +0x00F3 => array(0xF3, "LATIN SMALL LETTER O WITH ACUTE"), +0x00F4 => array(0xF4, "LATIN SMALL LETTER O WITH CIRCUMFLEX"), +0x00F5 => array(0xF5, "LATIN SMALL LETTER O WITH TILDE"), +0x00F6 => array(0xF6, "LATIN SMALL LETTER O WITH DIAERESIS"), +0x00F7 => array(0xF7, "DIVISION SIGN"), +0x00F8 => array(0xF8, "LATIN SMALL LETTER O WITH STROKE"), +0x00F9 => array(0xF9, "LATIN SMALL LETTER U WITH GRAVE"), +0x00FA => array(0xFA, "LATIN SMALL LETTER U WITH ACUTE"), +0x00FB => array(0xFB, "LATIN SMALL LETTER U WITH CIRCUMFLEX"), +0x00FC => array(0xFC, "LATIN SMALL LETTER U WITH DIAERESIS"), +0x00FD => array(0xFD, "LATIN SMALL LETTER Y WITH ACUTE"), +0x00FE => array(0xFE, "LATIN SMALL LETTER THORN"), +0x00FF => array(0xFF, "LATIN SMALL LETTER Y WITH DIAERESIS"), +); + +foreach ($arr as $u => $v) { + $ent = sprintf("&#x%X;", $u); + $res = html_entity_decode($ent, ENT_QUOTES, 'ISO-8859-15'); + $d = unpack("H*", $res); + echo sprintf("%s: %s => %s\n", $v[1], $ent, $d[1]); + + $ent = sprintf("&#x%X;", $v[0]); + $res = html_entity_decode($ent, ENT_QUOTES, 'ISO-8859-15'); + if ($res[0] != "&" || $res[1] != "#") + $res = unpack("H*", $res)[1]; + echo sprintf("%s => %s\n\n", $ent, $res); +} +--EXPECT-- +NO-BREAK SPACE:   => a0 +  => a0 + +INVERTED EXCLAMATION MARK: ¡ => a1 +¡ => a1 + +CENT SIGN: ¢ => a2 +¢ => a2 + +POUND SIGN: £ => a3 +£ => a3 + +EURO SIGN: € => a4 
+¤ => ¤ + +YEN SIGN: ¥ => a5 +¥ => a5 + +LATIN CAPITAL LETTER S WITH CARON: Š => a6 +¦ => ¦ + +SECTION SIGN: § => a7 +§ => a7 + +LATIN SMALL LETTER S WITH CARON: š => a8 +¨ => ¨ + +COPYRIGHT SIGN: © => a9 +© => a9 + +FEMININE ORDINAL INDICATOR: ª => aa +ª => aa + +LEFT-POINTING DOUBLE ANGLE QUOTATION MARK: « => ab +« => ab + +NOT SIGN: ¬ => ac +¬ => ac + +SOFT HYPHEN: ­ => ad +­ => ad + +REGISTERED SIGN: ® => ae +® => ae + +MACRON: ¯ => af +¯ => af + +DEGREE SIGN: ° => b0 +° => b0 + +PLUS-MINUS SIGN: ± => b1 +± => b1 + +SUPERSCRIPT TWO: ² => b2 +² => b2 + +SUPERSCRIPT THREE: ³ => b3 +³ => b3 + +LATIN CAPITAL LETTER Z WITH CARON: Ž => b4 +´ => ´ + +MICRO SIGN: µ => b5 +µ => b5 + +PILCROW SIGN: ¶ => b6 +¶ => b6 + +MIDDLE DOT: · => b7 +· => b7 + +LATIN SMALL LETTER Z WITH CARON: ž => b8 +¸ => ¸ + +SUPERSCRIPT ONE: ¹ => b9 +¹ => b9 + +MASCULINE ORDINAL INDICATOR: º => ba +º => ba + +RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK: » => bb +» => bb + +LATIN CAPITAL LIGATURE OE: Œ => bc +¼ => ¼ + +LATIN SMALL LIGATURE OE: œ => bd +½ => ½ + +LATIN CAPITAL LETTER Y WITH DIAERESIS: Ÿ => be +¾ => ¾ + +INVERTED QUESTION MARK: ¿ => bf +¿ => bf + +LATIN CAPITAL LETTER A WITH GRAVE: À => c0 +À => c0 + +LATIN CAPITAL LETTER A WITH ACUTE: Á => c1 +Á => c1 + +LATIN CAPITAL LETTER A WITH CIRCUMFLEX:  => c2 + => c2 + +LATIN CAPITAL LETTER A WITH TILDE: à => c3 +à => c3 + +LATIN CAPITAL LETTER A WITH DIAERESIS: Ä => c4 +Ä => c4 + +LATIN CAPITAL LETTER A WITH RING ABOVE: Å => c5 +Å => c5 + +LATIN CAPITAL LETTER AE: Æ => c6 +Æ => c6 + +LATIN CAPITAL LETTER C WITH CEDILLA: Ç => c7 +Ç => c7 + +LATIN CAPITAL LETTER E WITH GRAVE: È => c8 +È => c8 + +LATIN CAPITAL LETTER E WITH ACUTE: É => c9 +É => c9 + +LATIN CAPITAL LETTER E WITH CIRCUMFLEX: Ê => ca +Ê => ca + +LATIN CAPITAL LETTER E WITH DIAERESIS: Ë => cb +Ë => cb + +LATIN CAPITAL LETTER I WITH GRAVE: Ì => cc +Ì => cc + +LATIN CAPITAL LETTER I WITH ACUTE: Í => cd +Í => cd + +LATIN CAPITAL LETTER I WITH CIRCUMFLEX: Î => ce +Î => ce + +LATIN 
CAPITAL LETTER I WITH DIAERESIS: Ï => cf +Ï => cf + +LATIN CAPITAL LETTER ETH: Ð => d0 +Ð => d0 + +LATIN CAPITAL LETTER N WITH TILDE: Ñ => d1 +Ñ => d1 + +LATIN CAPITAL LETTER O WITH GRAVE: Ò => d2 +Ò => d2 + +LATIN CAPITAL LETTER O WITH ACUTE: Ó => d3 +Ó => d3 + +LATIN CAPITAL LETTER O WITH CIRCUMFLEX: Ô => d4 +Ô => d4 + +LATIN CAPITAL LETTER O WITH TILDE: Õ => d5 +Õ => d5 + +LATIN CAPITAL LETTER O WITH DIAERESIS: Ö => d6 +Ö => d6 + +MULTIPLICATION SIGN: × => d7 +× => d7 + +LATIN CAPITAL LETTER O WITH STROKE: Ø => d8 +Ø => d8 + +LATIN CAPITAL LETTER U WITH GRAVE: Ù => d9 +Ù => d9 + +LATIN CAPITAL LETTER U WITH ACUTE: Ú => da +Ú => da + +LATIN CAPITAL LETTER U WITH CIRCUMFLEX: Û => db +Û => db + +LATIN CAPITAL LETTER U WITH DIAERESIS: Ü => dc +Ü => dc + +LATIN CAPITAL LETTER Y WITH ACUTE: Ý => dd +Ý => dd + +LATIN CAPITAL LETTER THORN: Þ => de +Þ => de + +LATIN SMALL LETTER SHARP S: ß => df +ß => df + +LATIN SMALL LETTER A WITH GRAVE: à => e0 +à => e0 + +LATIN SMALL LETTER A WITH ACUTE: á => e1 +á => e1 + +LATIN SMALL LETTER A WITH CIRCUMFLEX: â => e2 +â => e2 + +LATIN SMALL LETTER A WITH TILDE: ã => e3 +ã => e3 + +LATIN SMALL LETTER A WITH DIAERESIS: ä => e4 +ä => e4 + +LATIN SMALL LETTER A WITH RING ABOVE: å => e5 +å => e5 + +LATIN SMALL LETTER AE: æ => e6 +æ => e6 + +LATIN SMALL LETTER C WITH CEDILLA: ç => e7 +ç => e7 + +LATIN SMALL LETTER E WITH GRAVE: è => e8 +è => e8 + +LATIN SMALL LETTER E WITH ACUTE: é => e9 +é => e9 + +LATIN SMALL LETTER E WITH CIRCUMFLEX: ê => ea +ê => ea + +LATIN SMALL LETTER E WITH DIAERESIS: ë => eb +ë => eb + +LATIN SMALL LETTER I WITH GRAVE: ì => ec +ì => ec + +LATIN SMALL LETTER I WITH ACUTE: í => ed +í => ed + +LATIN SMALL LETTER I WITH CIRCUMFLEX: î => ee +î => ee + +LATIN SMALL LETTER I WITH DIAERESIS: ï => ef +ï => ef + +LATIN SMALL LETTER ETH: ð => f0 +ð => f0 + +LATIN SMALL LETTER N WITH TILDE: ñ => f1 +ñ => f1 + +LATIN SMALL LETTER O WITH GRAVE: ò => f2 +ò => f2 + +LATIN SMALL LETTER O WITH ACUTE: ó => f3 +ó => f3 + +LATIN 
SMALL LETTER O WITH CIRCUMFLEX: ô => f4 +ô => f4 + +LATIN SMALL LETTER O WITH TILDE: õ => f5 +õ => f5 + +LATIN SMALL LETTER O WITH DIAERESIS: ö => f6 +ö => f6 + +DIVISION SIGN: ÷ => f7 +÷ => f7 + +LATIN SMALL LETTER O WITH STROKE: ø => f8 +ø => f8 + +LATIN SMALL LETTER U WITH GRAVE: ù => f9 +ù => f9 + +LATIN SMALL LETTER U WITH ACUTE: ú => fa +ú => fa + +LATIN SMALL LETTER U WITH CIRCUMFLEX: û => fb +û => fb + +LATIN SMALL LETTER U WITH DIAERESIS: ü => fc +ü => fc + +LATIN SMALL LETTER Y WITH ACUTE: ý => fd +ý => fd + +LATIN SMALL LETTER THORN: þ => fe +þ => fe + +LATIN SMALL LETTER Y WITH DIAERESIS: ÿ => ff +ÿ => ff + + diff --git a/ext/standard/tests/strings/html_entity_decode_iso8859-5.phpt b/ext/standard/tests/strings/html_entity_decode_iso8859-5.phpt new file mode 100644 index 0000000000..6a65413c9c --- /dev/null +++ b/ext/standard/tests/strings/html_entity_decode_iso8859-5.phpt @@ -0,0 +1,405 @@ +--TEST-- +Translation of HTML entities for encoding ISO-8859-5 +--FILE-- + array(0xA0, "NO-BREAK SPACE"), +0x0401 => array(0xA1, "CYRILLIC CAPITAL LETTER IO"), +0x0402 => array(0xA2, "CYRILLIC CAPITAL LETTER DJE"), +0x0403 => array(0xA3, "CYRILLIC CAPITAL LETTER GJE"), +0x0404 => array(0xA4, "CYRILLIC CAPITAL LETTER UKRAINIAN IE"), +0x0405 => array(0xA5, "CYRILLIC CAPITAL LETTER DZE"), +0x0406 => array(0xA6, "CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I"), +0x0407 => array(0xA7, "CYRILLIC CAPITAL LETTER YI"), +0x0408 => array(0xA8, "CYRILLIC CAPITAL LETTER JE"), +0x0409 => array(0xA9, "CYRILLIC CAPITAL LETTER LJE"), +0x040A => array(0xAA, "CYRILLIC CAPITAL LETTER NJE"), +0x040B => array(0xAB, "CYRILLIC CAPITAL LETTER TSHE"), +0x040C => array(0xAC, "CYRILLIC CAPITAL LETTER KJE"), +0x00AD => array(0xAD, "SOFT HYPHEN"), +0x040E => array(0xAE, "CYRILLIC CAPITAL LETTER SHORT U"), +0x040F => array(0xAF, "CYRILLIC CAPITAL LETTER DZHE"), +0x0410 => array(0xB0, "CYRILLIC CAPITAL LETTER A"), +0x0411 => array(0xB1, "CYRILLIC CAPITAL LETTER BE"), +0x0412 => array(0xB2, 
"CYRILLIC CAPITAL LETTER VE"), +0x0413 => array(0xB3, "CYRILLIC CAPITAL LETTER GHE"), +0x0414 => array(0xB4, "CYRILLIC CAPITAL LETTER DE"), +0x0415 => array(0xB5, "CYRILLIC CAPITAL LETTER IE"), +0x0416 => array(0xB6, "CYRILLIC CAPITAL LETTER ZHE"), +0x0417 => array(0xB7, "CYRILLIC CAPITAL LETTER ZE"), +0x0418 => array(0xB8, "CYRILLIC CAPITAL LETTER I"), +0x0419 => array(0xB9, "CYRILLIC CAPITAL LETTER SHORT I"), +0x041A => array(0xBA, "CYRILLIC CAPITAL LETTER KA"), +0x041B => array(0xBB, "CYRILLIC CAPITAL LETTER EL"), +0x041C => array(0xBC, "CYRILLIC CAPITAL LETTER EM"), +0x041D => array(0xBD, "CYRILLIC CAPITAL LETTER EN"), +0x041E => array(0xBE, "CYRILLIC CAPITAL LETTER O"), +0x041F => array(0xBF, "CYRILLIC CAPITAL LETTER PE"), +0x0420 => array(0xC0, "CYRILLIC CAPITAL LETTER ER"), +0x0421 => array(0xC1, "CYRILLIC CAPITAL LETTER ES"), +0x0422 => array(0xC2, "CYRILLIC CAPITAL LETTER TE"), +0x0423 => array(0xC3, "CYRILLIC CAPITAL LETTER U"), +0x0424 => array(0xC4, "CYRILLIC CAPITAL LETTER EF"), +0x0425 => array(0xC5, "CYRILLIC CAPITAL LETTER HA"), +0x0426 => array(0xC6, "CYRILLIC CAPITAL LETTER TSE"), +0x0427 => array(0xC7, "CYRILLIC CAPITAL LETTER CHE"), +0x0428 => array(0xC8, "CYRILLIC CAPITAL LETTER SHA"), +0x0429 => array(0xC9, "CYRILLIC CAPITAL LETTER SHCHA"), +0x042A => array(0xCA, "CYRILLIC CAPITAL LETTER HARD SIGN"), +0x042B => array(0xCB, "CYRILLIC CAPITAL LETTER YERU"), +0x042C => array(0xCC, "CYRILLIC CAPITAL LETTER SOFT SIGN"), +0x042D => array(0xCD, "CYRILLIC CAPITAL LETTER E"), +0x042E => array(0xCE, "CYRILLIC CAPITAL LETTER YU"), +0x042F => array(0xCF, "CYRILLIC CAPITAL LETTER YA"), +0x0430 => array(0xD0, "CYRILLIC SMALL LETTER A"), +0x0431 => array(0xD1, "CYRILLIC SMALL LETTER BE"), +0x0432 => array(0xD2, "CYRILLIC SMALL LETTER VE"), +0x0433 => array(0xD3, "CYRILLIC SMALL LETTER GHE"), +0x0434 => array(0xD4, "CYRILLIC SMALL LETTER DE"), +0x0435 => array(0xD5, "CYRILLIC SMALL LETTER IE"), +0x0436 => array(0xD6, "CYRILLIC SMALL LETTER ZHE"), +0x0437 => 
array(0xD7, "CYRILLIC SMALL LETTER ZE"), +0x0438 => array(0xD8, "CYRILLIC SMALL LETTER I"), +0x0439 => array(0xD9, "CYRILLIC SMALL LETTER SHORT I"), +0x043A => array(0xDA, "CYRILLIC SMALL LETTER KA"), +0x043B => array(0xDB, "CYRILLIC SMALL LETTER EL"), +0x043C => array(0xDC, "CYRILLIC SMALL LETTER EM"), +0x043D => array(0xDD, "CYRILLIC SMALL LETTER EN"), +0x043E => array(0xDE, "CYRILLIC SMALL LETTER O"), +0x043F => array(0xDF, "CYRILLIC SMALL LETTER PE"), +0x0440 => array(0xE0, "CYRILLIC SMALL LETTER ER"), +0x0441 => array(0xE1, "CYRILLIC SMALL LETTER ES"), +0x0442 => array(0xE2, "CYRILLIC SMALL LETTER TE"), +0x0443 => array(0xE3, "CYRILLIC SMALL LETTER U"), +0x0444 => array(0xE4, "CYRILLIC SMALL LETTER EF"), +0x0445 => array(0xE5, "CYRILLIC SMALL LETTER HA"), +0x0446 => array(0xE6, "CYRILLIC SMALL LETTER TSE"), +0x0447 => array(0xE7, "CYRILLIC SMALL LETTER CHE"), +0x0448 => array(0xE8, "CYRILLIC SMALL LETTER SHA"), +0x0449 => array(0xE9, "CYRILLIC SMALL LETTER SHCHA"), +0x044A => array(0xEA, "CYRILLIC SMALL LETTER HARD SIGN"), +0x044B => array(0xEB, "CYRILLIC SMALL LETTER YERU"), +0x044C => array(0xEC, "CYRILLIC SMALL LETTER SOFT SIGN"), +0x044D => array(0xED, "CYRILLIC SMALL LETTER E"), +0x044E => array(0xEE, "CYRILLIC SMALL LETTER YU"), +0x044F => array(0xEF, "CYRILLIC SMALL LETTER YA"), +0x2116 => array(0xF0, "NUMERO SIGN"), +0x0451 => array(0xF1, "CYRILLIC SMALL LETTER IO"), +0x0452 => array(0xF2, "CYRILLIC SMALL LETTER DJE"), +0x0453 => array(0xF3, "CYRILLIC SMALL LETTER GJE"), +0x0454 => array(0xF4, "CYRILLIC SMALL LETTER UKRAINIAN IE"), +0x0455 => array(0xF5, "CYRILLIC SMALL LETTER DZE"), +0x0456 => array(0xF6, "CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I"), +0x0457 => array(0xF7, "CYRILLIC SMALL LETTER YI"), +0x0458 => array(0xF8, "CYRILLIC SMALL LETTER JE"), +0x0459 => array(0xF9, "CYRILLIC SMALL LETTER LJE"), +0x045A => array(0xFA, "CYRILLIC SMALL LETTER NJE"), +0x045B => array(0xFB, "CYRILLIC SMALL LETTER TSHE"), +0x045C => array(0xFC, "CYRILLIC 
SMALL LETTER KJE"), +0x00A7 => array(0xFD, "SECTION SIGN"), +0x045E => array(0xFE, "CYRILLIC SMALL LETTER SHORT U"), +0x045F => array(0xFF, "CYRILLIC SMALL LETTER DZHE"), +); + +foreach ($arr as $u => $v) { + $ent = sprintf("&#x%X;", $u); + $res = html_entity_decode($ent, ENT_QUOTES, 'ISO-8859-5'); + $d = unpack("H*", $res); + echo sprintf("%s: %s => %s\n", $v[1], $ent, $d[1]); + + $ent = sprintf("&#x%X;", $v[0]); + $res = html_entity_decode($ent, ENT_QUOTES, 'ISO-8859-5'); + if ($res[0] != "&" || $res[1] != "#") + $res = unpack("H*", $res)[1]; + echo sprintf("%s => %s\n\n", $ent, $res); +} +--EXPECT-- +NO-BREAK SPACE:   => a0 +  => a0 + +CYRILLIC CAPITAL LETTER IO: Ё => a1 +¡ => ¡ + +CYRILLIC CAPITAL LETTER DJE: Ђ => a2 +¢ => ¢ + +CYRILLIC CAPITAL LETTER GJE: Ѓ => a3 +£ => £ + +CYRILLIC CAPITAL LETTER UKRAINIAN IE: Є => a4 +¤ => ¤ + +CYRILLIC CAPITAL LETTER DZE: Ѕ => a5 +¥ => ¥ + +CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I: І => a6 +¦ => ¦ + +CYRILLIC CAPITAL LETTER YI: Ї => a7 +§ => fd + +CYRILLIC CAPITAL LETTER JE: Ј => a8 +¨ => ¨ + +CYRILLIC CAPITAL LETTER LJE: Љ => a9 +© => © + +CYRILLIC CAPITAL LETTER NJE: Њ => aa +ª => ª + +CYRILLIC CAPITAL LETTER TSHE: Ћ => ab +« => « + +CYRILLIC CAPITAL LETTER KJE: Ќ => ac +¬ => ¬ + +SOFT HYPHEN: ­ => ad +­ => ad + +CYRILLIC CAPITAL LETTER SHORT U: Ў => ae +® => ® + +CYRILLIC CAPITAL LETTER DZHE: Џ => af +¯ => ¯ + +CYRILLIC CAPITAL LETTER A: А => b0 +° => ° + +CYRILLIC CAPITAL LETTER BE: Б => b1 +± => ± + +CYRILLIC CAPITAL LETTER VE: В => b2 +² => ² + +CYRILLIC CAPITAL LETTER GHE: Г => b3 +³ => ³ + +CYRILLIC CAPITAL LETTER DE: Д => b4 +´ => ´ + +CYRILLIC CAPITAL LETTER IE: Е => b5 +µ => µ + +CYRILLIC CAPITAL LETTER ZHE: Ж => b6 +¶ => ¶ + +CYRILLIC CAPITAL LETTER ZE: З => b7 +· => · + +CYRILLIC CAPITAL LETTER I: И => b8 +¸ => ¸ + +CYRILLIC CAPITAL LETTER SHORT I: Й => b9 +¹ => ¹ + +CYRILLIC CAPITAL LETTER KA: К => ba +º => º + +CYRILLIC CAPITAL LETTER EL: Л => bb +» => » + +CYRILLIC CAPITAL LETTER EM: М => bc +¼ => ¼ 
+ +CYRILLIC CAPITAL LETTER EN: Н => bd +½ => ½ + +CYRILLIC CAPITAL LETTER O: О => be +¾ => ¾ + +CYRILLIC CAPITAL LETTER PE: П => bf +¿ => ¿ + +CYRILLIC CAPITAL LETTER ER: Р => c0 +À => À + +CYRILLIC CAPITAL LETTER ES: С => c1 +Á => Á + +CYRILLIC CAPITAL LETTER TE: Т => c2 + =>  + +CYRILLIC CAPITAL LETTER U: У => c3 +à => à + +CYRILLIC CAPITAL LETTER EF: Ф => c4 +Ä => Ä + +CYRILLIC CAPITAL LETTER HA: Х => c5 +Å => Å + +CYRILLIC CAPITAL LETTER TSE: Ц => c6 +Æ => Æ + +CYRILLIC CAPITAL LETTER CHE: Ч => c7 +Ç => Ç + +CYRILLIC CAPITAL LETTER SHA: Ш => c8 +È => È + +CYRILLIC CAPITAL LETTER SHCHA: Щ => c9 +É => É + +CYRILLIC CAPITAL LETTER HARD SIGN: Ъ => ca +Ê => Ê + +CYRILLIC CAPITAL LETTER YERU: Ы => cb +Ë => Ë + +CYRILLIC CAPITAL LETTER SOFT SIGN: Ь => cc +Ì => Ì + +CYRILLIC CAPITAL LETTER E: Э => cd +Í => Í + +CYRILLIC CAPITAL LETTER YU: Ю => ce +Î => Î + +CYRILLIC CAPITAL LETTER YA: Я => cf +Ï => Ï + +CYRILLIC SMALL LETTER A: а => d0 +Ð => Ð + +CYRILLIC SMALL LETTER BE: б => d1 +Ñ => Ñ + +CYRILLIC SMALL LETTER VE: в => d2 +Ò => Ò + +CYRILLIC SMALL LETTER GHE: г => d3 +Ó => Ó + +CYRILLIC SMALL LETTER DE: д => d4 +Ô => Ô + +CYRILLIC SMALL LETTER IE: е => d5 +Õ => Õ + +CYRILLIC SMALL LETTER ZHE: ж => d6 +Ö => Ö + +CYRILLIC SMALL LETTER ZE: з => d7 +× => × + +CYRILLIC SMALL LETTER I: и => d8 +Ø => Ø + +CYRILLIC SMALL LETTER SHORT I: й => d9 +Ù => Ù + +CYRILLIC SMALL LETTER KA: к => da +Ú => Ú + +CYRILLIC SMALL LETTER EL: л => db +Û => Û + +CYRILLIC SMALL LETTER EM: м => dc +Ü => Ü + +CYRILLIC SMALL LETTER EN: н => dd +Ý => Ý + +CYRILLIC SMALL LETTER O: о => de +Þ => Þ + +CYRILLIC SMALL LETTER PE: п => df +ß => ß + +CYRILLIC SMALL LETTER ER: р => e0 +à => à + +CYRILLIC SMALL LETTER ES: с => e1 +á => á + +CYRILLIC SMALL LETTER TE: т => e2 +â => â + +CYRILLIC SMALL LETTER U: у => e3 +ã => ã + +CYRILLIC SMALL LETTER EF: ф => e4 +ä => ä + +CYRILLIC SMALL LETTER HA: х => e5 +å => å + +CYRILLIC SMALL LETTER TSE: ц => e6 +æ => æ + +CYRILLIC SMALL LETTER CHE: ч => e7 +ç => ç + 
+CYRILLIC SMALL LETTER SHA: ш => e8 +è => è + +CYRILLIC SMALL LETTER SHCHA: щ => e9 +é => é + +CYRILLIC SMALL LETTER HARD SIGN: ъ => ea +ê => ê + +CYRILLIC SMALL LETTER YERU: ы => eb +ë => ë + +CYRILLIC SMALL LETTER SOFT SIGN: ь => ec +ì => ì + +CYRILLIC SMALL LETTER E: э => ed +í => í + +CYRILLIC SMALL LETTER YU: ю => ee +î => î + +CYRILLIC SMALL LETTER YA: я => ef +ï => ï + +NUMERO SIGN: № => f0 +ð => ð + +CYRILLIC SMALL LETTER IO: ё => 2623783435313b +ñ => ñ + +CYRILLIC SMALL LETTER DJE: ђ => 2623783435323b +ò => ò + +CYRILLIC SMALL LETTER GJE: ѓ => 2623783435333b +ó => ó + +CYRILLIC SMALL LETTER UKRAINIAN IE: є => 2623783435343b +ô => ô + +CYRILLIC SMALL LETTER DZE: ѕ => 2623783435353b +õ => õ + +CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I: і => 2623783435363b +ö => ö + +CYRILLIC SMALL LETTER YI: ї => 2623783435373b +÷ => ÷ + +CYRILLIC SMALL LETTER JE: ј => 2623783435383b +ø => ø + +CYRILLIC SMALL LETTER LJE: љ => 2623783435393b +ù => ù + +CYRILLIC SMALL LETTER NJE: њ => 2623783435413b +ú => ú + +CYRILLIC SMALL LETTER TSHE: ћ => 2623783435423b +û => û + +CYRILLIC SMALL LETTER KJE: ќ => 2623783435433b +ü => ü + +SECTION SIGN: § => fd +ý => ý + +CYRILLIC SMALL LETTER SHORT U: ў => 2623783435453b +þ => þ + +CYRILLIC SMALL LETTER DZHE: џ => 2623783435463b +ÿ => ÿ + + diff --git a/ext/standard/tests/strings/html_entity_decode_koi8-r.phpt b/ext/standard/tests/strings/html_entity_decode_koi8-r.phpt new file mode 100644 index 0000000000..cb7fc7d1d8 --- /dev/null +++ b/ext/standard/tests/strings/html_entity_decode_koi8-r.phpt @@ -0,0 +1,533 @@ +--TEST-- +Translation of HTML entities for encoding KOI8-R +--FILE-- + array(0x80, "BOX DRAWINGS LIGHT HORIZONTAL"), +0x2502 => array(0x81, "BOX DRAWINGS LIGHT VERTICAL"), +0x250C => array(0x82, "BOX DRAWINGS LIGHT DOWN AND RIGHT"), +0x2510 => array(0x83, "BOX DRAWINGS LIGHT DOWN AND LEFT"), +0x2514 => array(0x84, "BOX DRAWINGS LIGHT UP AND RIGHT"), +0x2518 => array(0x85, "BOX DRAWINGS LIGHT UP AND LEFT"), +0x251C => 
array(0x86, "BOX DRAWINGS LIGHT VERTICAL AND RIGHT"), +0x2524 => array(0x87, "BOX DRAWINGS LIGHT VERTICAL AND LEFT"), +0x252C => array(0x88, "BOX DRAWINGS LIGHT DOWN AND HORIZONTAL"), +0x2534 => array(0x89, "BOX DRAWINGS LIGHT UP AND HORIZONTAL"), +0x253C => array(0x8A, "BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL"), +0x2580 => array(0x8B, "UPPER HALF BLOCK"), +0x2584 => array(0x8C, "LOWER HALF BLOCK"), +0x2588 => array(0x8D, "FULL BLOCK"), +0x258C => array(0x8E, "LEFT HALF BLOCK"), +0x2590 => array(0x8F, "RIGHT HALF BLOCK"), +0x2591 => array(0x90, "LIGHT SHADE"), +0x2592 => array(0x91, "MEDIUM SHADE"), +0x2593 => array(0x92, "DARK SHADE"), +0x2320 => array(0x93, "TOP HALF INTEGRAL"), +0x25A0 => array(0x94, "BLACK SQUARE"), +0x2219 => array(0x95, "BULLET OPERATOR"), +0x221A => array(0x96, "SQUARE ROOT"), +0x2248 => array(0x97, "ALMOST EQUAL TO"), +0x2264 => array(0x98, "LESS-THAN OR EQUAL TO"), +0x2265 => array(0x99, "GREATER-THAN OR EQUAL TO"), +0x00A0 => array(0x9A, "NO-BREAK SPACE"), +0x2321 => array(0x9B, "BOTTOM HALF INTEGRAL"), +0x00B0 => array(0x9C, "DEGREE SIGN"), +0x00B2 => array(0x9D, "SUPERSCRIPT TWO"), +0x00B7 => array(0x9E, "MIDDLE DOT"), +0x00F7 => array(0x9F, "DIVISION SIGN"), +0x2550 => array(0xA0, "BOX DRAWINGS DOUBLE HORIZONTAL"), +0x2551 => array(0xA1, "BOX DRAWINGS DOUBLE VERTICAL"), +0x2552 => array(0xA2, "BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE"), +0x0451 => array(0xA3, "CYRILLIC SMALL LETTER IO"), +0x2553 => array(0xA4, "BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE"), +0x2554 => array(0xA5, "BOX DRAWINGS DOUBLE DOWN AND RIGHT"), +0x2555 => array(0xA6, "BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE"), +0x2556 => array(0xA7, "BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE"), +0x2557 => array(0xA8, "BOX DRAWINGS DOUBLE DOWN AND LEFT"), +0x2558 => array(0xA9, "BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE"), +0x2559 => array(0xAA, "BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE"), +0x255A => array(0xAB, "BOX DRAWINGS DOUBLE UP AND RIGHT"), +0x255B => array(0xAC, "BOX 
DRAWINGS UP SINGLE AND LEFT DOUBLE"), +0x255C => array(0xAD, "BOX DRAWINGS UP DOUBLE AND LEFT SINGLE"), +0x255D => array(0xAE, "BOX DRAWINGS DOUBLE UP AND LEFT"), +0x255E => array(0xAF, "BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE"), +0x255F => array(0xB0, "BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE"), +0x2560 => array(0xB1, "BOX DRAWINGS DOUBLE VERTICAL AND RIGHT"), +0x2561 => array(0xB2, "BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE"), +0x0401 => array(0xB3, "CYRILLIC CAPITAL LETTER IO"), +0x2562 => array(0xB4, "BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE"), +0x2563 => array(0xB5, "BOX DRAWINGS DOUBLE VERTICAL AND LEFT"), +0x2564 => array(0xB6, "BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE"), +0x2565 => array(0xB7, "BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE"), +0x2566 => array(0xB8, "BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL"), +0x2567 => array(0xB9, "BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE"), +0x2568 => array(0xBA, "BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE"), +0x2569 => array(0xBB, "BOX DRAWINGS DOUBLE UP AND HORIZONTAL"), +0x256A => array(0xBC, "BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE"), +0x256B => array(0xBD, "BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE"), +0x256C => array(0xBE, "BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL"), +0x00A9 => array(0xBF, "COPYRIGHT SIGN"), +0x044E => array(0xC0, "CYRILLIC SMALL LETTER YU"), +0x0430 => array(0xC1, "CYRILLIC SMALL LETTER A"), +0x0431 => array(0xC2, "CYRILLIC SMALL LETTER BE"), +0x0446 => array(0xC3, "CYRILLIC SMALL LETTER TSE"), +0x0434 => array(0xC4, "CYRILLIC SMALL LETTER DE"), +0x0435 => array(0xC5, "CYRILLIC SMALL LETTER IE"), +0x0444 => array(0xC6, "CYRILLIC SMALL LETTER EF"), +0x0433 => array(0xC7, "CYRILLIC SMALL LETTER GHE"), +0x0445 => array(0xC8, "CYRILLIC SMALL LETTER HA"), +0x0438 => array(0xC9, "CYRILLIC SMALL LETTER I"), +0x0439 => array(0xCA, "CYRILLIC SMALL LETTER SHORT I"), +0x043A => array(0xCB, "CYRILLIC SMALL LETTER KA"), +0x043B => array(0xCC, "CYRILLIC SMALL 
LETTER EL"), +0x043C => array(0xCD, "CYRILLIC SMALL LETTER EM"), +0x043D => array(0xCE, "CYRILLIC SMALL LETTER EN"), +0x043E => array(0xCF, "CYRILLIC SMALL LETTER O"), +0x043F => array(0xD0, "CYRILLIC SMALL LETTER PE"), +0x044F => array(0xD1, "CYRILLIC SMALL LETTER YA"), +0x0440 => array(0xD2, "CYRILLIC SMALL LETTER ER"), +0x0441 => array(0xD3, "CYRILLIC SMALL LETTER ES"), +0x0442 => array(0xD4, "CYRILLIC SMALL LETTER TE"), +0x0443 => array(0xD5, "CYRILLIC SMALL LETTER U"), +0x0436 => array(0xD6, "CYRILLIC SMALL LETTER ZHE"), +0x0432 => array(0xD7, "CYRILLIC SMALL LETTER VE"), +0x044C => array(0xD8, "CYRILLIC SMALL LETTER SOFT SIGN"), +0x044B => array(0xD9, "CYRILLIC SMALL LETTER YERU"), +0x0437 => array(0xDA, "CYRILLIC SMALL LETTER ZE"), +0x0448 => array(0xDB, "CYRILLIC SMALL LETTER SHA"), +0x044D => array(0xDC, "CYRILLIC SMALL LETTER E"), +0x0449 => array(0xDD, "CYRILLIC SMALL LETTER SHCHA"), +0x0447 => array(0xDE, "CYRILLIC SMALL LETTER CHE"), +0x044A => array(0xDF, "CYRILLIC SMALL LETTER HARD SIGN"), +0x042E => array(0xE0, "CYRILLIC CAPITAL LETTER YU"), +0x0410 => array(0xE1, "CYRILLIC CAPITAL LETTER A"), +0x0411 => array(0xE2, "CYRILLIC CAPITAL LETTER BE"), +0x0426 => array(0xE3, "CYRILLIC CAPITAL LETTER TSE"), +0x0414 => array(0xE4, "CYRILLIC CAPITAL LETTER DE"), +0x0415 => array(0xE5, "CYRILLIC CAPITAL LETTER IE"), +0x0424 => array(0xE6, "CYRILLIC CAPITAL LETTER EF"), +0x0413 => array(0xE7, "CYRILLIC CAPITAL LETTER GHE"), +0x0425 => array(0xE8, "CYRILLIC CAPITAL LETTER HA"), +0x0418 => array(0xE9, "CYRILLIC CAPITAL LETTER I"), +0x0419 => array(0xEA, "CYRILLIC CAPITAL LETTER SHORT I"), +0x041A => array(0xEB, "CYRILLIC CAPITAL LETTER KA"), +0x041B => array(0xEC, "CYRILLIC CAPITAL LETTER EL"), +0x041C => array(0xED, "CYRILLIC CAPITAL LETTER EM"), +0x041D => array(0xEE, "CYRILLIC CAPITAL LETTER EN"), +0x041E => array(0xEF, "CYRILLIC CAPITAL LETTER O"), +0x041F => array(0xF0, "CYRILLIC CAPITAL LETTER PE"), +0x042F => array(0xF1, "CYRILLIC CAPITAL LETTER YA"), 
+0x0420 => array(0xF2, "CYRILLIC CAPITAL LETTER ER"), +0x0421 => array(0xF3, "CYRILLIC CAPITAL LETTER ES"), +0x0422 => array(0xF4, "CYRILLIC CAPITAL LETTER TE"), +0x0423 => array(0xF5, "CYRILLIC CAPITAL LETTER U"), +0x0416 => array(0xF6, "CYRILLIC CAPITAL LETTER ZHE"), +0x0412 => array(0xF7, "CYRILLIC CAPITAL LETTER VE"), +0x042C => array(0xF8, "CYRILLIC CAPITAL LETTER SOFT SIGN"), +0x042B => array(0xF9, "CYRILLIC CAPITAL LETTER YERU"), +0x0417 => array(0xFA, "CYRILLIC CAPITAL LETTER ZE"), +0x0428 => array(0xFB, "CYRILLIC CAPITAL LETTER SHA"), +0x042D => array(0xFC, "CYRILLIC CAPITAL LETTER E"), +0x0429 => array(0xFD, "CYRILLIC CAPITAL LETTER SHCHA"), +0x0427 => array(0xFE, "CYRILLIC CAPITAL LETTER CHE"), +0x042A => array(0xFF, "CYRILLIC CAPITAL LETTER HARD SIGN"), +); + +foreach ($arr as $u => $v) { + $ent = sprintf("&#x%X;", $u); + $res = html_entity_decode($ent, ENT_QUOTES, 'KOI8-R'); + $d = unpack("H*", $res); + echo sprintf("%s: %s => %s\n", $v[1], $ent, $d[1]); + + $ent = sprintf("&#x%X;", $v[0]); + $res = html_entity_decode($ent, ENT_QUOTES, 'KOI8-R'); + if ($res[0] != "&" || $res[1] != "#") + $res = unpack("H*", $res)[1]; + echo sprintf("%s => %s\n\n", $ent, $res); +} +--EXPECT-- +BOX DRAWINGS LIGHT HORIZONTAL: ─ => 80 +€ => € + +BOX DRAWINGS LIGHT VERTICAL: │ => 81 + =>  + +BOX DRAWINGS LIGHT DOWN AND RIGHT: ┌ => 82 +‚ => ‚ + +BOX DRAWINGS LIGHT DOWN AND LEFT: ┐ => 83 +ƒ => ƒ + +BOX DRAWINGS LIGHT UP AND RIGHT: └ => 84 +„ => „ + +BOX DRAWINGS LIGHT UP AND LEFT: ┘ => 85 +… => … + +BOX DRAWINGS LIGHT VERTICAL AND RIGHT: ├ => 86 +† => † + +BOX DRAWINGS LIGHT VERTICAL AND LEFT: ┤ => 87 +‡ => ‡ + +BOX DRAWINGS LIGHT DOWN AND HORIZONTAL: ┬ => 88 +ˆ => ˆ + +BOX DRAWINGS LIGHT UP AND HORIZONTAL: ┴ => 89 +‰ => ‰ + +BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL: ┼ => 8a +Š => Š + +UPPER HALF BLOCK: ▀ => 8b +‹ => ‹ + +LOWER HALF BLOCK: ▄ => 8c +Œ => Œ + +FULL BLOCK: █ => 8d + =>  + +LEFT HALF BLOCK: ▌ => 8e +Ž => Ž + +RIGHT HALF BLOCK: ▐ => 8f + =>  + +LIGHT 
SHADE: ░ => 90 + =>  + +MEDIUM SHADE: ▒ => 91 +‘ => ‘ + +DARK SHADE: ▓ => 92 +’ => ’ + +TOP HALF INTEGRAL: ⌠ => 93 +“ => “ + +BLACK SQUARE: ■ => 94 +” => ” + +BULLET OPERATOR: ∙ => 95 +• => • + +SQUARE ROOT: √ => 96 +– => – + +ALMOST EQUAL TO: ≈ => 97 +— => — + +LESS-THAN OR EQUAL TO: ≤ => 98 +˜ => ˜ + +GREATER-THAN OR EQUAL TO: ≥ => 99 +™ => ™ + +NO-BREAK SPACE:   => 9a +š => š + +BOTTOM HALF INTEGRAL: ⌡ => 9b +› => › + +DEGREE SIGN: ° => 9c +œ => œ + +SUPERSCRIPT TWO: ² => 9d + =>  + +MIDDLE DOT: · => 9e +ž => ž + +DIVISION SIGN: ÷ => 9f +Ÿ => Ÿ + +BOX DRAWINGS DOUBLE HORIZONTAL: ═ => a0 +  => 9a + +BOX DRAWINGS DOUBLE VERTICAL: ║ => a1 +¡ => ¡ + +BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE: ╒ => a2 +¢ => ¢ + +CYRILLIC SMALL LETTER IO: ё => a3 +£ => £ + +BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE: ╓ => a4 +¤ => ¤ + +BOX DRAWINGS DOUBLE DOWN AND RIGHT: ╔ => a5 +¥ => ¥ + +BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE: ╕ => a6 +¦ => ¦ + +BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE: ╖ => a7 +§ => § + +BOX DRAWINGS DOUBLE DOWN AND LEFT: ╗ => a8 +¨ => ¨ + +BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE: ╘ => a9 +© => bf + +BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE: ╙ => aa +ª => ª + +BOX DRAWINGS DOUBLE UP AND RIGHT: ╚ => ab +« => « + +BOX DRAWINGS UP SINGLE AND LEFT DOUBLE: ╛ => ac +¬ => ¬ + +BOX DRAWINGS UP DOUBLE AND LEFT SINGLE: ╜ => ad +­ => ­ + +BOX DRAWINGS DOUBLE UP AND LEFT: ╝ => ae +® => ® + +BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE: ╞ => af +¯ => ¯ + +BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE: ╟ => b0 +° => 9c + +BOX DRAWINGS DOUBLE VERTICAL AND RIGHT: ╠ => b1 +± => ± + +BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE: ╡ => b2 +² => 9d + +CYRILLIC CAPITAL LETTER IO: Ё => b3 +³ => ³ + +BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE: ╢ => b4 +´ => ´ + +BOX DRAWINGS DOUBLE VERTICAL AND LEFT: ╣ => b5 +µ => µ + +BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE: ╤ => b6 +¶ => ¶ + +BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE: ╥ => b7 +· => 9e + +BOX DRAWINGS DOUBLE DOWN AND 
HORIZONTAL: ╦ => b8 +¸ => ¸ + +BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE: ╧ => b9 +¹ => ¹ + +BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE: ╨ => ba +º => º + +BOX DRAWINGS DOUBLE UP AND HORIZONTAL: ╩ => bb +» => » + +BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE: ╪ => bc +¼ => ¼ + +BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE: ╫ => bd +½ => ½ + +BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL: ╬ => be +¾ => ¾ + +COPYRIGHT SIGN: © => bf +¿ => ¿ + +CYRILLIC SMALL LETTER YU: ю => c0 +À => À + +CYRILLIC SMALL LETTER A: а => c1 +Á => Á + +CYRILLIC SMALL LETTER BE: б => c2 + =>  + +CYRILLIC SMALL LETTER TSE: ц => c3 +à => à + +CYRILLIC SMALL LETTER DE: д => c4 +Ä => Ä + +CYRILLIC SMALL LETTER IE: е => c5 +Å => Å + +CYRILLIC SMALL LETTER EF: ф => c6 +Æ => Æ + +CYRILLIC SMALL LETTER GHE: г => c7 +Ç => Ç + +CYRILLIC SMALL LETTER HA: х => c8 +È => È + +CYRILLIC SMALL LETTER I: и => c9 +É => É + +CYRILLIC SMALL LETTER SHORT I: й => ca +Ê => Ê + +CYRILLIC SMALL LETTER KA: к => cb +Ë => Ë + +CYRILLIC SMALL LETTER EL: л => cc +Ì => Ì + +CYRILLIC SMALL LETTER EM: м => cd +Í => Í + +CYRILLIC SMALL LETTER EN: н => ce +Î => Î + +CYRILLIC SMALL LETTER O: о => cf +Ï => Ï + +CYRILLIC SMALL LETTER PE: п => d0 +Ð => Ð + +CYRILLIC SMALL LETTER YA: я => d1 +Ñ => Ñ + +CYRILLIC SMALL LETTER ER: р => d2 +Ò => Ò + +CYRILLIC SMALL LETTER ES: с => d3 +Ó => Ó + +CYRILLIC SMALL LETTER TE: т => d4 +Ô => Ô + +CYRILLIC SMALL LETTER U: у => d5 +Õ => Õ + +CYRILLIC SMALL LETTER ZHE: ж => d6 +Ö => Ö + +CYRILLIC SMALL LETTER VE: в => d7 +× => × + +CYRILLIC SMALL LETTER SOFT SIGN: ь => d8 +Ø => Ø + +CYRILLIC SMALL LETTER YERU: ы => d9 +Ù => Ù + +CYRILLIC SMALL LETTER ZE: з => da +Ú => Ú + +CYRILLIC SMALL LETTER SHA: ш => db +Û => Û + +CYRILLIC SMALL LETTER E: э => dc +Ü => Ü + +CYRILLIC SMALL LETTER SHCHA: щ => dd +Ý => Ý + +CYRILLIC SMALL LETTER CHE: ч => de +Þ => Þ + +CYRILLIC SMALL LETTER HARD SIGN: ъ => df +ß => ß + +CYRILLIC CAPITAL LETTER YU: Ю => e0 +à => à + +CYRILLIC CAPITAL LETTER A: А 
=> e1 +á => á + +CYRILLIC CAPITAL LETTER BE: Б => e2 +â => â + +CYRILLIC CAPITAL LETTER TSE: Ц => e3 +ã => ã + +CYRILLIC CAPITAL LETTER DE: Д => e4 +ä => ä + +CYRILLIC CAPITAL LETTER IE: Е => e5 +å => å + +CYRILLIC CAPITAL LETTER EF: Ф => e6 +æ => æ + +CYRILLIC CAPITAL LETTER GHE: Г => e7 +ç => ç + +CYRILLIC CAPITAL LETTER HA: Х => e8 +è => è + +CYRILLIC CAPITAL LETTER I: И => e9 +é => é + +CYRILLIC CAPITAL LETTER SHORT I: Й => ea +ê => ê + +CYRILLIC CAPITAL LETTER KA: К => eb +ë => ë + +CYRILLIC CAPITAL LETTER EL: Л => ec +ì => ì + +CYRILLIC CAPITAL LETTER EM: М => ed +í => í + +CYRILLIC CAPITAL LETTER EN: Н => ee +î => î + +CYRILLIC CAPITAL LETTER O: О => ef +ï => ï + +CYRILLIC CAPITAL LETTER PE: П => f0 +ð => ð + +CYRILLIC CAPITAL LETTER YA: Я => f1 +ñ => ñ + +CYRILLIC CAPITAL LETTER ER: Р => f2 +ò => ò + +CYRILLIC CAPITAL LETTER ES: С => f3 +ó => ó + +CYRILLIC CAPITAL LETTER TE: Т => f4 +ô => ô + +CYRILLIC CAPITAL LETTER U: У => f5 +õ => õ + +CYRILLIC CAPITAL LETTER ZHE: Ж => f6 +ö => ö + +CYRILLIC CAPITAL LETTER VE: В => f7 +÷ => 9f + +CYRILLIC CAPITAL LETTER SOFT SIGN: Ь => f8 +ø => ø + +CYRILLIC CAPITAL LETTER YERU: Ы => f9 +ù => ù + +CYRILLIC CAPITAL LETTER ZE: З => fa +ú => ú + +CYRILLIC CAPITAL LETTER SHA: Ш => fb +û => û + +CYRILLIC CAPITAL LETTER E: Э => fc +ü => ü + +CYRILLIC CAPITAL LETTER SHCHA: Щ => fd +ý => ý + +CYRILLIC CAPITAL LETTER CHE: Ч => fe +þ => þ + +CYRILLIC CAPITAL LETTER HARD SIGN: Ъ => ff +ÿ => ÿ + + diff --git a/ext/standard/tests/strings/html_entity_decode_macroman.phpt b/ext/standard/tests/strings/html_entity_decode_macroman.phpt new file mode 100644 index 0000000000..4691bcf1a7 --- /dev/null +++ b/ext/standard/tests/strings/html_entity_decode_macroman.phpt @@ -0,0 +1,540 @@ +--TEST-- +Translation of HTML entities for encoding MacRoman +--FILE-- + array(0x80, "LATIN CAPITAL LETTER A WITH DIAERESIS"), +0x00C5 => array(0x81, "LATIN CAPITAL LETTER A WITH RING ABOVE"), +0x00C7 => array(0x82, "LATIN CAPITAL LETTER C WITH CEDILLA"), 
+0x00C9 => array(0x83, "LATIN CAPITAL LETTER E WITH ACUTE"), +0x00D1 => array(0x84, "LATIN CAPITAL LETTER N WITH TILDE"), +0x00D6 => array(0x85, "LATIN CAPITAL LETTER O WITH DIAERESIS"), +0x00DC => array(0x86, "LATIN CAPITAL LETTER U WITH DIAERESIS"), +0x00E1 => array(0x87, "LATIN SMALL LETTER A WITH ACUTE"), +0x00E0 => array(0x88, "LATIN SMALL LETTER A WITH GRAVE"), +0x00E2 => array(0x89, "LATIN SMALL LETTER A WITH CIRCUMFLEX"), +0x00E4 => array(0x8A, "LATIN SMALL LETTER A WITH DIAERESIS"), +0x00E3 => array(0x8B, "LATIN SMALL LETTER A WITH TILDE"), +0x00E5 => array(0x8C, "LATIN SMALL LETTER A WITH RING ABOVE"), +0x00E7 => array(0x8D, "LATIN SMALL LETTER C WITH CEDILLA"), +0x00E9 => array(0x8E, "LATIN SMALL LETTER E WITH ACUTE"), +0x00E8 => array(0x8F, "LATIN SMALL LETTER E WITH GRAVE"), +0x00EA => array(0x90, "LATIN SMALL LETTER E WITH CIRCUMFLEX"), +0x00EB => array(0x91, "LATIN SMALL LETTER E WITH DIAERESIS"), +0x00ED => array(0x92, "LATIN SMALL LETTER I WITH ACUTE"), +0x00EC => array(0x93, "LATIN SMALL LETTER I WITH GRAVE"), +0x00EE => array(0x94, "LATIN SMALL LETTER I WITH CIRCUMFLEX"), +0x00EF => array(0x95, "LATIN SMALL LETTER I WITH DIAERESIS"), +0x00F1 => array(0x96, "LATIN SMALL LETTER N WITH TILDE"), +0x00F3 => array(0x97, "LATIN SMALL LETTER O WITH ACUTE"), +0x00F2 => array(0x98, "LATIN SMALL LETTER O WITH GRAVE"), +0x00F4 => array(0x99, "LATIN SMALL LETTER O WITH CIRCUMFLEX"), +0x00F6 => array(0x9A, "LATIN SMALL LETTER O WITH DIAERESIS"), +0x00F5 => array(0x9B, "LATIN SMALL LETTER O WITH TILDE"), +0x00FA => array(0x9C, "LATIN SMALL LETTER U WITH ACUTE"), +0x00F9 => array(0x9D, "LATIN SMALL LETTER U WITH GRAVE"), +0x00FB => array(0x9E, "LATIN SMALL LETTER U WITH CIRCUMFLEX"), +0x00FC => array(0x9F, "LATIN SMALL LETTER U WITH DIAERESIS"), +0x2020 => array(0xA0, "DAGGER"), +0x00B0 => array(0xA1, "DEGREE SIGN"), +0x00A2 => array(0xA2, "CENT SIGN"), +0x00A3 => array(0xA3, "POUND SIGN"), +0x00A7 => array(0xA4, "SECTION SIGN"), +0x2022 => array(0xA5, 
"BULLET"), +0x00B6 => array(0xA6, "PILCROW SIGN"), +0x00DF => array(0xA7, "LATIN SMALL LETTER SHARP S"), +0x00AE => array(0xA8, "REGISTERED SIGN"), +0x00A9 => array(0xA9, "COPYRIGHT SIGN"), +0x2122 => array(0xAA, "TRADE MARK SIGN"), +0x00B4 => array(0xAB, "ACUTE ACCENT"), +0x00A8 => array(0xAC, "DIAERESIS"), +0x2260 => array(0xAD, "NOT EQUAL TO"), +0x00C6 => array(0xAE, "LATIN CAPITAL LETTER AE"), +0x00D8 => array(0xAF, "LATIN CAPITAL LETTER O WITH STROKE"), +0x221E => array(0xB0, "INFINITY"), +0x00B1 => array(0xB1, "PLUS-MINUS SIGN"), +0x2264 => array(0xB2, "LESS-THAN OR EQUAL TO"), +0x2265 => array(0xB3, "GREATER-THAN OR EQUAL TO"), +0x00A5 => array(0xB4, "YEN SIGN"), +0x00B5 => array(0xB5, "MICRO SIGN"), +0x2202 => array(0xB6, "PARTIAL DIFFERENTIAL"), +0x2211 => array(0xB7, "N-ARY SUMMATION"), +0x220F => array(0xB8, "N-ARY PRODUCT"), +0x03C0 => array(0xB9, "GREEK SMALL LETTER PI"), +0x222B => array(0xBA, "INTEGRAL"), +0x00AA => array(0xBB, "FEMININE ORDINAL INDICATOR"), +0x00BA => array(0xBC, "MASCULINE ORDINAL INDICATOR"), +0x03A9 => array(0xBD, "GREEK CAPITAL LETTER OMEGA"), +0x00E6 => array(0xBE, "LATIN SMALL LETTER AE"), +0x00F8 => array(0xBF, "LATIN SMALL LETTER O WITH STROKE"), +0x00BF => array(0xC0, "INVERTED QUESTION MARK"), +0x00A1 => array(0xC1, "INVERTED EXCLAMATION MARK"), +0x00AC => array(0xC2, "NOT SIGN"), +0x221A => array(0xC3, "SQUARE ROOT"), +0x0192 => array(0xC4, "LATIN SMALL LETTER F WITH HOOK"), +0x2248 => array(0xC5, "ALMOST EQUAL TO"), +0x2206 => array(0xC6, "INCREMENT"), +0x00AB => array(0xC7, "LEFT-POINTING DOUBLE ANGLE QUOTATION MARK"), +0x00BB => array(0xC8, "RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK"), +0x2026 => array(0xC9, "HORIZONTAL ELLIPSIS"), +0x00A0 => array(0xCA, "NO-BREAK SPACE"), +0x00C0 => array(0xCB, "LATIN CAPITAL LETTER A WITH GRAVE"), +0x00C3 => array(0xCC, "LATIN CAPITAL LETTER A WITH TILDE"), +0x00D5 => array(0xCD, "LATIN CAPITAL LETTER O WITH TILDE"), +0x0152 => array(0xCE, "LATIN CAPITAL LIGATURE OE"), +0x0153 => 
array(0xCF, "LATIN SMALL LIGATURE OE"), +0x2013 => array(0xD0, "EN DASH"), +0x2014 => array(0xD1, "EM DASH"), +0x201C => array(0xD2, "LEFT DOUBLE QUOTATION MARK"), +0x201D => array(0xD3, "RIGHT DOUBLE QUOTATION MARK"), +0x2018 => array(0xD4, "LEFT SINGLE QUOTATION MARK"), +0x2019 => array(0xD5, "RIGHT SINGLE QUOTATION MARK"), +0x00F7 => array(0xD6, "DIVISION SIGN"), +0x25CA => array(0xD7, "LOZENGE"), +0x00FF => array(0xD8, "LATIN SMALL LETTER Y WITH DIAERESIS"), +0x0178 => array(0xD9, "LATIN CAPITAL LETTER Y WITH DIAERESIS"), +0x2044 => array(0xDA, "FRACTION SLASH"), +0x20AC => array(0xDB, "EURO SIGN"), +0x2039 => array(0xDC, "SINGLE LEFT-POINTING ANGLE QUOTATION MARK"), +0x203A => array(0xDD, "SINGLE RIGHT-POINTING ANGLE QUOTATION MARK"), +0xFB01 => array(0xDE, "LATIN SMALL LIGATURE FI"), +0xFB02 => array(0xDF, "LATIN SMALL LIGATURE FL"), +0x2021 => array(0xE0, "DOUBLE DAGGER"), +0x00B7 => array(0xE1, "MIDDLE DOT"), +0x201A => array(0xE2, "SINGLE LOW-9 QUOTATION MARK"), +0x201E => array(0xE3, "DOUBLE LOW-9 QUOTATION MARK"), +0x2030 => array(0xE4, "PER MILLE SIGN"), +0x00C2 => array(0xE5, "LATIN CAPITAL LETTER A WITH CIRCUMFLEX"), +0x00CA => array(0xE6, "LATIN CAPITAL LETTER E WITH CIRCUMFLEX"), +0x00C1 => array(0xE7, "LATIN CAPITAL LETTER A WITH ACUTE"), +0x00CB => array(0xE8, "LATIN CAPITAL LETTER E WITH DIAERESIS"), +0x00C8 => array(0xE9, "LATIN CAPITAL LETTER E WITH GRAVE"), +0x00CD => array(0xEA, "LATIN CAPITAL LETTER I WITH ACUTE"), +0x00CE => array(0xEB, "LATIN CAPITAL LETTER I WITH CIRCUMFLEX"), +0x00CF => array(0xEC, "LATIN CAPITAL LETTER I WITH DIAERESIS"), +0x00CC => array(0xED, "LATIN CAPITAL LETTER I WITH GRAVE"), +0x00D3 => array(0xEE, "LATIN CAPITAL LETTER O WITH ACUTE"), +0x00D4 => array(0xEF, "LATIN CAPITAL LETTER O WITH CIRCUMFLEX"), +0xF8FF => array(0xF0, "Apple logo"), +0x00D2 => array(0xF1, "LATIN CAPITAL LETTER O WITH GRAVE"), +0x00DA => array(0xF2, "LATIN CAPITAL LETTER U WITH ACUTE"), +0x00DB => array(0xF3, "LATIN CAPITAL LETTER U WITH 
CIRCUMFLEX"), +0x00D9 => array(0xF4, "LATIN CAPITAL LETTER U WITH GRAVE"), +0x0131 => array(0xF5, "LATIN SMALL LETTER DOTLESS I"), +0x02C6 => array(0xF6, "MODIFIER LETTER CIRCUMFLEX ACCENT"), +0x02DC => array(0xF7, "SMALL TILDE"), +0x00AF => array(0xF8, "MACRON"), +0x02D8 => array(0xF9, "BREVE"), +0x02D9 => array(0xFA, "DOT ABOVE"), +0x02DA => array(0xFB, "RING ABOVE"), +0x00B8 => array(0xFC, "CEDILLA"), +0x02DD => array(0xFD, "DOUBLE ACUTE ACCENT"), +0x02DB => array(0xFE, "OGONEK"), +0x02C7 => array(0xFF, "CARON"), +); + +$res = html_entity_decode("", ENT_QUOTES, 'MacRoman'); +echo "Special test for  (shouldn't decode):\n"; +echo $res,"\n\n"; + +foreach ($arr as $u => $v) { + $ent = sprintf("&#x%X;", $u); + $res = html_entity_decode($ent, ENT_QUOTES, 'MacRoman'); + $d = unpack("H*", $res); + echo sprintf("%s: %s => %s\n", $v[1], $ent, $d[1]); + + $ent = sprintf("&#x%X;", $v[0]); + $res = html_entity_decode($ent, ENT_QUOTES, 'MacRoman'); + if ($res[0] != "&" || $res[1] != "#") + $res = unpack("H*", $res)[1]; + echo sprintf("%s => %s\n\n", $ent, $res); +} +--EXPECT-- +Special test for  (shouldn't decode): + + +LATIN CAPITAL LETTER A WITH DIAERESIS: Ä => 80 +€ => € + +LATIN CAPITAL LETTER A WITH RING ABOVE: Å => 81 + =>  + +LATIN CAPITAL LETTER C WITH CEDILLA: Ç => 82 +‚ => ‚ + +LATIN CAPITAL LETTER E WITH ACUTE: É => 83 +ƒ => ƒ + +LATIN CAPITAL LETTER N WITH TILDE: Ñ => 84 +„ => „ + +LATIN CAPITAL LETTER O WITH DIAERESIS: Ö => 85 +… => … + +LATIN CAPITAL LETTER U WITH DIAERESIS: Ü => 86 +† => † + +LATIN SMALL LETTER A WITH ACUTE: á => 87 +‡ => ‡ + +LATIN SMALL LETTER A WITH GRAVE: à => 88 +ˆ => ˆ + +LATIN SMALL LETTER A WITH CIRCUMFLEX: â => 89 +‰ => ‰ + +LATIN SMALL LETTER A WITH DIAERESIS: ä => 8a +Š => Š + +LATIN SMALL LETTER A WITH TILDE: ã => 8b +‹ => ‹ + +LATIN SMALL LETTER A WITH RING ABOVE: å => 8c +Œ => Œ + +LATIN SMALL LETTER C WITH CEDILLA: ç => 8d + =>  + +LATIN SMALL LETTER E WITH ACUTE: é => 8e +Ž => Ž + +LATIN SMALL LETTER E WITH GRAVE: è => 8f + 
=>  + +LATIN SMALL LETTER E WITH CIRCUMFLEX: ê => 90 + =>  + +LATIN SMALL LETTER E WITH DIAERESIS: ë => 91 +‘ => ‘ + +LATIN SMALL LETTER I WITH ACUTE: í => 92 +’ => ’ + +LATIN SMALL LETTER I WITH GRAVE: ì => 93 +“ => “ + +LATIN SMALL LETTER I WITH CIRCUMFLEX: î => 94 +” => ” + +LATIN SMALL LETTER I WITH DIAERESIS: ï => 95 +• => • + +LATIN SMALL LETTER N WITH TILDE: ñ => 96 +– => – + +LATIN SMALL LETTER O WITH ACUTE: ó => 97 +— => — + +LATIN SMALL LETTER O WITH GRAVE: ò => 98 +˜ => ˜ + +LATIN SMALL LETTER O WITH CIRCUMFLEX: ô => 99 +™ => ™ + +LATIN SMALL LETTER O WITH DIAERESIS: ö => 9a +š => š + +LATIN SMALL LETTER O WITH TILDE: õ => 9b +› => › + +LATIN SMALL LETTER U WITH ACUTE: ú => 9c +œ => œ + +LATIN SMALL LETTER U WITH GRAVE: ù => 9d + =>  + +LATIN SMALL LETTER U WITH CIRCUMFLEX: û => 9e +ž => ž + +LATIN SMALL LETTER U WITH DIAERESIS: ü => 9f +Ÿ => Ÿ + +DAGGER: † => a0 +  => ca + +DEGREE SIGN: ° => a1 +¡ => c1 + +CENT SIGN: ¢ => a2 +¢ => a2 + +POUND SIGN: £ => a3 +£ => a3 + +SECTION SIGN: § => a4 +¤ => ¤ + +BULLET: • => a5 +¥ => b4 + +PILCROW SIGN: ¶ => a6 +¦ => ¦ + +LATIN SMALL LETTER SHARP S: ß => a7 +§ => a4 + +REGISTERED SIGN: ® => a8 +¨ => ac + +COPYRIGHT SIGN: © => a9 +© => a9 + +TRADE MARK SIGN: ™ => aa +ª => bb + +ACUTE ACCENT: ´ => ab +« => c7 + +DIAERESIS: ¨ => ac +¬ => c2 + +NOT EQUAL TO: ≠ => ad +­ => ­ + +LATIN CAPITAL LETTER AE: Æ => ae +® => a8 + +LATIN CAPITAL LETTER O WITH STROKE: Ø => af +¯ => f8 + +INFINITY: ∞ => b0 +° => a1 + +PLUS-MINUS SIGN: ± => b1 +± => b1 + +LESS-THAN OR EQUAL TO: ≤ => b2 +² => ² + +GREATER-THAN OR EQUAL TO: ≥ => b3 +³ => ³ + +YEN SIGN: ¥ => b4 +´ => ab + +MICRO SIGN: µ => b5 +µ => b5 + +PARTIAL DIFFERENTIAL: ∂ => b6 +¶ => a6 + +N-ARY SUMMATION: ∑ => b7 +· => e1 + +N-ARY PRODUCT: ∏ => b8 +¸ => fc + +GREEK SMALL LETTER PI: π => b9 +¹ => ¹ + +INTEGRAL: ∫ => ba +º => bc + +FEMININE ORDINAL INDICATOR: ª => bb +» => c8 + +MASCULINE ORDINAL INDICATOR: º => bc +¼ => ¼ + +GREEK CAPITAL LETTER OMEGA: Ω => bd +½ => ½ + 
+LATIN SMALL LETTER AE: æ => be +¾ => ¾ + +LATIN SMALL LETTER O WITH STROKE: ø => bf +¿ => c0 + +INVERTED QUESTION MARK: ¿ => c0 +À => cb + +INVERTED EXCLAMATION MARK: ¡ => c1 +Á => e7 + +NOT SIGN: ¬ => c2 + => e5 + +SQUARE ROOT: √ => c3 +à => cc + +LATIN SMALL LETTER F WITH HOOK: ƒ => c4 +Ä => 80 + +ALMOST EQUAL TO: ≈ => c5 +Å => 81 + +INCREMENT: ∆ => c6 +Æ => ae + +LEFT-POINTING DOUBLE ANGLE QUOTATION MARK: « => c7 +Ç => 82 + +RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK: » => c8 +È => e9 + +HORIZONTAL ELLIPSIS: … => c9 +É => 83 + +NO-BREAK SPACE:   => ca +Ê => e6 + +LATIN CAPITAL LETTER A WITH GRAVE: À => cb +Ë => e8 + +LATIN CAPITAL LETTER A WITH TILDE: à => cc +Ì => ed + +LATIN CAPITAL LETTER O WITH TILDE: Õ => cd +Í => ea + +LATIN CAPITAL LIGATURE OE: Œ => ce +Î => eb + +LATIN SMALL LIGATURE OE: œ => cf +Ï => ec + +EN DASH: – => d0 +Ð => Ð + +EM DASH: — => d1 +Ñ => 84 + +LEFT DOUBLE QUOTATION MARK: “ => d2 +Ò => f1 + +RIGHT DOUBLE QUOTATION MARK: ” => d3 +Ó => ee + +LEFT SINGLE QUOTATION MARK: ‘ => d4 +Ô => ef + +RIGHT SINGLE QUOTATION MARK: ’ => d5 +Õ => cd + +DIVISION SIGN: ÷ => d6 +Ö => 85 + +LOZENGE: ◊ => d7 +× => × + +LATIN SMALL LETTER Y WITH DIAERESIS: ÿ => d8 +Ø => af + +LATIN CAPITAL LETTER Y WITH DIAERESIS: Ÿ => d9 +Ù => f4 + +FRACTION SLASH: ⁄ => da +Ú => f2 + +EURO SIGN: € => db +Û => f3 + +SINGLE LEFT-POINTING ANGLE QUOTATION MARK: ‹ => dc +Ü => 86 + +SINGLE RIGHT-POINTING ANGLE QUOTATION MARK: › => dd +Ý => Ý + +LATIN SMALL LIGATURE FI: fi => de +Þ => Þ + +LATIN SMALL LIGATURE FL: fl => df +ß => a7 + +DOUBLE DAGGER: ‡ => e0 +à => 88 + +MIDDLE DOT: · => e1 +á => 87 + +SINGLE LOW-9 QUOTATION MARK: ‚ => e2 +â => 89 + +DOUBLE LOW-9 QUOTATION MARK: „ => e3 +ã => 8b + +PER MILLE SIGN: ‰ => e4 +ä => 8a + +LATIN CAPITAL LETTER A WITH CIRCUMFLEX:  => e5 +å => 8c + +LATIN CAPITAL LETTER E WITH CIRCUMFLEX: Ê => e6 +æ => be + +LATIN CAPITAL LETTER A WITH ACUTE: Á => e7 +ç => 8d + +LATIN CAPITAL LETTER E WITH DIAERESIS: Ë => e8 +è => 8f + +LATIN CAPITAL LETTER 
E WITH GRAVE: È => e9 +é => 8e + +LATIN CAPITAL LETTER I WITH ACUTE: Í => ea +ê => 90 + +LATIN CAPITAL LETTER I WITH CIRCUMFLEX: Î => eb +ë => 91 + +LATIN CAPITAL LETTER I WITH DIAERESIS: Ï => ec +ì => 93 + +LATIN CAPITAL LETTER I WITH GRAVE: Ì => ed +í => 92 + +LATIN CAPITAL LETTER O WITH ACUTE: Ó => ee +î => 94 + +LATIN CAPITAL LETTER O WITH CIRCUMFLEX: Ô => ef +ï => 95 + +Apple logo:  => f0 +ð => ð + +LATIN CAPITAL LETTER O WITH GRAVE: Ò => f1 +ñ => 96 + +LATIN CAPITAL LETTER U WITH ACUTE: Ú => f2 +ò => 98 + +LATIN CAPITAL LETTER U WITH CIRCUMFLEX: Û => f3 +ó => 97 + +LATIN CAPITAL LETTER U WITH GRAVE: Ù => f4 +ô => 99 + +LATIN SMALL LETTER DOTLESS I: ı => f5 +õ => 9b + +MODIFIER LETTER CIRCUMFLEX ACCENT: ˆ => f6 +ö => 9a + +SMALL TILDE: ˜ => f7 +÷ => d6 + +MACRON: ¯ => f8 +ø => bf + +BREVE: ˘ => f9 +ù => 9d + +DOT ABOVE: ˙ => fa +ú => 9c + +RING ABOVE: ˚ => fb +û => 9e + +CEDILLA: ¸ => fc +ü => 9f + +DOUBLE ACUTE ACCENT: ˝ => fd +ý => ý + +OGONEK: ˛ => fe +þ => þ + +CARON: ˇ => ff +ÿ => d8 + + diff --git a/ext/standard/tests/strings/html_entity_decode_win1251.phpt b/ext/standard/tests/strings/html_entity_decode_win1251.phpt new file mode 100644 index 0000000000..e47392623c --- /dev/null +++ b/ext/standard/tests/strings/html_entity_decode_win1251.phpt @@ -0,0 +1,537 @@ +--TEST-- +Translation of HTML entities for encoding WIN-1251 +--FILE-- + array(0x80, "CYRILLIC CAPITAL LETTER DJE"), +0x0403 => array(0x81, "CYRILLIC CAPITAL LETTER GJE"), +0x201A => array(0x82, "SINGLE LOW-9 QUOTATION MARK"), +0x0453 => array(0x83, "CYRILLIC SMALL LETTER GJE"), +0x201E => array(0x84, "DOUBLE LOW-9 QUOTATION MARK"), +0x2026 => array(0x85, "HORIZONTAL ELLIPSIS"), +0x2020 => array(0x86, "DAGGER"), +0x2021 => array(0x87, "DOUBLE DAGGER"), +0x20AC => array(0x88, "EURO SIGN"), +0x2030 => array(0x89, "PER MILLE SIGN"), +0x0409 => array(0x8A, "CYRILLIC CAPITAL LETTER LJE"), +0x2039 => array(0x8B, "SINGLE LEFT-POINTING ANGLE QUOTATION MARK"), +0x040A => array(0x8C, "CYRILLIC CAPITAL 
LETTER NJE"), +0x040C => array(0x8D, "CYRILLIC CAPITAL LETTER KJE"), +0x040B => array(0x8E, "CYRILLIC CAPITAL LETTER TSHE"), +0x040F => array(0x8F, "CYRILLIC CAPITAL LETTER DZHE"), +0x0452 => array(0x90, "CYRILLIC SMALL LETTER DJE"), +0x2018 => array(0x91, "LEFT SINGLE QUOTATION MARK"), +0x2019 => array(0x92, "RIGHT SINGLE QUOTATION MARK"), +0x201C => array(0x93, "LEFT DOUBLE QUOTATION MARK"), +0x201D => array(0x94, "RIGHT DOUBLE QUOTATION MARK"), +0x2022 => array(0x95, "BULLET"), +0x2013 => array(0x96, "EN DASH"), +0x2014 => array(0x97, "EM DASH"), +//0x98 #UNDEFINED +0x2122 => array(0x99, "TRADE MARK SIGN"), +0x0459 => array(0x9A, "CYRILLIC SMALL LETTER LJE"), +0x203A => array(0x9B, "SINGLE RIGHT-POINTING ANGLE QUOTATION MARK"), +0x045A => array(0x9C, "CYRILLIC SMALL LETTER NJE"), +0x045C => array(0x9D, "CYRILLIC SMALL LETTER KJE"), +0x045B => array(0x9E, "CYRILLIC SMALL LETTER TSHE"), +0x045F => array(0x9F, "CYRILLIC SMALL LETTER DZHE"), +0x00A0 => array(0xA0, "NO-BREAK SPACE"), +0x040E => array(0xA1, "CYRILLIC CAPITAL LETTER SHORT U"), +0x045E => array(0xA2, "CYRILLIC SMALL LETTER SHORT U"), +0x0408 => array(0xA3, "CYRILLIC CAPITAL LETTER JE"), +0x00A4 => array(0xA4, "CURRENCY SIGN"), +0x0490 => array(0xA5, "CYRILLIC CAPITAL LETTER GHE WITH UPTURN"), +0x00A6 => array(0xA6, "BROKEN BAR"), +0x00A7 => array(0xA7, "SECTION SIGN"), +0x0401 => array(0xA8, "CYRILLIC CAPITAL LETTER IO"), +0x00A9 => array(0xA9, "COPYRIGHT SIGN"), +0x0404 => array(0xAA, "CYRILLIC CAPITAL LETTER UKRAINIAN IE"), +0x00AB => array(0xAB, "LEFT-POINTING DOUBLE ANGLE QUOTATION MARK"), +0x00AC => array(0xAC, "NOT SIGN"), +0x00AD => array(0xAD, "SOFT HYPHEN"), +0x00AE => array(0xAE, "REGISTERED SIGN"), +0x0407 => array(0xAF, "CYRILLIC CAPITAL LETTER YI"), +0x00B0 => array(0xB0, "DEGREE SIGN"), +0x00B1 => array(0xB1, "PLUS-MINUS SIGN"), +0x0406 => array(0xB2, "CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I"), +0x0456 => array(0xB3, "CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I"), +0x0491 => 
array(0xB4, "CYRILLIC SMALL LETTER GHE WITH UPTURN"), +0x00B5 => array(0xB5, "MICRO SIGN"), +0x00B6 => array(0xB6, "PILCROW SIGN"), +0x00B7 => array(0xB7, "MIDDLE DOT"), +0x0451 => array(0xB8, "CYRILLIC SMALL LETTER IO"), +0x2116 => array(0xB9, "NUMERO SIGN"), +0x0454 => array(0xBA, "CYRILLIC SMALL LETTER UKRAINIAN IE"), +0x00BB => array(0xBB, "RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK"), +0x0458 => array(0xBC, "CYRILLIC SMALL LETTER JE"), +0x0405 => array(0xBD, "CYRILLIC CAPITAL LETTER DZE"), +0x0455 => array(0xBE, "CYRILLIC SMALL LETTER DZE"), +0x0457 => array(0xBF, "CYRILLIC SMALL LETTER YI"), +0x0410 => array(0xC0, "CYRILLIC CAPITAL LETTER A"), +0x0411 => array(0xC1, "CYRILLIC CAPITAL LETTER BE"), +0x0412 => array(0xC2, "CYRILLIC CAPITAL LETTER VE"), +0x0413 => array(0xC3, "CYRILLIC CAPITAL LETTER GHE"), +0x0414 => array(0xC4, "CYRILLIC CAPITAL LETTER DE"), +0x0415 => array(0xC5, "CYRILLIC CAPITAL LETTER IE"), +0x0416 => array(0xC6, "CYRILLIC CAPITAL LETTER ZHE"), +0x0417 => array(0xC7, "CYRILLIC CAPITAL LETTER ZE"), +0x0418 => array(0xC8, "CYRILLIC CAPITAL LETTER I"), +0x0419 => array(0xC9, "CYRILLIC CAPITAL LETTER SHORT I"), +0x041A => array(0xCA, "CYRILLIC CAPITAL LETTER KA"), +0x041B => array(0xCB, "CYRILLIC CAPITAL LETTER EL"), +0x041C => array(0xCC, "CYRILLIC CAPITAL LETTER EM"), +0x041D => array(0xCD, "CYRILLIC CAPITAL LETTER EN"), +0x041E => array(0xCE, "CYRILLIC CAPITAL LETTER O"), +0x041F => array(0xCF, "CYRILLIC CAPITAL LETTER PE"), +0x0420 => array(0xD0, "CYRILLIC CAPITAL LETTER ER"), +0x0421 => array(0xD1, "CYRILLIC CAPITAL LETTER ES"), +0x0422 => array(0xD2, "CYRILLIC CAPITAL LETTER TE"), +0x0423 => array(0xD3, "CYRILLIC CAPITAL LETTER U"), +0x0424 => array(0xD4, "CYRILLIC CAPITAL LETTER EF"), +0x0425 => array(0xD5, "CYRILLIC CAPITAL LETTER HA"), +0x0426 => array(0xD6, "CYRILLIC CAPITAL LETTER TSE"), +0x0427 => array(0xD7, "CYRILLIC CAPITAL LETTER CHE"), +0x0428 => array(0xD8, "CYRILLIC CAPITAL LETTER SHA"), +0x0429 => array(0xD9, "CYRILLIC 
CAPITAL LETTER SHCHA"), +0x042A => array(0xDA, "CYRILLIC CAPITAL LETTER HARD SIGN"), +0x042B => array(0xDB, "CYRILLIC CAPITAL LETTER YERU"), +0x042C => array(0xDC, "CYRILLIC CAPITAL LETTER SOFT SIGN"), +0x042D => array(0xDD, "CYRILLIC CAPITAL LETTER E"), +0x042E => array(0xDE, "CYRILLIC CAPITAL LETTER YU"), +0x042F => array(0xDF, "CYRILLIC CAPITAL LETTER YA"), +0x0430 => array(0xE0, "CYRILLIC SMALL LETTER A"), +0x0431 => array(0xE1, "CYRILLIC SMALL LETTER BE"), +0x0432 => array(0xE2, "CYRILLIC SMALL LETTER VE"), +0x0433 => array(0xE3, "CYRILLIC SMALL LETTER GHE"), +0x0434 => array(0xE4, "CYRILLIC SMALL LETTER DE"), +0x0435 => array(0xE5, "CYRILLIC SMALL LETTER IE"), +0x0436 => array(0xE6, "CYRILLIC SMALL LETTER ZHE"), +0x0437 => array(0xE7, "CYRILLIC SMALL LETTER ZE"), +0x0438 => array(0xE8, "CYRILLIC SMALL LETTER I"), +0x0439 => array(0xE9, "CYRILLIC SMALL LETTER SHORT I"), +0x043A => array(0xEA, "CYRILLIC SMALL LETTER KA"), +0x043B => array(0xEB, "CYRILLIC SMALL LETTER EL"), +0x043C => array(0xEC, "CYRILLIC SMALL LETTER EM"), +0x043D => array(0xED, "CYRILLIC SMALL LETTER EN"), +0x043E => array(0xEE, "CYRILLIC SMALL LETTER O"), +0x043F => array(0xEF, "CYRILLIC SMALL LETTER PE"), +0x0440 => array(0xF0, "CYRILLIC SMALL LETTER ER"), +0x0441 => array(0xF1, "CYRILLIC SMALL LETTER ES"), +0x0442 => array(0xF2, "CYRILLIC SMALL LETTER TE"), +0x0443 => array(0xF3, "CYRILLIC SMALL LETTER U"), +0x0444 => array(0xF4, "CYRILLIC SMALL LETTER EF"), +0x0445 => array(0xF5, "CYRILLIC SMALL LETTER HA"), +0x0446 => array(0xF6, "CYRILLIC SMALL LETTER TSE"), +0x0447 => array(0xF7, "CYRILLIC SMALL LETTER CHE"), +0x0448 => array(0xF8, "CYRILLIC SMALL LETTER SHA"), +0x0449 => array(0xF9, "CYRILLIC SMALL LETTER SHCHA"), +0x044A => array(0xFA, "CYRILLIC SMALL LETTER HARD SIGN"), +0x044B => array(0xFB, "CYRILLIC SMALL LETTER YERU"), +0x044C => array(0xFC, "CYRILLIC SMALL LETTER SOFT SIGN"), +0x044D => array(0xFD, "CYRILLIC SMALL LETTER E"), +0x044E => array(0xFE, "CYRILLIC SMALL LETTER YU"), 
+0x044F => array(0xFF, "CYRILLIC SMALL LETTER YA"), +); + +$res = html_entity_decode("˜", ENT_QUOTES, 'WINDOWS-1251'); +echo "Special test for ˜ (shouldn't decode):\n"; +echo $res,"\n\n"; + +foreach ($arr as $u => $v) { + $ent = sprintf("&#x%X;", $u); + $res = html_entity_decode($ent, ENT_QUOTES, 'WINDOWS-1251'); + $d = unpack("H*", $res); + echo sprintf("%s: %s => %s\n", $v[1], $ent, $d[1]); + + $ent = sprintf("&#x%X;", $v[0]); + $res = html_entity_decode($ent, ENT_QUOTES, 'WINDOWS-1251'); + if ($res[0] != "&" || $res[1] != "#") + $res = unpack("H*", $res)[1]; + echo sprintf("%s => %s\n\n", $ent, $res); +} +--EXPECT-- +Special test for ˜ (shouldn't decode): +˜ + +CYRILLIC CAPITAL LETTER DJE: Ђ => 80 +€ => € + +CYRILLIC CAPITAL LETTER GJE: Ѓ => 81 + =>  + +SINGLE LOW-9 QUOTATION MARK: ‚ => 82 +‚ => ‚ + +CYRILLIC SMALL LETTER GJE: ѓ => 83 +ƒ => ƒ + +DOUBLE LOW-9 QUOTATION MARK: „ => 84 +„ => „ + +HORIZONTAL ELLIPSIS: … => 85 +… => … + +DAGGER: † => 86 +† => † + +DOUBLE DAGGER: ‡ => 87 +‡ => ‡ + +EURO SIGN: € => 88 +ˆ => ˆ + +PER MILLE SIGN: ‰ => 89 +‰ => ‰ + +CYRILLIC CAPITAL LETTER LJE: Љ => 8a +Š => Š + +SINGLE LEFT-POINTING ANGLE QUOTATION MARK: ‹ => 8b +‹ => ‹ + +CYRILLIC CAPITAL LETTER NJE: Њ => 8c +Œ => Œ + +CYRILLIC CAPITAL LETTER KJE: Ќ => 8d + =>  + +CYRILLIC CAPITAL LETTER TSHE: Ћ => 8e +Ž => Ž + +CYRILLIC CAPITAL LETTER DZHE: Џ => 8f + =>  + +CYRILLIC SMALL LETTER DJE: ђ => 90 + =>  + +LEFT SINGLE QUOTATION MARK: ‘ => 91 +‘ => ‘ + +RIGHT SINGLE QUOTATION MARK: ’ => 92 +’ => ’ + +LEFT DOUBLE QUOTATION MARK: “ => 93 +“ => “ + +RIGHT DOUBLE QUOTATION MARK: ” => 94 +” => ” + +BULLET: • => 95 +• => • + +EN DASH: – => 96 +– => – + +EM DASH: — => 97 +— => — + +TRADE MARK SIGN: ™ => 99 +™ => ™ + +CYRILLIC SMALL LETTER LJE: љ => 9a +š => š + +SINGLE RIGHT-POINTING ANGLE QUOTATION MARK: › => 9b +› => › + +CYRILLIC SMALL LETTER NJE: њ => 9c +œ => œ + +CYRILLIC SMALL LETTER KJE: ќ => 9d + =>  + +CYRILLIC SMALL LETTER TSHE: ћ => 9e +ž => ž + +CYRILLIC SMALL 
LETTER DZHE: џ => 9f +Ÿ => Ÿ + +NO-BREAK SPACE:   => a0 +  => a0 + +CYRILLIC CAPITAL LETTER SHORT U: Ў => a1 +¡ => ¡ + +CYRILLIC SMALL LETTER SHORT U: ў => a2 +¢ => ¢ + +CYRILLIC CAPITAL LETTER JE: Ј => a3 +£ => £ + +CURRENCY SIGN: ¤ => a4 +¤ => a4 + +CYRILLIC CAPITAL LETTER GHE WITH UPTURN: Ґ => a5 +¥ => ¥ + +BROKEN BAR: ¦ => a6 +¦ => a6 + +SECTION SIGN: § => a7 +§ => a7 + +CYRILLIC CAPITAL LETTER IO: Ё => a8 +¨ => ¨ + +COPYRIGHT SIGN: © => a9 +© => a9 + +CYRILLIC CAPITAL LETTER UKRAINIAN IE: Є => aa +ª => ª + +LEFT-POINTING DOUBLE ANGLE QUOTATION MARK: « => ab +« => ab + +NOT SIGN: ¬ => ac +¬ => ac + +SOFT HYPHEN: ­ => ad +­ => ad + +REGISTERED SIGN: ® => ae +® => ae + +CYRILLIC CAPITAL LETTER YI: Ї => af +¯ => ¯ + +DEGREE SIGN: ° => b0 +° => b0 + +PLUS-MINUS SIGN: ± => b1 +± => b1 + +CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I: І => b2 +² => ² + +CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I: і => b3 +³ => ³ + +CYRILLIC SMALL LETTER GHE WITH UPTURN: ґ => b4 +´ => ´ + +MICRO SIGN: µ => b5 +µ => b5 + +PILCROW SIGN: ¶ => b6 +¶ => b6 + +MIDDLE DOT: · => b7 +· => b7 + +CYRILLIC SMALL LETTER IO: ё => b8 +¸ => ¸ + +NUMERO SIGN: № => b9 +¹ => ¹ + +CYRILLIC SMALL LETTER UKRAINIAN IE: є => ba +º => º + +RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK: » => bb +» => bb + +CYRILLIC SMALL LETTER JE: ј => bc +¼ => ¼ + +CYRILLIC CAPITAL LETTER DZE: Ѕ => bd +½ => ½ + +CYRILLIC SMALL LETTER DZE: ѕ => be +¾ => ¾ + +CYRILLIC SMALL LETTER YI: ї => bf +¿ => ¿ + +CYRILLIC CAPITAL LETTER A: А => c0 +À => À + +CYRILLIC CAPITAL LETTER BE: Б => c1 +Á => Á + +CYRILLIC CAPITAL LETTER VE: В => c2 + =>  + +CYRILLIC CAPITAL LETTER GHE: Г => c3 +à => à + +CYRILLIC CAPITAL LETTER DE: Д => c4 +Ä => Ä + +CYRILLIC CAPITAL LETTER IE: Е => c5 +Å => Å + +CYRILLIC CAPITAL LETTER ZHE: Ж => c6 +Æ => Æ + +CYRILLIC CAPITAL LETTER ZE: З => c7 +Ç => Ç + +CYRILLIC CAPITAL LETTER I: И => c8 +È => È + +CYRILLIC CAPITAL LETTER SHORT I: Й => c9 +É => É + +CYRILLIC CAPITAL LETTER KA: К => ca +Ê => Ê + +CYRILLIC 
CAPITAL LETTER EL: Л => cb +Ë => Ë + +CYRILLIC CAPITAL LETTER EM: М => cc +Ì => Ì + +CYRILLIC CAPITAL LETTER EN: Н => cd +Í => Í + +CYRILLIC CAPITAL LETTER O: О => ce +Î => Î + +CYRILLIC CAPITAL LETTER PE: П => cf +Ï => Ï + +CYRILLIC CAPITAL LETTER ER: Р => d0 +Ð => Ð + +CYRILLIC CAPITAL LETTER ES: С => d1 +Ñ => Ñ + +CYRILLIC CAPITAL LETTER TE: Т => d2 +Ò => Ò + +CYRILLIC CAPITAL LETTER U: У => d3 +Ó => Ó + +CYRILLIC CAPITAL LETTER EF: Ф => d4 +Ô => Ô + +CYRILLIC CAPITAL LETTER HA: Х => d5 +Õ => Õ + +CYRILLIC CAPITAL LETTER TSE: Ц => d6 +Ö => Ö + +CYRILLIC CAPITAL LETTER CHE: Ч => d7 +× => × + +CYRILLIC CAPITAL LETTER SHA: Ш => d8 +Ø => Ø + +CYRILLIC CAPITAL LETTER SHCHA: Щ => d9 +Ù => Ù + +CYRILLIC CAPITAL LETTER HARD SIGN: Ъ => da +Ú => Ú + +CYRILLIC CAPITAL LETTER YERU: Ы => db +Û => Û + +CYRILLIC CAPITAL LETTER SOFT SIGN: Ь => dc +Ü => Ü + +CYRILLIC CAPITAL LETTER E: Э => dd +Ý => Ý + +CYRILLIC CAPITAL LETTER YU: Ю => de +Þ => Þ + +CYRILLIC CAPITAL LETTER YA: Я => df +ß => ß + +CYRILLIC SMALL LETTER A: а => e0 +à => à + +CYRILLIC SMALL LETTER BE: б => e1 +á => á + +CYRILLIC SMALL LETTER VE: в => e2 +â => â + +CYRILLIC SMALL LETTER GHE: г => e3 +ã => ã + +CYRILLIC SMALL LETTER DE: д => e4 +ä => ä + +CYRILLIC SMALL LETTER IE: е => e5 +å => å + +CYRILLIC SMALL LETTER ZHE: ж => e6 +æ => æ + +CYRILLIC SMALL LETTER ZE: з => e7 +ç => ç + +CYRILLIC SMALL LETTER I: и => e8 +è => è + +CYRILLIC SMALL LETTER SHORT I: й => e9 +é => é + +CYRILLIC SMALL LETTER KA: к => ea +ê => ê + +CYRILLIC SMALL LETTER EL: л => eb +ë => ë + +CYRILLIC SMALL LETTER EM: м => ec +ì => ì + +CYRILLIC SMALL LETTER EN: н => ed +í => í + +CYRILLIC SMALL LETTER O: о => ee +î => î + +CYRILLIC SMALL LETTER PE: п => ef +ï => ï + +CYRILLIC SMALL LETTER ER: р => f0 +ð => ð + +CYRILLIC SMALL LETTER ES: с => f1 +ñ => ñ + +CYRILLIC SMALL LETTER TE: т => f2 +ò => ò + +CYRILLIC SMALL LETTER U: у => f3 +ó => ó + +CYRILLIC SMALL LETTER EF: ф => f4 +ô => ô + +CYRILLIC SMALL LETTER HA: х => f5 +õ => õ + +CYRILLIC 
SMALL LETTER TSE: ц => f6 +ö => ö + +CYRILLIC SMALL LETTER CHE: ч => f7 +÷ => ÷ + +CYRILLIC SMALL LETTER SHA: ш => f8 +ø => ø + +CYRILLIC SMALL LETTER SHCHA: щ => f9 +ù => ù + +CYRILLIC SMALL LETTER HARD SIGN: ъ => fa +ú => ú + +CYRILLIC SMALL LETTER YERU: ы => fb +û => û + +CYRILLIC SMALL LETTER SOFT SIGN: ь => fc +ü => ü + +CYRILLIC SMALL LETTER E: э => fd +ý => ý + +CYRILLIC SMALL LETTER YU: ю => fe +þ => þ + +CYRILLIC SMALL LETTER YA: я => ff +ÿ => ÿ + + diff --git a/ext/standard/tests/strings/html_entity_decode_win1252.phpt b/ext/standard/tests/strings/html_entity_decode_win1252.phpt new file mode 100644 index 0000000000..2a7a6981dc --- /dev/null +++ b/ext/standard/tests/strings/html_entity_decode_win1252.phpt @@ -0,0 +1,169 @@ +--TEST-- +Translation of HTML entities for encoding WIN-1252 +--FILE-- + array(0x80, "EURO SIGN"), +//0x81 #UNDEFINED +0x201A => array(0x82, "SINGLE LOW-9 QUOTATION MARK"), +0x0192 => array(0x83, "LATIN SMALL LETTER F WITH HOOK"), +0x201E => array(0x84, "DOUBLE LOW-9 QUOTATION MARK"), +0x2026 => array(0x85, "HORIZONTAL ELLIPSIS"), +0x2020 => array(0x86, "DAGGER"), +0x2021 => array(0x87, "DOUBLE DAGGER"), +0x02C6 => array(0x88, "MODIFIER LETTER CIRCUMFLEX ACCENT"), +0x2030 => array(0x89, "PER MILLE SIGN"), +0x0160 => array(0x8A, "LATIN CAPITAL LETTER S WITH CARON"), +0x2039 => array(0x8B, "SINGLE LEFT-POINTING ANGLE QUOTATION MARK"), +0x0152 => array(0x8C, "LATIN CAPITAL LIGATURE OE"), +//0x8D #UNDEFINED +0x017D => array(0x8E, "LATIN CAPITAL LETTER Z WITH CARON"), +//0x8F #UNDEFINED +//0x90 #UNDEFINED +0x2018 => array(0x91, "LEFT SINGLE QUOTATION MARK"), +0x2019 => array(0x92, "RIGHT SINGLE QUOTATION MARK"), +0x201C => array(0x93, "LEFT DOUBLE QUOTATION MARK"), +0x201D => array(0x94, "RIGHT DOUBLE QUOTATION MARK"), +0x2022 => array(0x95, "BULLET"), +0x2013 => array(0x96, "EN DASH"), +0x2014 => array(0x97, "EM DASH"), +0x02DC => array(0x98, "SMALL TILDE"), +0x2122 => array(0x99, "TRADE MARK SIGN"), +0x0161 => array(0x9A, "LATIN SMALL 
LETTER S WITH CARON"), +0x203A => array(0x9B, "SINGLE RIGHT-POINTING ANGLE QUOTATION MARK"), +0x0153 => array(0x9C, "LATIN SMALL LIGATURE OE"), +//0x9D #UNDEFINED +0x017E => array(0x9E, "LATIN SMALL LETTER Z WITH CARON"), +0x0178 => array(0x9F, "LATIN CAPITAL LETTER Y WITH DIAERESIS"), +); + +$res = html_entity_decode("", ENT_QUOTES, 'WINDOWS-1252'); +echo "Special test for  (shouldn't decode):\n"; +echo $res,"\n\n"; + +$res = html_entity_decode("", ENT_QUOTES, 'WINDOWS-1252'); +echo "Special test for  (shouldn't decode):\n"; +echo $res,"\n\n"; + +$res = html_entity_decode("", ENT_QUOTES, 'WINDOWS-1252'); +echo "Special test for  (shouldn't decode):\n"; +echo $res,"\n\n"; + +$res = html_entity_decode("", ENT_QUOTES, 'WINDOWS-1252'); +echo "Special test for  (shouldn't decode):\n"; +echo $res,"\n\n"; + +$res = html_entity_decode("", ENT_QUOTES, 'WINDOWS-1252'); +echo "Special test for  (shouldn't decode):\n"; +echo $res,"\n\n"; + +foreach ($arr as $u => $v) { + $ent = sprintf("&#x%X;", $u); + $res = html_entity_decode($ent, ENT_QUOTES, 'WINDOWS-1252'); + $d = unpack("H*", $res); + echo sprintf("%s: %s => %s\n", $v[1], $ent, $d[1]); + + $ent = sprintf("&#x%X;", $v[0]); + $res = html_entity_decode($ent, ENT_QUOTES, 'WINDOWS-1252'); + if ($res[0] != "&" || $res[1] != "#") + $res = unpack("H*", $res)[1]; + echo sprintf("%s => %s\n\n", $ent, $res); +} +--EXPECT-- +Special test for  (shouldn't decode): + + +Special test for  (shouldn't decode): + + +Special test for  (shouldn't decode): + + +Special test for  (shouldn't decode): + + +Special test for  (shouldn't decode): + + +EURO SIGN: € => 80 +€ => € + +SINGLE LOW-9 QUOTATION MARK: ‚ => 82 +‚ => ‚ + +LATIN SMALL LETTER F WITH HOOK: ƒ => 83 +ƒ => ƒ + +DOUBLE LOW-9 QUOTATION MARK: „ => 84 +„ => „ + +HORIZONTAL ELLIPSIS: … => 85 +… => … + +DAGGER: † => 86 +† => † + +DOUBLE DAGGER: ‡ => 87 +‡ => ‡ + +MODIFIER LETTER CIRCUMFLEX ACCENT: ˆ => 88 +ˆ => ˆ + +PER MILLE SIGN: ‰ => 89 +‰ => ‰ + +LATIN CAPITAL 
LETTER S WITH CARON: Š => 8a +Š => Š + +SINGLE LEFT-POINTING ANGLE QUOTATION MARK: ‹ => 8b +‹ => ‹ + +LATIN CAPITAL LIGATURE OE: Œ => 8c +Œ => Œ + +LATIN CAPITAL LETTER Z WITH CARON: Ž => 8e +Ž => Ž + +LEFT SINGLE QUOTATION MARK: ‘ => 91 +‘ => ‘ + +RIGHT SINGLE QUOTATION MARK: ’ => 92 +’ => ’ + +LEFT DOUBLE QUOTATION MARK: “ => 93 +“ => “ + +RIGHT DOUBLE QUOTATION MARK: ” => 94 +” => ” + +BULLET: • => 95 +• => • + +EN DASH: – => 96 +– => – + +EM DASH: — => 97 +— => — + +SMALL TILDE: ˜ => 98 +˜ => ˜ + +TRADE MARK SIGN: ™ => 99 +™ => ™ + +LATIN SMALL LETTER S WITH CARON: š => 9a +š => š + +SINGLE RIGHT-POINTING ANGLE QUOTATION MARK: › => 9b +› => › + +LATIN SMALL LIGATURE OE: œ => 9c +œ => œ + +LATIN SMALL LETTER Z WITH CARON: ž => 9e +ž => ž + +LATIN CAPITAL LETTER Y WITH DIAERESIS: Ÿ => 9f +Ÿ => Ÿ + + diff --git a/ext/standard/tests/strings/htmlentities17.phpt b/ext/standard/tests/strings/htmlentities17.phpt index b203e7c3e0..d9e67a9b87 100644 --- a/ext/standard/tests/strings/htmlentities17.phpt +++ b/ext/standard/tests/strings/htmlentities17.phpt @@ -3,7 +3,6 @@ htmlentities() / html_entity_decode() #8592 - #9002 table test --FILE-- --EXPECT-- -string(8) "≀" string(7) "⊕" string(8) "⊗" string(6) "⊥" @@ -37,7 +35,6 @@ string(8) "⌊" string(8) "⌋" string(6) "⟨" string(6) "⟩" -string(6) "e28980" string(6) "e28a95" string(6) "e28a97" string(6) "e28aa5" -- 2.40.0