]> granicus.if.org Git - php/commitdiff
-New encoding/decoding HTML-ENTITIES.
authorMarcus Boerger <helly@php.net>
Sun, 4 Aug 2002 14:20:11 +0000 (14:20 +0000)
committerMarcus Boerger <helly@php.net>
Sun, 4 Aug 2002 14:20:11 +0000 (14:20 +0000)
#This allows enforcing named and numeric entities in output.
#Example: a CMS that reads UTF-8 or ISO-8859-1 but exports HTML in ASCII.
#Using mbstring.http_output=HTML-ENTITIES, users receive entities which
#can be displayed correctly independent of any browser-side encoding.
@New mbstring encoding/decoding HTML-ENTITIES. (marcus)

ext/mbstring/config.m4
ext/mbstring/html_entities.c [new file with mode: 0644]
ext/mbstring/mbfilter.c
ext/mbstring/mbfilter.h
ext/mbstring/tests/020.phpt [new file with mode: 0644]

index 60cfb3396d0692ef4700daf4602c5e1b9c5ed334..a0656c11f32398477d91a141abf6a7214047ab9e 100644 (file)
@@ -30,7 +30,7 @@ if test "$PHP_MBSTRING" != "no"; then
     AC_DEFINE(HAVE_MBSTR_KR,1,[whether to have korean support])
     AC_DEFINE(HAVE_MBSTR_RU,1,[whether to have russian support])
   fi
-  PHP_NEW_EXTENSION(mbstring, mbfilter_ja.c mbfilter_cn.c mbfilter_tw.c mbfilter_kr.c mbfilter_ru.c mbfilter.c mbstring.c mbregex.c php_mbregex.c, $ext_shared)
+  PHP_NEW_EXTENSION(mbstring, mbfilter_ja.c mbfilter_cn.c mbfilter_tw.c mbfilter_kr.c mbfilter_ru.c mbfilter.c mbstring.c mbregex.c php_mbregex.c html_entities.c, $ext_shared)
 else
   PHP_MBSTR_ENC_TRANS=no
 fi
diff --git a/ext/mbstring/html_entities.c b/ext/mbstring/html_entities.c
new file mode 100644 (file)
index 0000000..482c001
--- /dev/null
@@ -0,0 +1,291 @@
+/*
+   +----------------------------------------------------------------------+
+   | PHP Version 4                                                        |
+   +----------------------------------------------------------------------+
+   | Copyright (c) 2001 The PHP Group                                     |
+   +----------------------------------------------------------------------+
+   | This source file is subject to version 2.02 of the PHP license,      |
+   | that is bundled with this package in the file LICENSE, and is        |
+   | available at through the world-wide-web at                           |
+   | http://www.php.net/license/2_02.txt.                                 |
+   | If you did not receive a copy of the PHP license and are unable to   |
+   | obtain it through the world-wide-web, please send a note to          |
+   | license@php.net so we can mail you a copy immediately.               |
+   +----------------------------------------------------------------------+
+   | Author: Marcus Boerger <helly@php.net>                               |
+   +----------------------------------------------------------------------+
+ */
+
+/* $Id$ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "php.h"
+#include "php_globals.h"
+
+#include <stdlib.h>
+#include "mbfilter.h"
+
+/*
+ * Named HTML entities and their character codes, terminated by a
+ * {NULL, -1} sentinel.
+ *
+ * Keep the entries sorted by ascending code: the HTML-ENTITIES encoder
+ * (mbfl_filt_conv_html_enc) stops scanning at the first entry whose code is
+ * greater than the character it is looking up, so an out-of-order entry can
+ * never be found when encoding.
+ */
+const mbfl_html_entity mbfl_html_entity_list[] = { /*
+  {"quot",       34},  DO NOT CONVERT THESE AUTOMATICALLY
+  {"amp",        38},
+  {"lt",         60},
+  {"gt",         62},  */
+  {"nbsp",      160},
+  {"iexcl",     161},
+  {"cent",      162},
+  {"pound",     163},
+  {"curren",    164},
+  {"yen",       165},
+  {"brvbar",    166},
+  {"sect",      167},
+  {"uml",       168},
+  {"copy",      169},
+  {"ordf",      170},
+  {"laquo",     171},
+  {"not",       172},
+  {"shy",       173},
+  {"reg",       174},
+  {"macr",      175},
+  {"deg",       176},
+  {"plusmn",    177},
+  {"sup2",      178},
+  {"sup3",      179},
+  {"acute",     180},
+  {"micro",     181},
+  {"para",      182},
+  {"middot",    183},
+  {"cedil",     184},
+  {"sup1",      185},
+  {"ordm",      186},
+  {"raquo",     187},
+  {"frac14",    188},
+  {"frac12",    189},
+  {"frac34",    190},
+  {"iquest",    191},
+  {"Agrave",    192},
+  {"Aacute",    193},
+  {"Acirc",     194},
+  {"Atilde",    195},
+  {"Auml",      196},
+  {"Aring",     197},
+  {"AElig",     198},
+  {"Ccedil",    199},
+  {"Egrave",    200},
+  {"Eacute",    201},
+  {"Ecirc",     202},
+  {"Euml",      203},
+  {"Igrave",    204},
+  {"Iacute",    205},
+  {"Icirc",     206},
+  {"Iuml",      207},
+  {"ETH",       208},
+  {"Ntilde",    209},
+  {"Ograve",    210},
+  {"Oacute",    211},
+  {"Ocirc",     212},
+  {"Otilde",    213},
+  {"Ouml",      214},
+  {"times",     215},
+  {"Oslash",    216},
+  {"Ugrave",    217},
+  {"Uacute",    218},
+  {"Ucirc",     219},
+  {"Uuml",      220},
+  {"Yacute",    221},
+  {"THORN",     222},
+  {"szlig",     223},
+  {"agrave",    224},
+  {"aacute",    225},
+  {"acirc",     226},
+  {"atilde",    227},
+  {"auml",      228},
+  {"aring",     229},
+  {"aelig",     230},
+  {"ccedil",    231},
+  {"egrave",    232},
+  {"eacute",    233},
+  {"ecirc",     234},
+  {"euml",      235},
+  {"igrave",    236},
+  {"iacute",    237},
+  {"icirc",     238},
+  {"iuml",      239},
+  {"eth",       240},
+  {"ntilde",    241},
+  {"ograve",    242},
+  {"oacute",    243},
+  {"ocirc",     244},
+  {"otilde",    245},
+  {"ouml",      246},
+  {"divide",    247},
+  {"oslash",    248},
+  {"ugrave",    249},
+  {"uacute",    250},
+  {"ucirc",     251},
+  {"uuml",      252},
+  {"yacute",    253},
+  {"thorn",     254},
+  {"yuml",      255},
+  {"OElig",     338},
+  {"oelig",     339},
+  {"Scaron",    352},
+  {"scaron",    353},
+  {"Yuml",      376},
+  {"fnof",      402},
+  {"circ",      710},
+  {"tilde",     732},
+  {"Alpha",     913},
+  {"Beta",      914},
+  {"Gamma",     915},
+  {"Delta",     916},
+  {"Epsilon",   917},
+  {"Zeta",      918},
+  {"Eta",       919},
+  {"Theta",     920},
+  {"Iota",      921},
+  {"Kappa",     922},
+  {"Lambda",    923},
+  {"Mu",        924},
+  {"Nu",        925},
+  {"Xi",        926},
+  {"Omicron",   927},
+  {"Pi",        928},
+  {"Rho",       929},
+  {"Sigma",     931},
+  {"Tau",       932},
+  {"Upsilon",   933},
+  {"Phi",       934},
+  {"Chi",       935},
+  {"Psi",       936},
+  {"Omega",     937},
+  {"alpha",     945},
+  {"beta",      946},
+  {"gamma",     947},
+  {"delta",     948},
+  {"epsilon",   949},
+  {"zeta",      950},
+  {"eta",       951},
+  {"theta",     952},
+  {"iota",      953},
+  {"kappa",     954},
+  {"lambda",    955},
+  {"mu",        956},
+  {"nu",        957},
+  {"xi",        958},
+  {"omicron",   959},
+  {"pi",        960},
+  {"rho",       961},
+  {"sigmaf",    962},
+  {"sigma",     963},
+  {"tau",       964},
+  {"upsilon",   965},
+  {"phi",       966},
+  {"chi",       967},
+  {"psi",       968},
+  {"omega",     969},
+  {"thetasym",  977},
+  {"upsih",     978},
+  {"piv",       982},
+  {"ensp",     8194},
+  {"emsp",     8195},
+  {"thinsp",   8201},
+  {"zwnj",     8204},
+  {"zwj",      8205},
+  {"lrm",      8206},
+  {"rlm",      8207},
+  {"ndash",    8211},
+  {"mdash",    8212},
+  {"lsquo",    8216},
+  {"rsquo",    8217},
+  {"sbquo",    8218},
+  {"ldquo",    8220},
+  {"rdquo",    8221},
+  {"bdquo",    8222},
+  {"dagger",   8224},
+  {"Dagger",   8225},
+  {"bull",     8226},
+  {"hellip",   8230},
+  {"permil",   8240},
+  {"prime",    8242},
+  {"Prime",    8243},
+  {"lsaquo",   8249},
+  {"rsaquo",   8250},
+  {"oline",    8254},
+  {"frasl",    8260},
+  {"euro",     8364},
+  {"image",    8465},
+  {"weierp",   8472},
+  {"real",     8476},
+  {"trade",    8482},
+  {"alefsym",  8501},
+  {"larr",     8592},
+  {"uarr",     8593},
+  {"rarr",     8594},
+  {"darr",     8595},
+  {"harr",     8596},
+  {"crarr",    8629},
+  {"lArr",     8656},
+  {"uArr",     8657},
+  {"rArr",     8658},
+  {"dArr",     8659},
+  {"hArr",     8660},
+  {"forall",   8704},
+  {"part",     8706},
+  {"exist",    8707},
+  {"empty",    8709},
+  {"nabla",    8711},
+  {"isin",     8712},
+  {"notin",    8713},
+  {"ni",       8715},
+  {"prod",     8719},
+  {"sum",      8721},
+  {"minus",    8722},
+  {"lowast",   8727},
+  {"radic",    8730},
+  {"prop",     8733},
+  {"infin",    8734},
+  {"ang",      8736},
+  {"and",      8743},
+  {"or",       8744},
+  {"cap",      8745},
+  {"cup",      8746},
+  {"int",      8747},
+  {"there4",   8756},
+  {"sim",      8764},
+  {"cong",     8773},
+  {"asymp",    8776},
+  {"ne",       8800},
+  {"equiv",    8801},
+  {"le",       8804},
+  {"ge",       8805},
+  {"sub",      8834},
+  {"sup",      8835},
+  {"nsub",     8836},
+  {"sube",     8838},
+  {"supe",     8839},
+  {"oplus",    8853},
+  {"otimes",   8855},
+  {"perp",     8869},
+  {"sdot",     8901},
+  {"lceil",    8968},
+  {"rceil",    8969},
+  {"lfloor",   8970},
+  {"rfloor",   8971},
+  {"lang",     9001},
+  {"rang",     9002},
+  {"loz",      9674},
+  {"spades",   9824},
+  {"clubs",    9827},
+  {"hearts",   9829},
+  {"diams",    9830},
+  {NULL,         -1}  /* mark end of table */
+};
+
+/*
+ * Local variables:
+ * tab-width: 4
+ * c-basic-offset: 4
+ * End:
+ */
index 041e4e03b9164360f7258677d36160db949d8ce1..87e645e835e780ff3b30160f2eb82c0edc672644 100644 (file)
@@ -397,6 +397,25 @@ static const unsigned char mblen_table_uhc[] = { /* 0x81-0xFE */
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
 };
 
+static const unsigned char mblen_table_html[] = { /* 0x00 - 0xFF: 1 = byte is emitted unchanged, 6 = byte is emitted as an entity (6 is the length of a three-digit numeric entity such as &#128; and is not valid for named entities) */
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6
+};
+
 /* encoding structure */
 static const char *mbfl_encoding_pass_aliases[] = {"none", NULL};
 
@@ -483,6 +502,17 @@ static const mbfl_encoding mbfl_encoding_uuencode = {
        MBFL_ENCTYPE_SBCS
 };
 
+static const char *mbfl_encoding_html_ent_aliases[] = {"HTML", "html", NULL};
+
+static const mbfl_encoding mbfl_encoding_html_ent = {
+       mbfl_no_encoding_html_ent,
+       "HTML-ENTITIES",
+       "html",
+       (const char *(*)[])&mbfl_encoding_html_ent_aliases,
+       NULL, /* mblen_table_html is deliberately not used here: the length is calculated from the entities actually used */
+       MBFL_ENCTYPE_HTML_ENT
+};
+
 static const char *mbfl_encoding_qprint_aliases[] = {"qprint", NULL};
 
 static const mbfl_encoding mbfl_encoding_qprint = {
@@ -1025,6 +1055,7 @@ static const mbfl_encoding *mbfl_encoding_ptr_list[] = {
        &mbfl_encoding_byte4le,
        &mbfl_encoding_base64,
        &mbfl_encoding_uuencode,
+       &mbfl_encoding_html_ent,
        &mbfl_encoding_qprint,
        &mbfl_encoding_7bit,
        &mbfl_encoding_8bit,
@@ -1115,6 +1146,13 @@ static int mbfl_filt_conv_base64dec(int c, mbfl_convert_filter *filter TSRMLS_DC
 static int mbfl_filt_conv_base64dec_flush(mbfl_convert_filter *filter TSRMLS_DC);
 static int mbfl_filt_conv_uudec(int c, mbfl_convert_filter *filter TSRMLS_DC);
 
+static void mbfl_filt_conv_html_dec_ctor(mbfl_convert_filter *filter TSRMLS_DC);
+static void mbfl_filt_conv_html_dec_dtor(mbfl_convert_filter *filter TSRMLS_DC);
+static int mbfl_filt_conv_html_enc(int c, mbfl_convert_filter *filter TSRMLS_DC);
+static int mbfl_filt_conv_html_enc_flush(mbfl_convert_filter *filter TSRMLS_DC);
+static int mbfl_filt_conv_html_dec(int c, mbfl_convert_filter *filter TSRMLS_DC);
+static int mbfl_filt_conv_html_dec_flush(mbfl_convert_filter *filter TSRMLS_DC);
+
 static int mbfl_filt_conv_qprintenc(int c, mbfl_convert_filter *filter TSRMLS_DC);
 static int mbfl_filt_conv_qprintenc_flush(mbfl_convert_filter *filter TSRMLS_DC);
 static int mbfl_filt_conv_qprintdec(int c, mbfl_convert_filter *filter TSRMLS_DC);
@@ -1307,6 +1345,22 @@ static const struct mbfl_convert_vtbl vtbl_uuencode_8bit = {
        mbfl_filt_conv_uudec,
        mbfl_filt_conv_common_flush };
 
+static const struct mbfl_convert_vtbl vtbl_wchar_html = {
+       mbfl_no_encoding_wchar,
+       mbfl_no_encoding_html_ent,
+       mbfl_filt_conv_common_ctor,
+       mbfl_filt_conv_common_dtor,
+       mbfl_filt_conv_html_enc,
+       mbfl_filt_conv_html_enc_flush };
+
+static const struct mbfl_convert_vtbl vtbl_html_wchar = {
+       mbfl_no_encoding_html_ent,
+       mbfl_no_encoding_wchar,
+       mbfl_filt_conv_html_dec_ctor,
+       mbfl_filt_conv_html_dec_dtor,
+       mbfl_filt_conv_html_dec,
+       mbfl_filt_conv_html_dec_flush };
+
 static const struct mbfl_convert_vtbl vtbl_8bit_qprint = {
        mbfl_no_encoding_8bit,
        mbfl_no_encoding_qprint,
@@ -2185,6 +2239,8 @@ static const struct mbfl_convert_vtbl *mbfl_convert_filter_list[] = {
        &vtbl_8bit_b64,
        &vtbl_b64_8bit,
        &vtbl_uuencode_8bit,
+       &vtbl_wchar_html,
+       &vtbl_html_wchar,
        &vtbl_8bit_qprint,
        &vtbl_qprint_8bit,
        &vtbl_8bit_7bit,
@@ -3467,6 +3523,185 @@ mbfl_filt_conv_base64dec_flush(mbfl_convert_filter *filter TSRMLS_DC)
        return 0;
 }
 
+/*
+ * any => HTML
+ *
+ * Encodes one character `c` and writes the result through
+ * filter->output_function:
+ *   - codes 0..255 that mblen_table_html marks with 1 are passed through
+ *     unchanged;
+ *   - every other code is written as "&name;" when mbfl_html_entity_list has
+ *     an entry for it, otherwise as the decimal reference "&#NNN;".
+ *
+ * Returns `c`; an output error is propagated by the CK() macro.
+ */
+static int
+mbfl_filt_conv_html_enc(int c, mbfl_convert_filter *filter TSRMLS_DC)
+{
+       /* at most 3 decimal digits per byte of the value being printed */
+       int digits[3 * sizeof(unsigned int) + 1];
+       int ndigits = 0;
+       int i, p = 0, e;
+       unsigned int n;
+
+       /* the range check must come first: the table only has 256 slots */
+       if (c >= 0 && c < 256 && mblen_table_html[c] == 1) {
+               CK((*filter->output_function)(c, filter->data TSRMLS_CC));
+               return c;
+       }
+
+       CK((*filter->output_function)('&', filter->data TSRMLS_CC));
+
+       /*
+        * Named entity lookup. The scan stops at the sentinel (-1) or at the
+        * first entry whose code is greater than c, so it relies on the table
+        * being sorted by ascending code.
+        */
+       for (i = 0; (e = mbfl_html_entity_list[i].code) != -1 && e <= c; i++) {
+               if (e == c) {
+                       while (mbfl_html_entity_list[i].name[p]) {
+                               CK((*filter->output_function)((int)mbfl_html_entity_list[i].name[p++], filter->data TSRMLS_CC));
+                       }
+                       break;
+               }
+       }
+
+       if (!p) {
+               /*
+                * No name was written: fall back to a decimal reference. The
+                * digit buffer has its own index (the table index must not be
+                * reused here) and the digits are taken from a copy so that
+                * c is still intact for the return value.
+                */
+               CK((*filter->output_function)('#', filter->data TSRMLS_CC));
+               n = (unsigned int)c;
+               do {
+                       digits[ndigits++] = '0' + (int)(n % 10);
+                       n /= 10;
+               } while (n);
+               /* digits were produced least significant first */
+               do {
+                       CK((*filter->output_function)(digits[--ndigits], filter->data TSRMLS_CC));
+               } while (ndigits);
+       }
+       CK((*filter->output_function)(';', filter->data TSRMLS_CC));
+       return c;
+}
+
+/*
+ * Flush handler for the HTML encoder. The encoder writes every character
+ * immediately, so this only resets the filter's cache and status fields.
+ * Always returns 0.
+ */
+static int
+mbfl_filt_conv_html_enc_flush(mbfl_convert_filter *filter TSRMLS_DC)
+{
+       filter->cache = 0;
+       filter->status = 0;
+
+       return 0;
+}
+
+/*
+ * HTML => any
+ */
+#define html_enc_buffer_size   16
+static const char html_entity_chars[] = "#0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
+/*
+ * Constructor for the HTML decoder: allocates the html_enc_buffer_size byte
+ * scratch buffer in which a pending entity is collected and stores it in
+ * filter->cache; filter->status starts at 0 (nothing pending).
+ *
+ * NOTE(review): the pointer is stored through an (int) cast; if filter->cache
+ * is narrower than a pointer on the target platform the address would be
+ * truncated -- confirm against the declaration of mbfl_convert_filter.
+ */
+static void
+mbfl_filt_conv_html_dec_ctor(mbfl_convert_filter *filter TSRMLS_DC)
+{
+       void *scratch = mbfl_malloc(html_enc_buffer_size);
+
+       filter->cache = (int)scratch;
+       filter->status = 0;
+}
+       
+/*
+ * Destructor for the HTML decoder: releases the scratch buffer kept in
+ * filter->cache (if one is set) and clears both cache and status.
+ */
+static void
+mbfl_filt_conv_html_dec_dtor(mbfl_convert_filter *filter TSRMLS_DC)
+{
+       if (filter->cache != 0) {
+               mbfl_free((void*)filter->cache);
+       }
+       filter->cache = 0;
+       filter->status = 0;
+}
+
+/*
+ * Parse the digits of a numeric character reference. `digits` points just
+ * past the "&#" and is NUL terminated. Both the decimal form ("228") and the
+ * hexadecimal form ("xE4" or "XE4") are accepted.
+ *
+ * Returns 1 and stores the value in *code on success. Returns 0 when there
+ * are no digits, when a character is not a digit of the selected base, or
+ * when the value would not fit into a non-negative 31 bit number.
+ */
+static int
+mbfl_filt_conv_html_dec_numeric(const char *digits, int *code)
+{
+       unsigned int value = 0, digit, base = 10;
+
+       if (*digits == 'x' || *digits == 'X') {
+               base = 16;
+               digits++;
+       }
+       if (!*digits) {
+               return 0;
+       }
+       for (; *digits; digits++) {
+               if (*digits >= '0' && *digits <= '9') {
+                       digit = (unsigned int)(*digits - '0');
+               } else if (base == 16 && *digits >= 'a' && *digits <= 'f') {
+                       digit = (unsigned int)(*digits - 'a') + 10;
+               } else if (base == 16 && *digits >= 'A' && *digits <= 'F') {
+                       digit = (unsigned int)(*digits - 'A') + 10;
+               } else {
+                       return 0;
+               }
+               /* reject before multiplying so the check itself cannot wrap */
+               if (value > (0x7fffffffU - digit) / base) {
+                       return 0;
+               }
+               value = value * base + digit;
+       }
+       *code = (int)value;
+       return 1;
+}
+
+/*
+ * HTML => any: decode one input byte `c`.
+ *
+ * filter->status is the number of bytes of a pending entity collected in the
+ * scratch buffer (filter->cache); 0 means no entity is pending.
+ *   - Outside an entity, '&' starts collecting and any other byte is passed
+ *     straight to filter->output_function.
+ *   - Inside an entity, ';' ends it: "&#...;" is decoded as a numeric
+ *     reference and anything else is looked up by name in
+ *     mbfl_html_entity_list. If that fails, a warning is raised and the
+ *     collected text (including the ';') is written out unchanged.
+ *   - Inside an entity, any other byte is appended. A byte that cannot be
+ *     part of an entity, a '#' anywhere but directly after '&', or a full
+ *     buffer aborts the entity: a warning is raised and the collected text
+ *     is written out unchanged. An aborting '&' starts a new entity.
+ *
+ * Returns `c`; an output error is propagated by the CK() macro.
+ */
+static int
+mbfl_filt_conv_html_dec(int c, mbfl_convert_filter *filter TSRMLS_DC)
+{
+       int ent = 0, decoded;
+       const mbfl_html_entity *entity;
+       char *buffer = (char*)filter->cache;
+
+       if (!filter->status)
+       {
+               if (c == '&')
+               {
+                       filter->status = 1;
+                       buffer[0] = '&';
+               }
+               else
+               {
+                       CK((*filter->output_function)(c, filter->data TSRMLS_CC));
+               }
+               return c;
+       }
+
+       if (c == ';')
+       {
+               buffer[filter->status] = 0;
+               if (buffer[1] == '#')
+               {
+                       /* numeric entity: only accepted when every digit is valid */
+                       decoded = mbfl_filt_conv_html_dec_numeric(buffer + 2, &ent);
+               }
+               else
+               {
+                       /* named entity */
+                       for (entity = mbfl_html_entity_list; entity->name; entity++)
+                       {
+                               if (!strcmp(buffer+1, entity->name))
+                               {
+                                       ent = entity->code;
+                                       break;
+                               }
+                       }
+                       decoded = (ent != 0);
+               }
+               if (decoded)
+               {
+                       CK((*filter->output_function)(ent, filter->data TSRMLS_CC));
+                       filter->status = 0;
+               }
+               else
+               {
+                       /* failure: hand the text through as it was received */
+                       buffer[filter->status++] = ';';
+                       buffer[filter->status] = 0;
+                       php_error(E_WARNING, "%s() mbstring cannot decode '%s'", get_active_function_name(TSRMLS_C), buffer);
+                       mbfl_filt_conv_html_dec_flush(filter TSRMLS_CC);
+               }
+               return c;
+       }
+
+       /* add character and check */
+       buffer[filter->status++] = c;
+       /* c == 0 is tested separately: strchr() would match the terminating NUL */
+       if (c == 0 || !strchr(html_entity_chars, c) || filter->status+1==html_enc_buffer_size || (c=='#' && filter->status>2))
+       {
+               /* illegal character or end of buffer */
+               if (c=='&')
+                       filter->status--;
+               buffer[filter->status] = 0;
+               php_error(E_WARNING, "%s() mbstring cannot decode '%s'", get_active_function_name(TSRMLS_C), buffer);
+               mbfl_filt_conv_html_dec_flush(filter TSRMLS_CC);
+               if (c=='&')
+               {
+                       /* the '&' that ended the broken entity starts the next one */
+                       filter->status = 1;
+                       buffer[0] = '&';
+               }
+       }
+       return c;
+}
+
+/*
+ * Flush handler for the HTML decoder: writes the filter->status bytes of a
+ * still pending (unfinished) entity to filter->output_function unchanged and
+ * then marks the filter as having nothing pending. The scratch buffer in
+ * filter->cache is kept for further use. Returns 0; an output error is
+ * propagated by the CK() macro before status is reset.
+ */
+static int
+mbfl_filt_conv_html_dec_flush(mbfl_convert_filter *filter TSRMLS_DC)
+{
+       char *pending = (char*)filter->cache;
+       int remaining;
+
+       /* flush fragments */
+       for (remaining = filter->status; remaining; remaining--)
+       {
+               CK((*filter->output_function)(*pending++, filter->data TSRMLS_CC));
+       }
+       filter->status = 0;
+       return 0;
+}
+
 /*
  * any => Quoted-Printable
  */
index 3e3c9dd11c5e1e1f4d91d8e83944d5b211e77f95..563a11ea89ab9089dd820bfe226be83ff152af17 100644 (file)
@@ -128,6 +128,7 @@ enum mbfl_no_encoding {
        mbfl_no_encoding_byte4le,
        mbfl_no_encoding_base64,
        mbfl_no_encoding_uuencode,
+       mbfl_no_encoding_html_ent,
        mbfl_no_encoding_qprint,
        mbfl_no_encoding_7bit,
        mbfl_no_encoding_8bit,
@@ -222,6 +223,7 @@ typedef struct _mbfl_encoding {
 #define MBFL_ENCTYPE_MWC4BE            0x00000400
 #define MBFL_ENCTYPE_MWC4LE            0x00000800
 #define MBFL_ENCTYPE_SHFTCODE  0x00001000 
+#define MBFL_ENCTYPE_HTML_ENT       0x00002000
 
 /* wchar plane, special charactor */
 #define MBFL_WCSPLANE_MASK                     0xffff
@@ -574,4 +576,14 @@ mbfl_html_numeric_entity(mbfl_string *string, mbfl_string *result, int *convmap,
 mbfl_string *
 mbfl_ja_jp_hantozen(mbfl_string *string, mbfl_string *result, int mode TSRMLS_DC);
 
+/*
+ * HTML Entity table
+ */
+typedef struct _mbfl_html_entity {
+       char *  name;
+       int     code;
+} mbfl_html_entity;
+
+extern const mbfl_html_entity mbfl_html_entity_list[];
+
 #endif /* MBFL_MBFILTER_H */
diff --git a/ext/mbstring/tests/020.phpt b/ext/mbstring/tests/020.phpt
new file mode 100644 (file)
index 0000000..cc13d9e
--- /dev/null
@@ -0,0 +1,33 @@
+--TEST--
+HTML input/output
+--SKIPIF--
+<?php 
+       ini_set('include_path','.'); 
+       include('skipif.inc'); 
+?>
+--INI--
+arg_separator.input="x"
+error_reporting=0
+mbstring.http_input=HTML-ENTITIES
+mbstring.internal_encoding=UTF8
+mbstring.http_output=HTML-ENTITIES
+mbstring.encoding_translation=On
+--FILE--
+<?php
+// enable output encoding through output handler
+ob_start("mb_output_handler");
+// &#64;... must be decoded on input; they are not re-encoded on output.
+// If you see &#64;&#65;&#66; on output, this means input decoding failed.
+// If you do not see &auml;... on output, this means output encoding failed.
+// Using UTF-8 internally allows encoding/decoding ALL characters.
+// &#128;... will stay as they are since their character codes are above 127
+// and they do not have a named entity representation.
+?>
+<?php echo mb_http_input('l').'>'.mb_internal_encoding().'>'.mb_http_output();?>
+
+<?php mb_parse_str("test=&&;&&#64;&#65;&#66;&#128;&#129;&#130;&auml;&ouml;&uuml;&euro;&lang;&rang;");
+echo "test='$test'";
+?>
+--EXPECT--
+HTML-ENTITIES>UTF-8>HTML-ENTITIES
+test='&&;&@AB&#128;&#129;&#130;&auml;&ouml;&uuml;&euro;&lang;&rang;'
\ No newline at end of file