From: Rui Hirokawa Date: Sat, 15 Sep 2001 04:48:48 +0000 (+0000) Subject: Added support for Japanese encoding to htmlentities() and htmlspecialchars(). @ Added... X-Git-Tag: PRE_SUBST_Z_MACROS~102 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9c5580c7d41ffc3f261a02a2178c16fede235718;p=php Added support for Japanese encoding to htmlentities() and htmlspecialchars(). @ Added support for Japanese encoding to htmlentities() and htmlspecialchars(). (Rui) --- diff --git a/ext/standard/html.c b/ext/standard/html.c index 6a6c773140..092949c031 100644 --- a/ext/standard/html.c +++ b/ext/standard/html.c @@ -35,7 +35,8 @@ Defaults to ISO-8859-1 for now. */ enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252, - cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, cs_big5hkscs }; + cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, + cs_big5hkscs, cs_sjis, cs_eucjp}; typedef const char * entity_table_t; /* codepage 1252 is a Windows extension to iso-8859-1. */ @@ -99,6 +100,8 @@ static const struct html_entity_map entity_map[] = { { cs_big5, 0xa0, 0xff, ent_iso_8859_1 }, { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 }, { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 }, { cs_terminator } }; @@ -113,6 +116,10 @@ static const struct { { "BIG5", cs_big5 }, { "GB2312", cs_gb2312 }, { "BIG5-HKSCS", cs_big5hkscs }, + { "Shift_JIS", cs_sjis }, + { "SJIS", cs_sjis }, + { "EUCJP", cs_eucjp }, + { "EUC-JP", cs_eucjp }, { NULL } }; @@ -233,6 +240,74 @@ inline static unsigned short get_next_char(enum entity_charset charset, pos++; } + } + break; + } + case cs_sjis: + { + /* check if this is the first of a 2-byte sequence */ + if ( (this_char >= 0x81 && this_char <= 0x9f) || + (this_char >= 0xe0 && this_char <= 0xef) + ) { + /* peek at the next char */ + unsigned char next_char = str[pos]; + if ((next_char >= 0x40 && next_char <= 0x7e) || + (next_char >= 0x80 && next_char <= 0xfc)) + { + /* yes, this a wide char 
*/ + this_char <<= 8; + mbseq[mbpos++] = next_char; + this_char |= next_char; + pos++; + } + + } + break; + } + case cs_eucjp: + { + /* check if this is the first of a multi-byte sequence */ + if (this_char >= 0xa1 && this_char <= 0xfe) { + /* peek at the next char */ + unsigned char next_char = str[pos]; + if (next_char >= 0xa1 && next_char <= 0xfe) + { + /* yes, this a jis kanji char */ + this_char <<= 8; + mbseq[mbpos++] = next_char; + this_char |= next_char; + pos++; + } + + } else if (this_char == 0x8e) { + /* peek at the next char */ + unsigned char next_char = str[pos]; + if (next_char >= 0xa1 && next_char <= 0xdf) + { + /* JIS X 0201 kana */ + this_char <<= 8; + mbseq[mbpos++] = next_char; + this_char |= next_char; + pos++; + } + + } else if (this_char == 0x8f) { + /* peek at the next two char */ + unsigned char next_char = str[pos]; + unsigned char next2_char = str[pos+1]; + if ((next_char >= 0xa1 && next_char <= 0xfe) && + (next2_char >= 0xa1 && next2_char <= 0xfe)) + { + /* JIS X 0212 hojo-kanji */ + this_char <<= 8; + mbseq[mbpos++] = next_char; + this_char |= next_char; + this_char <<= 8; + mbseq[mbpos++] = next2_char; + this_char |= next2_char; + pos+=2; + } + } break; }