From: Rui Hirokawa Date: Sun, 12 May 2002 13:06:13 +0000 (+0000) Subject: added chinese HZ encoding support. fixed ascii area character conversion was not... X-Git-Tag: php-4.3.0dev-ZendEngine2-Preview1~98 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=767fa10a9285c4af12c3e704377279b42696c381;p=php added chinese HZ encoding support. fixed ascii area character conversion was not work in euc-cn and euc-kr. --- diff --git a/ext/mbstring/mbfilter.c b/ext/mbstring/mbfilter.c index 4e335f12fb..0968b44fc9 100644 --- a/ext/mbstring/mbfilter.c +++ b/ext/mbstring/mbfilter.c @@ -145,6 +145,16 @@ static mbfl_language mbfl_language_japanese = { mbfl_no_encoding_7bit }; +static mbfl_language mbfl_language_korean = { + mbfl_no_language_korean, + "Korean", + "ko", + NULL, + mbfl_no_encoding_2022kr, + mbfl_no_encoding_base64, + mbfl_no_encoding_7bit +}; + static mbfl_language mbfl_language_english = { mbfl_no_language_english, "English", @@ -155,12 +165,22 @@ static mbfl_language mbfl_language_english = { mbfl_no_encoding_8bit }; -static mbfl_language mbfl_language_chinese = { - mbfl_no_language_chinese, - "Chinese", - "zh", +static mbfl_language mbfl_language_simplified_chinese = { + mbfl_no_language_simplified_chinese, + "Simplified Chinese", + "zh-cn", NULL, - mbfl_no_encoding_2022jp, + mbfl_no_encoding_hz, + mbfl_no_encoding_base64, + mbfl_no_encoding_7bit +}; + +static mbfl_language mbfl_language_traditional_chinese = { + mbfl_no_language_traditional_chinese, + "Traditional Chinese", + "zh-tw", + NULL, + mbfl_no_encoding_hz, /* NOTE(review): same mail charset as Simplified Chinese; confirm HZ is intended for zh-tw */ mbfl_no_encoding_base64, mbfl_no_encoding_7bit }; @@ -168,7 +188,9 @@ static mbfl_language mbfl_language_chinese = { static mbfl_language *mbfl_language_ptr_table[] = { &mbfl_language_uni, &mbfl_language_japanese, - &mbfl_language_chinese, + &mbfl_language_korean, + &mbfl_language_simplified_chinese, + &mbfl_language_traditional_chinese, &mbfl_language_english, NULL }; @@ -707,6 +729,15 @@ static mbfl_encoding mbfl_encoding_cp936 = { 
MBFL_ENCTYPE_MBCS }; +static mbfl_encoding mbfl_encoding_hz = { + mbfl_no_encoding_hz, + "HZ", + "HZ-GB-2312", + NULL, + NULL, + MBFL_ENCTYPE_MBCS | MBFL_ENCTYPE_SHFTCODE +}; + #endif /* HAVE_MBSTR_CN */ #if defined(HAVE_MBSTR_TW) @@ -967,6 +998,7 @@ static mbfl_encoding *mbfl_encoding_ptr_list[] = { #if defined(HAVE_MBSTR_CN) &mbfl_encoding_euc_cn, &mbfl_encoding_cp936, + &mbfl_encoding_hz, #endif #if defined(HAVE_MBSTR_TW) &mbfl_encoding_euc_tw, @@ -1072,6 +1104,7 @@ static int mbfl_filt_ident_2022jp(int c, mbfl_identify_filter *filter TSRMLS_DC) #if defined(HAVE_MBSTR_CN) static int mbfl_filt_ident_euccn(int c, mbfl_identify_filter *filter TSRMLS_DC); static int mbfl_filt_ident_cp936(int c, mbfl_identify_filter *filter TSRMLS_DC); +static int mbfl_filt_ident_hz(int c, mbfl_identify_filter *filter TSRMLS_DC); #endif /* HAVE_MBSTR_CN */ #if defined(HAVE_MBSTR_TW) @@ -1605,6 +1638,23 @@ static struct mbfl_convert_vtbl vtbl_wchar_cp936 = { mbfl_filt_conv_common_dtor, mbfl_filt_conv_wchar_cp936, mbfl_filt_conv_common_flush }; + +static struct mbfl_convert_vtbl vtbl_hz_wchar = { + mbfl_no_encoding_hz, + mbfl_no_encoding_wchar, + mbfl_filt_conv_common_ctor, + mbfl_filt_conv_common_dtor, + mbfl_filt_conv_hz_wchar, + mbfl_filt_conv_common_flush }; + +static struct mbfl_convert_vtbl vtbl_wchar_hz = { + mbfl_no_encoding_wchar, + mbfl_no_encoding_hz, + mbfl_filt_conv_common_ctor, + mbfl_filt_conv_common_dtor, + mbfl_filt_conv_wchar_hz, + mbfl_filt_conv_any_hz_flush }; + #endif /* HAVE_MBSTR_CN */ #if defined(HAVE_MBSTR_TW) @@ -1923,6 +1973,8 @@ static struct mbfl_convert_vtbl *mbfl_convert_filter_list[] = { &vtbl_wchar_euccn, &vtbl_cp936_wchar, &vtbl_wchar_cp936, + &vtbl_hz_wchar, + &vtbl_wchar_hz, #endif #if defined(HAVE_MBSTR_TW) &vtbl_euctw_wchar, @@ -2083,6 +2135,13 @@ static struct mbfl_identify_vtbl vtbl_identify_cp936 = { mbfl_filt_ident_common_ctor, mbfl_filt_ident_common_dtor, mbfl_filt_ident_cp936 }; + +static struct mbfl_identify_vtbl vtbl_identify_hz = { + 
mbfl_no_encoding_hz, + mbfl_filt_ident_common_ctor, + mbfl_filt_ident_common_dtor, + mbfl_filt_ident_hz }; + #endif /* HAVE_MBSTR_CN */ #if defined(HAVE_MBSTR_TW) @@ -2218,6 +2277,7 @@ static struct mbfl_identify_vtbl *mbfl_identify_filter_list[] = { #if defined(HAVE_MBSTR_CN) &vtbl_identify_euccn, &vtbl_identify_cp936, + &vtbl_identify_hz, #endif #if defined(HAVE_MBSTR_TW) &vtbl_identify_euctw, @@ -5756,6 +5816,53 @@ mbfl_filt_ident_cp936(int c, mbfl_identify_filter *filter TSRMLS_DC) return c; } +static int +mbfl_filt_ident_hz(int c, mbfl_identify_filter *filter TSRMLS_DC) +{ /* Identify filter for HZ: consumes one byte per call, sets filter->flag on a byte it treats as bad, returns c. */ + switch (filter->status & 0xf) { +/* case 0x00: ASCII */ +/* case 0x10: GB2312 */ + case 0: + if (c == 0x7e) { + filter->status += 2; /* '~' seen: low nibble of status becomes 2 */ + } else if (filter->status == 0x10 && c > 0x20 && c < 0x7f) { /* DBCS first char */ + filter->status += 1; + } else if (c >= 0 && c < 0x80) { /* latin, CTLs */ + ; + } else { + filter->flag = 1; /* bad */ + } + break; + +/* case 0x11: GB2312 second char */ + case 1: + filter->status &= ~0xf; + if (c < 0x21 || c > 0x7e) { /* bad */ + filter->flag = 1; + } + break; + + case 2: /* byte following '~' */ + if (c == 0x7d) { /* '}' */ + filter->status = 0; + } else if (c == 0x7b) { /* '{' */ + filter->status = 0x10; + } else if (c == 0x7e) { /* '~' */ + filter->status = 0; + } else { + filter->flag = 1; /* bad */ + filter->status &= ~0xf; + } + break; + + default: + filter->status = 0; + break; + } + + return c; +} + #endif /* HAVE_MBSTR_CN */ #if defined(HAVE_MBSTR_TW) diff --git a/ext/mbstring/mbfilter.h b/ext/mbstring/mbfilter.h index a5077bd578..c63e6273ec 100644 --- a/ext/mbstring/mbfilter.h +++ b/ext/mbstring/mbfilter.h @@ -111,7 +111,8 @@ enum mbfl_no_language { mbfl_no_language_polish, /* pl */ mbfl_no_language_portuguese, /* pt */ mbfl_no_language_swedish, /* sv */ - mbfl_no_language_chinese, /* zh */ + mbfl_no_language_simplified_chinese, /* zh-cn */ + mbfl_no_language_traditional_chinese, /* zh-tw */ mbfl_no_language_max }; @@ -172,7 +173,9 @@ enum mbfl_no_encoding { 
mbfl_no_encoding_euc_tw, mbfl_no_encoding_big5, mbfl_no_encoding_euc_kr, + mbfl_no_encoding_2022kr, mbfl_no_encoding_uhc, + mbfl_no_encoding_hz, mbfl_no_encoding_charset_max }; diff --git a/ext/mbstring/mbfilter_cn.c b/ext/mbstring/mbfilter_cn.c index 41123b57c6..9dcdd7dfdb 100644 --- a/ext/mbstring/mbfilter_cn.c +++ b/ext/mbstring/mbfilter_cn.c @@ -117,8 +117,8 @@ mbfl_filt_conv_wchar_euccn(int c, mbfl_convert_filter *filter TSRMLS_DC) c1 = (s >> 8) & 0xff; c2 = s & 0xff; - if (c1 < 0xa1 || c2 < 0xa1) { /* exclude CP932 extension */ - s = 0; + if (c1 < 0xa1 || c2 < 0xa1) { /* exclude CP936 extension */ + s = c; } if (s <= 0) { @@ -259,6 +259,154 @@ mbfl_filt_conv_wchar_cp936(int c, mbfl_convert_filter *filter TSRMLS_DC) return c; } + +/* + * HZ => wchar + */ +int +mbfl_filt_conv_hz_wchar(int c, mbfl_convert_filter *filter TSRMLS_DC) +{ + int c1, s, w; + + switch (filter->status & 0xf) { +/* case 0x00: ASCII */ +/* case 0x10: GB2312 */ + case 0: + if (c == 0x7e) { + filter->status += 2; + } else if (filter->status == 0x10 && c > 0x20 && c < 0x7f) { /* DBCS first char */ + filter->cache = c; /* keep the first byte until the second one arrives */ + filter->status += 1; + } else if (c >= 0 && c < 0x80) { /* latin, CTLs */ + CK((*filter->output_function)(c, filter->data TSRMLS_CC)); + } else { + w = c & MBFL_WCSGROUP_MASK; + w |= MBFL_WCSGROUP_THROUGH; + CK((*filter->output_function)(w, filter->data TSRMLS_CC)); + } + break; + +/* case 0x11: GB2312 second char */ + case 1: + filter->status &= ~0xf; + c1 = filter->cache; + if (c1 > 0x20 && c1 < 0x7f && c > 0x20 && c < 0x7f) { + s = (c1 - 1)*192 + c + 0x40; /* GB2312 */ + if (s >= 0 && s < cp936_ucs_table_size) { + w = cp936_ucs_table[s]; + } else { + w = 0; + } + if (w <= 0) { + w = (c1 << 8) | c; + w &= MBFL_WCSPLANE_MASK; + w |= MBFL_WCSPLANE_GB2312; + } + CK((*filter->output_function)(w, filter->data TSRMLS_CC)); + } else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */ + CK((*filter->output_function)(c, filter->data TSRMLS_CC)); + } else { + w = (c1 << 8) | c; + w &= 
MBFL_WCSGROUP_MASK; + w |= MBFL_WCSGROUP_THROUGH; + CK((*filter->output_function)(w, filter->data TSRMLS_CC)); + } + break; + + /* '~' */ + case 2: + if (c == 0x7d) { /* '}' */ + filter->status = 0x0; + } else if (c == 0x7b) { /* '{' */ + filter->status = 0x10; + } else if (c == 0x7e) { /* '~' */ + filter->status = 0x0; + CK((*filter->output_function)(0x007e, filter->data TSRMLS_CC)); + } /* NOTE(review): any other byte is dropped and the filter stays in state 2; mbfl_filt_ident_hz flags that case as bad - confirm this is intended */ + break; + + default: + filter->status = 0; + break; + } + + return c; +} + +/* + * wchar => HZ + */ +int +mbfl_filt_conv_wchar_hz(int c, mbfl_convert_filter *filter TSRMLS_DC) +{ + int s; + + s = 0; + if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) { + s = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min]; + } else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) { + s = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min]; + } else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) { + s = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min]; + } else if (c >= ucs_i_cp936_table_min && c < ucs_i_cp936_table_max) { + s = ucs_i_cp936_table[c - ucs_i_cp936_table_min]; + } else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) { + s = ucs_hff_cp936_table[c - ucs_hff_cp936_table_min]; + } + if (s >= 0x0080) { + s -= 0x8080; /* drop 0x80 from each byte; the two bytes are emitted below masked with 0x7f */ + } + + if (s <= 0) { + if (c == 0) { + s = 0; + } else if (s <= 0) { + s = -1; + } + } else if ((s >= 0x80 && s < 0x2121) || (s > 0x8080)) { + s = -1; + } + if (s >= 0) { + if (s < 0x80) { /* ASCII */ + if ((filter->status & 0xff00) != 0) { + CK((*filter->output_function)(0x7e, filter->data TSRMLS_CC)); /* '~' */ + CK((*filter->output_function)(0x7d, filter->data TSRMLS_CC)); /* '}' */ + } + filter->status = 0; + if (s == 0x7e){ + CK((*filter->output_function)(0x7e, filter->data TSRMLS_CC)); + } + CK((*filter->output_function)(s, filter->data TSRMLS_CC)); + } else { /* GB 2312-80 */ + if ((filter->status & 0xff00) != 0x200) { + CK((*filter->output_function)(0x7e, filter->data TSRMLS_CC)); /* '~' */ + 
CK((*filter->output_function)(0x7b, filter->data TSRMLS_CC)); /* '{' */ + } + filter->status = 0x200; /* 0x200: the '~' '{' shift has been emitted, output is in double-byte mode */ + CK((*filter->output_function)((s >> 8) & 0x7f, filter->data TSRMLS_CC)); + CK((*filter->output_function)(s & 0x7f, filter->data TSRMLS_CC)); + } + } else { + if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) { + CK(mbfl_filt_conv_illegal_output(c, filter TSRMLS_CC)); + } + } + + return c; +} + +int +mbfl_filt_conv_any_hz_flush(mbfl_convert_filter *filter TSRMLS_DC) +{ + /* back to latin: if the output is still in double-byte mode, close it with '~' '}' */ + if ((filter->status & 0xff00) != 0) { + CK((*filter->output_function)(0x7e, filter->data TSRMLS_CC)); /* '~' */ + CK((*filter->output_function)(0x7d, filter->data TSRMLS_CC)); /* '}' */ + } + filter->status &= 0xff; + return 0; +} + #endif /* HAVE_MBSTR_CN */ /* diff --git a/ext/mbstring/mbfilter_cn.h b/ext/mbstring/mbfilter_cn.h index a71168541b..be25417174 100644 --- a/ext/mbstring/mbfilter_cn.h +++ b/ext/mbstring/mbfilter_cn.h @@ -26,5 +26,8 @@ int mbfl_filt_conv_euccn_wchar(int c, mbfl_convert_filter *filter TSRMLS_DC); int mbfl_filt_conv_wchar_euccn(int c, mbfl_convert_filter *filter TSRMLS_DC); int mbfl_filt_conv_cp936_wchar(int c, mbfl_convert_filter *filter TSRMLS_DC); int mbfl_filt_conv_wchar_cp936(int c, mbfl_convert_filter *filter TSRMLS_DC); +int mbfl_filt_conv_hz_wchar(int c, mbfl_convert_filter *filter TSRMLS_DC); +int mbfl_filt_conv_wchar_hz(int c, mbfl_convert_filter *filter TSRMLS_DC); +int mbfl_filt_conv_any_hz_flush(mbfl_convert_filter *filter TSRMLS_DC); #endif /* MBFL_MBFILTER_CN_H */ diff --git a/ext/mbstring/mbfilter_kr.c b/ext/mbstring/mbfilter_kr.c index 86f4bdca5d..93f44274f0 100644 --- a/ext/mbstring/mbfilter_kr.c +++ b/ext/mbstring/mbfilter_kr.c @@ -141,8 +141,8 @@ mbfl_filt_conv_wchar_euckr(int c, mbfl_convert_filter *filter TSRMLS_DC) c1 = (s >> 8) & 0xff; c2 = s & 0xff; /* exclude UHC extension area */ - if (c1 < 0xa1 || c1 > 0xfe || c2 < 0xa1 && c2 > 0xfe){ - s = 0; + if (c1 < 0xa1 || c2 < 0xa1){ + s = c; } if (s <= 0) {