From: Rui Hirokawa
Date: Wed, 15 May 2002 12:13:56 +0000 (+0000)
Subject: added ISO-2022-KR support in mbstring.
X-Git-Tag: php-4.3.0dev-ZendEngine2-Preview1~31
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=5f8009a7eae813e6b1c621ada9ed47ea170117e5;p=php

added ISO-2022-KR support in mbstring.
---

diff --git a/ext/mbstring/mbfilter.c b/ext/mbstring/mbfilter.c
index 0968b44fc9..678dc38d29 100644
--- a/ext/mbstring/mbfilter.c
+++ b/ext/mbstring/mbfilter.c
@@ -788,6 +788,15 @@ static mbfl_encoding mbfl_encoding_uhc = {
 	MBFL_ENCTYPE_MBCS
 };
 
+static mbfl_encoding mbfl_encoding_2022kr = {
+	mbfl_no_encoding_2022kr,
+	"ISO-2022-KR",
+	"ISO-2022-KR",
+	NULL,
+	NULL,
+	MBFL_ENCTYPE_MBCS | MBFL_ENCTYPE_SHFTCODE
+};
+
 #endif /* HAVE_MBSTR_KR */
 
 static const char *mbfl_encoding_cp1252_aliases[] = {"cp1252", NULL};
@@ -1007,6 +1016,7 @@ static mbfl_encoding *mbfl_encoding_ptr_list[] = {
 #if defined(HAVE_MBSTR_KR)
 	&mbfl_encoding_euc_kr,
 	&mbfl_encoding_uhc,
+	&mbfl_encoding_2022kr,
 #endif
 	NULL
 };
@@ -1115,6 +1125,7 @@ static int mbfl_filt_ident_big5(int c, mbfl_identify_filter *filter TSRMLS_DC);
 #if defined(HAVE_MBSTR_KR)
 static int mbfl_filt_ident_euckr(int c, mbfl_identify_filter *filter TSRMLS_DC);
 static int mbfl_filt_ident_uhc(int c, mbfl_identify_filter *filter TSRMLS_DC);
+static int mbfl_filt_ident_2022kr(int c, mbfl_identify_filter *filter TSRMLS_DC);
 #endif /* HAVE_MBSTR_KR */
 
 static int mbfl_filt_ident_cp1252(int c, mbfl_identify_filter *filter TSRMLS_DC);
@@ -1723,6 +1734,23 @@ static struct mbfl_convert_vtbl vtbl_wchar_uhc = {
 	mbfl_filt_conv_common_dtor,
 	mbfl_filt_conv_wchar_uhc,
 	mbfl_filt_conv_common_flush };
+
+static struct mbfl_convert_vtbl vtbl_wchar_2022kr = {
+	mbfl_no_encoding_wchar,
+	mbfl_no_encoding_2022kr,
+	mbfl_filt_conv_common_ctor,
+	mbfl_filt_conv_common_dtor,
+	mbfl_filt_conv_wchar_2022kr,
+	mbfl_filt_conv_any_2022kr_flush };
+
+static struct mbfl_convert_vtbl vtbl_2022kr_wchar = {
+	mbfl_no_encoding_2022kr,
+	mbfl_no_encoding_wchar,
+	mbfl_filt_conv_common_ctor,
+	mbfl_filt_conv_common_dtor,
+	mbfl_filt_conv_2022kr_wchar,
+	mbfl_filt_conv_common_flush };
+
 #endif /* HAVE_MBSTR_KR */
 
 static struct mbfl_convert_vtbl vtbl_cp1252_wchar = {
@@ -1987,6 +2015,8 @@ static struct mbfl_convert_vtbl *mbfl_convert_filter_list[] = {
 	&vtbl_wchar_euckr,
 	&vtbl_uhc_wchar,
 	&vtbl_wchar_uhc,
+	&vtbl_2022kr_wchar,
+	&vtbl_wchar_2022kr,
 #endif
 	&vtbl_cp1252_wchar,
 	&vtbl_wchar_cp1252,
@@ -2170,6 +2200,13 @@ static struct mbfl_identify_vtbl vtbl_identify_uhc = {
 	mbfl_filt_ident_common_ctor,
 	mbfl_filt_ident_common_dtor,
 	mbfl_filt_ident_uhc };
+
+static struct mbfl_identify_vtbl vtbl_identify_2022kr = {
+	mbfl_no_encoding_2022kr,
+	mbfl_filt_ident_common_ctor,
+	mbfl_filt_ident_common_dtor,
+	mbfl_filt_ident_2022kr };
+
 #endif /* HAVE_MBSTR_KR */
 
 static struct mbfl_identify_vtbl vtbl_identify_cp1252 = {
@@ -2286,6 +2323,7 @@ static struct mbfl_identify_vtbl *mbfl_identify_filter_list[] = {
 #if defined(HAVE_MBSTR_KR)
 	&vtbl_identify_euckr,
 	&vtbl_identify_uhc,
+	&vtbl_identify_2022kr,
 #endif
 	&vtbl_identify_cp1252,
 	&vtbl_identify_8859_1,
@@ -6011,6 +6049,86 @@ mbfl_filt_ident_uhc(int c, mbfl_identify_filter *filter TSRMLS_DC)
 	return c;
 }
 
+/*
+ * Identify filter for ISO-2022-KR; called once per input byte.
+ *
+ * The low nibble of filter->status tracks a pending sequence
+ * (1: second byte of a pair, 2: after ESC, 3: after ESC '$',
+ * 4: after ESC '$' ')').  Bit 0x10 is set once the complete designator
+ * ESC '$' ')' 'C' has been read.  filter->flag is set to 1 when a byte
+ * cannot belong to ISO-2022-KR.  Returns c.
+ */
+static int
+mbfl_filt_ident_2022kr(int c, mbfl_identify_filter *filter TSRMLS_DC)
+{
+retry:
+	switch (filter->status & 0xf) {
+/*	case 0x00:	 ASCII */
+/*	case 0x10:	 KSC5601 mode */
+/*	case 0x20:	 KSC5601 DBCS */
+/*	case 0x40:	 KSC5601 SBCS */
+	case 0:
+		if (!(filter->status & 0x10)) {
+			if (c == 0x1b)
+				filter->status += 2;
+		} else if (filter->status == 0x20 && c > 0x20 && c < 0x7f) {		/* KSC5601 first char */
+			filter->status += 1;
+		} else if (c >= 0 && c < 0x80) {		/* latin, CTLs */
+			;
+		} else {
+			filter->flag = 1;	/* bad */
+		}
+		break;
+
+/*	case 0x21:	 KSC5601 second char */
+	case 1:
+		filter->status &= ~0xf;
+		if (c < 0x21 || c > 0x7e) {		/* bad */
+			filter->flag = 1;
+		}
+		break;
+
+	/* ESC */
+	case 2:
+		if (c == 0x24) {		/* '$' */
+			filter->status++;
+		} else {
+			filter->flag = 1;	/* bad */
+			filter->status &= ~0xf;
+			goto retry;
+		}
+		break;
+
+	/* ESC $ */
+	case 3:
+		if (c == 0x29) {		/* ')' */
+			filter->status++;
+		} else {
+			filter->flag = 1;	/* bad */
+			filter->status &= ~0xf;
+			goto retry;
+		}
+		break;
+
+	/* ESC $) */
+	case 4:		/* reached from state 3 via status++ */
+		if (c == 0x43) {		/* 'C' */
+			filter->status = 0x10;
+		} else {
+			filter->flag = 1;	/* bad */
+			filter->status &= ~0xf;
+			goto retry;
+		}
+		break;
+
+	default:
+		filter->status = 0;
+		break;
+	}
+
+	return c;
+}
+
 #endif /* HAVE_MBSTR_KR */
 
 
diff --git a/ext/mbstring/mbfilter_cn.c b/ext/mbstring/mbfilter_cn.c
index 9dcdd7dfdb..6feab182dd 100644
--- a/ext/mbstring/mbfilter_cn.c
+++ b/ext/mbstring/mbfilter_cn.c
@@ -353,7 +353,7 @@ mbfl_filt_conv_wchar_hz(int c, mbfl_convert_filter *filter TSRMLS_DC)
 	} else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) {
 		s = ucs_hff_cp936_table[c - ucs_hff_cp936_table_min];
 	}
-	if (s >= 0x0080) {
+	if (s & 0x8000) {
 		s -= 0x8080;
 	}
 
diff --git a/ext/mbstring/mbfilter_kr.c b/ext/mbstring/mbfilter_kr.c
index 93f44274f0..da4157d571 100644
--- a/ext/mbstring/mbfilter_kr.c
+++ b/ext/mbstring/mbfilter_kr.c
@@ -172,6 +172,215 @@ mbfl_filt_conv_wchar_euckr(int c, mbfl_convert_filter *filter TSRMLS_DC)
 	return c;
 }
 
+
+/*
+ * ISO-2022-KR => wchar
+ *
+ * filter->status: the low nibble is the parser state (0: idle,
+ * 1: second byte of a double-byte character, 2-4: inside the
+ * ESC '$' ')' 'C' designator), bit 0x10 is set by SO and cleared by SI,
+ * and bit 0x100 is set once the designator has been read.  The lead
+ * byte of a double-byte character is held in filter->cache.
+ */
+int
+mbfl_filt_conv_2022kr_wchar(int c, mbfl_convert_filter *filter TSRMLS_DC)
+{
+	int c1, w, flag;
+
+retry:
+	switch (filter->status & 0xf) {
+		/* case 0x00: ASCII */
+		/* case 0x10: KSC5601 */
+	case 0:
+		if (c == 0x1b) { /* ESC */
+			filter->status += 2;
+		} else if (c == 0x0f) { /* SI (ASCII) */
+			filter->status &= ~0xff;
+		} else if (c == 0x0e) { /* SO (KSC5601) */
+			filter->status |= 0x10;
+		} else if ((filter->status & 0x10) != 0 && c > 0x20 && c < 0x7f) {
+			/* KSC5601 lead byte */
+			filter->cache = c;
+			filter->status += 1;
+		} else if ((filter->status & 0x10) == 0 && c >= 0 && c < 0x80) {
+			/* latin, CTLs */
+			CK((*filter->output_function)(c, filter->data TSRMLS_CC));
+		} else {
+			w = c & MBFL_WCSGROUP_MASK;
+			w |= MBFL_WCSGROUP_THROUGH;
+			CK((*filter->output_function)(w, filter->data TSRMLS_CC));
+		}
+		break;
+
+	case 1:		/* dbcs second byte */
+		filter->status &= ~0xf;
+		c1 = filter->cache;
+		flag = 0;
+		if (c1 > 0x20 && c1 < 0x47) {
+			flag = 1;
+		} else if (c1 >= 0x47 && c1 <= 0x7e && c1 != 0x49) {
+			flag = 2;
+		}
+		if (flag > 0 && c > 0x20 && c < 0x7f) {
+			if (flag == 1){
+				w = (c1 - 0x21)*178 + (c - 0x21) + 0x54;
+				if (w >= 0 && w < uhc2_ucs_table_size) {
+					w = uhc2_ucs_table[w];
+				} else {
+					w = 0;
+				}
+			} else {
+				if (c1 < 0x49){
+					w = (c1 - 0x47)*94 + c - 0x21;
+				} else {
+					w = (c1 - 0x48)*94 + c - 0x21;
+				}
+				if (w >= 0 && w < uhc3_ucs_table_size) {
+					w = uhc3_ucs_table[w];
+				} else {
+					w = 0;
+				}
+			}
+
+			if (w <= 0) {
+				w = (c1 << 8) | c;
+				w &= MBFL_WCSPLANE_MASK;
+				w |= MBFL_WCSPLANE_KSC5601;
+			}
+			CK((*filter->output_function)(w, filter->data TSRMLS_CC));
+		} else if (c == 0x1b) {	 /* ESC */
+			filter->status += 2;	/* low nibble was cleared above; 2 is the ESC state */
+		} else if ((c >= 0 && c < 0x21) || c == 0x7f) {		/* CTLs */
+			CK((*filter->output_function)(c, filter->data TSRMLS_CC));
+		} else {
+			w = (c1 << 8) | c;
+			w &= MBFL_WCSGROUP_MASK;
+			w |= MBFL_WCSGROUP_THROUGH;
+			CK((*filter->output_function)(w, filter->data TSRMLS_CC));
+		}
+		break;
+
+	case 2: /* ESC */
+		if (c == 0x24) { /* '$' */
+			filter->status++;
+		} else {
+			filter->status &= ~0xf;
+			CK((*filter->output_function)(0x1b, filter->data TSRMLS_CC));
+			goto retry;
+		}
+		break;
+	case 3: /* ESC $ */
+		if (c == 0x29) { /* ')' */
+			filter->status++;
+		} else {
+			filter->status &= ~0xf;
+			CK((*filter->output_function)(0x1b, filter->data TSRMLS_CC));
+			CK((*filter->output_function)(0x24, filter->data TSRMLS_CC));
+			goto retry;
+		}
+		break;
+	case 4: /* ESC $ ) */
+		if (c == 0x43) { /* 'C' */
+			filter->status &= ~0xf;
+			filter->status |= 0x100;
+		} else {
+			filter->status &= ~0xf;
+			CK((*filter->output_function)(0x1b, filter->data TSRMLS_CC));
+			CK((*filter->output_function)(0x24, filter->data TSRMLS_CC));
+			CK((*filter->output_function)(0x29, filter->data TSRMLS_CC));
+			goto retry;
+		}
+		break;
+	default:
+		filter->status = 0;
+		break;
+	}
+
+	return c;
+}
+
+/*
+ * wchar => ISO-2022-KR
+ *
+ * filter->status: bit 0x10 means SO is in effect, bit 0x100 means the
+ * ESC '$' ')' 'C' designator has already been written.  The designator
+ * is written before the first SO.
+ */
+int
+mbfl_filt_conv_wchar_2022kr(int c, mbfl_convert_filter *filter TSRMLS_DC)
+{
+	int c1, c2, s;
+
+	s = 0;
+
+	if (c >= ucs_a1_uhc_table_min && c < ucs_a1_uhc_table_max) {
+		s = ucs_a1_uhc_table[c - ucs_a1_uhc_table_min];
+	} else if (c >= ucs_a2_uhc_table_min && c < ucs_a2_uhc_table_max) {
+		s = ucs_a2_uhc_table[c - ucs_a2_uhc_table_min];
+	} else if (c >= ucs_a3_uhc_table_min && c < ucs_a3_uhc_table_max) {
+		s = ucs_a3_uhc_table[c - ucs_a3_uhc_table_min];
+	} else if (c >= ucs_i_uhc_table_min && c < ucs_i_uhc_table_max) {
+		s = ucs_i_uhc_table[c - ucs_i_uhc_table_min];
+	} else if (c >= ucs_r1_uhc_table_min && c < ucs_r1_uhc_table_max) {
+		s = ucs_r1_uhc_table[c - ucs_r1_uhc_table_min];
+	} else if (c >= ucs_r2_uhc_table_min && c < ucs_r2_uhc_table_max) {
+		s = ucs_r2_uhc_table[c - ucs_r2_uhc_table_min];
+	}
+
+	c1 = (s >> 8) & 0xff;
+	c2 = s & 0xff;
+	/* exclude UHC extension area; only ASCII may pass through unconverted */
+	if (c1 < 0xa1 || c2 < 0xa1){
+		s = (c >= 0 && c < 0x80) ? c : 0;
+	}
+	if (s & 0x8000) {
+		s -= 0x8080;
+	}
+
+	if (s <= 0) {
+		c1 = c & ~MBFL_WCSPLANE_MASK;
+		if (c1 == MBFL_WCSPLANE_KSC5601) {
+			s = c & MBFL_WCSPLANE_MASK;
+		}
+		if (c == 0) {
+			s = 0;
+		} else if (s <= 0) {
+			s = -1;
+		}
+	} else if ((s >= 0x80 && s < 0x2121) || (s > 0x8080)) {
+		s = -1;
+	}
+	if (s >= 0) {
+		if (s < 0x80) { /* ASCII */
+			if ((filter->status & 0x10) != 0) {
+				CK((*filter->output_function)(0x0f, filter->data TSRMLS_CC));		/* SI */
+				filter->status &= ~0x10;
+			}
+			CK((*filter->output_function)(s, filter->data TSRMLS_CC));
+		} else {
+			if ( (filter->status & 0x100) == 0) {
+				CK((*filter->output_function)(0x1b, filter->data TSRMLS_CC));		/* ESC */
+				CK((*filter->output_function)(0x24, filter->data TSRMLS_CC));		/* '$' */
+				CK((*filter->output_function)(0x29, filter->data TSRMLS_CC));		/* ')' */
+				CK((*filter->output_function)(0x43, filter->data TSRMLS_CC));		/* 'C' */
+				filter->status |= 0x100;
+			}
+			if ((filter->status & 0x10) == 0) {
+				CK((*filter->output_function)(0x0e, filter->data TSRMLS_CC));		/* SO */
+				filter->status |= 0x10;
+			}
+			CK((*filter->output_function)((s >> 8) & 0xff, filter->data TSRMLS_CC));
+			CK((*filter->output_function)(s & 0xff, filter->data TSRMLS_CC));
+		}
+	} else {
+		if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
+			CK(mbfl_filt_conv_illegal_output(c, filter TSRMLS_CC));
+		}
+	}
+
+	return c;
+}
+
 /*
  * UHC => wchar
  */
@@ -314,6 +523,21 @@ mbfl_filt_conv_wchar_uhc(int c, mbfl_convert_filter *filter TSRMLS_DC)
 	return c;
 }
 
+/*
+ * Flush for wchar => ISO-2022-KR: writes SI if SO is still in effect,
+ * then clears the shift and designator flags.
+ */
+int
+mbfl_filt_conv_any_2022kr_flush(mbfl_convert_filter *filter TSRMLS_DC)
+{
+	/* back to ascii */
+	if ((filter->status & 0x10) != 0) {
+		CK((*filter->output_function)(0x0f, filter->data TSRMLS_CC));		/* SI */
+	}
+	filter->status &= 0xef;
+	return 0;
+}
+
 #endif /* HAVE_MBSTR_KR */
 
 /*
diff --git a/ext/mbstring/mbfilter_kr.h b/ext/mbstring/mbfilter_kr.h
index 22b7f5ed4b..34d0d54309 100644
--- a/ext/mbstring/mbfilter_kr.h
+++ b/ext/mbstring/mbfilter_kr.h
@@ -26,5 +26,8 @@ int mbfl_filt_conv_euckr_wchar(int c, mbfl_convert_filter *filter TSRMLS_DC);
 int mbfl_filt_conv_wchar_euckr(int c, mbfl_convert_filter *filter TSRMLS_DC);
 int mbfl_filt_conv_uhc_wchar(int c, mbfl_convert_filter *filter TSRMLS_DC);
 int mbfl_filt_conv_wchar_uhc(int c, mbfl_convert_filter *filter TSRMLS_DC);
+int mbfl_filt_conv_2022kr_wchar(int c, mbfl_convert_filter *filter TSRMLS_DC);
+int mbfl_filt_conv_wchar_2022kr(int c, mbfl_convert_filter *filter TSRMLS_DC);
+int mbfl_filt_conv_any_2022kr_flush(mbfl_convert_filter *filter TSRMLS_DC);
 
 #endif /* MBFL_MBFILTER_KR_H */
diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
index 5b40ff3f89..f5c1e02ebb 100644
--- a/ext/mbstring/mbstring.c
+++ b/ext/mbstring/mbstring.c
@@ -65,7 +65,7 @@
 #include "php_content_types.h"
 #include "SAPI.h"
 
-#ifdef ZEND_MULTIBYTE
+#if ZEND_MULTIBYTE
 #include "zend_multibyte.h"
 #endif /* ZEND_MULTIBYTE */
 