From 4974e6073e9dda19b7826bc5c390811380d771a5 Mon Sep 17 00:00:00 2001
From: "Den V. Tsopa"
Date: Tue, 21 May 2002 07:00:34 +0000
Subject: [PATCH] Added russian codepages (koi8-r,cp1251,cp866) support.

---
 ext/mbstring/config.m4          |   6 +-
 ext/mbstring/mbfilter.c         | 262 ++++++++++++++++++++------------
 ext/mbstring/mbfilter.h         |   7 +
 ext/mbstring/mbfilter_ru.c      | 213 ++++++++++++++++++++++++++
 ext/mbstring/mbfilter_ru.h      |  11 ++
 ext/mbstring/mbstring.c         |  22 +--
 ext/mbstring/unicode_table_ru.h |  69 +++++++++
 7 files changed, 482 insertions(+), 108 deletions(-)
 create mode 100644 ext/mbstring/mbfilter_ru.c
 create mode 100644 ext/mbstring/mbfilter_ru.h
 create mode 100644 ext/mbstring/unicode_table_ru.h

diff --git a/ext/mbstring/config.m4 b/ext/mbstring/config.m4
index f250aac3e5..4ffc3c6505 100644
--- a/ext/mbstring/config.m4
+++ b/ext/mbstring/config.m4
@@ -26,13 +26,17 @@ if test "$PHP_MBSTRING" != "no"; then
   if test "$PHP_MBSTRING" = "kr"; then
     AC_DEFINE(HAVE_MBSTR_KR,1,[whether to have korean support])
   fi
+  if test "$PHP_MBSTRING" = "ru"; then
+    AC_DEFINE(HAVE_MBSTR_RU,1,[whether to have russian support])
+  fi
   if test "$PHP_MBSTRING" = "all"; then
     AC_DEFINE(HAVE_MBSTR_JA,1,[whether to have japanese support])
     AC_DEFINE(HAVE_MBSTR_CN,1,[whether to have simplified chinese support])
     AC_DEFINE(HAVE_MBSTR_TW,1,[whether to have traditional chinese support])
     AC_DEFINE(HAVE_MBSTR_KR,1,[whether to have korean support])
+    AC_DEFINE(HAVE_MBSTR_RU,1,[whether to have russian support])
   fi
-  PHP_NEW_EXTENSION(mbstring, mbfilter_ja.c mbfilter_cn.c mbfilter_tw.c mbfilter_kr.c mbfilter.c mbstring.c mbregex.c php_mbregex.c, $ext_shared)
+  PHP_NEW_EXTENSION(mbstring, mbfilter_ja.c mbfilter_cn.c mbfilter_tw.c mbfilter_kr.c mbfilter_ru.c mbfilter.c mbstring.c mbregex.c php_mbregex.c, $ext_shared)
 else
   PHP_MBSTR_ENC_TRANS=no
 fi
diff --git a/ext/mbstring/mbfilter.c b/ext/mbstring/mbfilter.c
index 678dc38d29..9204f1e838 100644
--- a/ext/mbstring/mbfilter.c
+++ b/ext/mbstring/mbfilter.c
@@ -104,6 +104,9 @@
 #if defined(HAVE_MBSTR_KR)
 #include "mbfilter_kr.h"
 #endif
+#if defined(HAVE_MBSTR_RU)
+#include "mbfilter_ru.h"
+#endif
 
 #include "zend.h"
 
@@ -185,6 +188,16 @@ static mbfl_language mbfl_language_traditional_chinese = {
 	mbfl_no_encoding_7bit
 };
 
+static mbfl_language mbfl_language_russian = {
+	mbfl_no_language_russian,
+	"Russian",
+	"ru",
+	NULL,
+	mbfl_no_encoding_koi8r,
+	mbfl_no_encoding_qprint,
+	mbfl_no_encoding_8bit
+};
+
 static mbfl_language *mbfl_language_ptr_table[] = {
 	&mbfl_language_uni,
 	&mbfl_language_japanese,
@@ -192,6 +205,7 @@ static mbfl_language *mbfl_language_ptr_table[] = {
 	&mbfl_language_simplified_chinese,
 	&mbfl_language_traditional_chinese,
 	&mbfl_language_english,
+	&mbfl_language_russian,
 	NULL
 };
 
@@ -788,15 +802,6 @@ static mbfl_encoding mbfl_encoding_uhc = {
 	MBFL_ENCTYPE_MBCS
 };
 
-static mbfl_encoding mbfl_encoding_2022kr = {
-	mbfl_no_encoding_2022kr,
-	"ISO-2022-KR",
-	"ISO-2022-KR",
-	NULL,
-	NULL,
-	MBFL_ENCTYPE_MBCS | MBFL_ENCTYPE_SHFTCODE
-};
-
 #endif /* HAVE_MBSTR_KR */
 
 static const char *mbfl_encoding_cp1252_aliases[] = {"cp1252", NULL};
@@ -953,6 +958,41 @@ static mbfl_encoding mbfl_encoding_8859_15 = {
 	MBFL_ENCTYPE_SBCS
 };
 
+#if defined(HAVE_MBSTR_RU)
+static const char *mbfl_encoding_cp1251_aliases[] = {"CP1251", "CP-1251", "WINDOWS-1251", NULL};
+
+static mbfl_encoding mbfl_encoding_cp1251 = {
+	mbfl_no_encoding_cp1251,
+	"Windows-1251",
+	"Windows-1251",
+	&mbfl_encoding_cp1251_aliases,
+	NULL,
+	MBFL_ENCTYPE_SBCS
+};
+
+static const char *mbfl_encoding_cp866_aliases[] = {"CP866", "CP-866", "IBM-866", NULL};
+
+static mbfl_encoding mbfl_encoding_cp866 = {
+	mbfl_no_encoding_cp866,
+	"CP866",
+	"CP866",
+	&mbfl_encoding_cp866_aliases,
+	NULL,
+	MBFL_ENCTYPE_SBCS
+};
+
+static const char *mbfl_encoding_koi8r_aliases[] = {"KOI8-R", "KOI8R", NULL};
+
+static mbfl_encoding mbfl_encoding_koi8r = {
+	mbfl_no_encoding_koi8r,
+	"KOI8-R",
+	"KOI8-R",
+	&mbfl_encoding_koi8r_aliases,
+	NULL,
+	MBFL_ENCTYPE_SBCS
+};
+#endif
+
 static mbfl_encoding *mbfl_encoding_ptr_list[] = {
 	&mbfl_encoding_pass,
 	&mbfl_encoding_auto,
@@ -1016,7 +1056,11 @@ static mbfl_encoding *mbfl_encoding_ptr_list[] = {
 #if defined(HAVE_MBSTR_KR)
 	&mbfl_encoding_euc_kr,
 	&mbfl_encoding_uhc,
-	&mbfl_encoding_2022kr,
+#endif
+#if defined(HAVE_MBSTR_RU)
+	&mbfl_encoding_cp1251,
+	&mbfl_encoding_cp866,
+	&mbfl_encoding_koi8r,
 #endif
 	NULL
 };
@@ -1125,9 +1169,14 @@ static int mbfl_filt_ident_big5(int c, mbfl_identify_filter *filter TSRMLS_DC);
 #if defined(HAVE_MBSTR_KR)
 static int mbfl_filt_ident_euckr(int c, mbfl_identify_filter *filter TSRMLS_DC);
 static int mbfl_filt_ident_uhc(int c, mbfl_identify_filter *filter TSRMLS_DC);
-static int mbfl_filt_ident_2022kr(int c, mbfl_identify_filter *filter TSRMLS_DC);
 #endif /* HAVE_MBSTR_KR */
 
+#if defined(HAVE_MBSTR_RU)
+static int mbfl_filt_ident_cp1251(int c, mbfl_identify_filter *filter TSRMLS_DC);
+static int mbfl_filt_ident_cp866(int c, mbfl_identify_filter *filter TSRMLS_DC);
+static int mbfl_filt_ident_koi8r(int c, mbfl_identify_filter *filter TSRMLS_DC);
+#endif /* HAVE_MBSTR_RU */
+
 static int mbfl_filt_ident_cp1252(int c, mbfl_identify_filter *filter TSRMLS_DC);
 static int mbfl_filt_ident_false(int c, mbfl_identify_filter *filter TSRMLS_DC);
 static int mbfl_filt_ident_true(int c, mbfl_identify_filter *filter TSRMLS_DC);
@@ -1734,24 +1783,58 @@ static struct mbfl_convert_vtbl vtbl_wchar_uhc = {
 	mbfl_filt_conv_common_dtor,
 	mbfl_filt_conv_wchar_uhc,
 	mbfl_filt_conv_common_flush };
+#endif /* HAVE_MBSTR_KR */
 
-static struct mbfl_convert_vtbl vtbl_wchar_2022kr = {
+#if defined(HAVE_MBSTR_RU)
+static struct mbfl_convert_vtbl vtbl_wchar_cp1251 = {
 	mbfl_no_encoding_wchar,
-	mbfl_no_encoding_2022kr,
+	mbfl_no_encoding_cp1251,
 	mbfl_filt_conv_common_ctor,
 	mbfl_filt_conv_common_dtor,
-	mbfl_filt_conv_wchar_2022kr,
-	mbfl_filt_conv_any_2022kr_flush };
+	mbfl_filt_conv_wchar_cp1251,
+	mbfl_filt_conv_common_flush };
 
-static struct mbfl_convert_vtbl vtbl_2022kr_wchar = {
-	mbfl_no_encoding_2022kr,
+static struct mbfl_convert_vtbl vtbl_cp1251_wchar = {
+	mbfl_no_encoding_cp1251,
 	mbfl_no_encoding_wchar,
 	mbfl_filt_conv_common_ctor,
 	mbfl_filt_conv_common_dtor,
-	mbfl_filt_conv_2022kr_wchar,
+	mbfl_filt_conv_cp1251_wchar,
 	mbfl_filt_conv_common_flush };
 
-#endif /* HAVE_MBSTR_KR */
+static struct mbfl_convert_vtbl vtbl_wchar_cp866 = {
+	mbfl_no_encoding_wchar,
+	mbfl_no_encoding_cp866,
+	mbfl_filt_conv_common_ctor,
+	mbfl_filt_conv_common_dtor,
+	mbfl_filt_conv_wchar_cp866,
+	mbfl_filt_conv_common_flush };
+
+static struct mbfl_convert_vtbl vtbl_cp866_wchar = {
+	mbfl_no_encoding_cp866,
+	mbfl_no_encoding_wchar,
+	mbfl_filt_conv_common_ctor,
+	mbfl_filt_conv_common_dtor,
+	mbfl_filt_conv_cp866_wchar,
+	mbfl_filt_conv_common_flush };
+
+static struct mbfl_convert_vtbl vtbl_wchar_koi8r = {
+	mbfl_no_encoding_wchar,
+	mbfl_no_encoding_koi8r,
+	mbfl_filt_conv_common_ctor,
+	mbfl_filt_conv_common_dtor,
+	mbfl_filt_conv_wchar_koi8r,
+	mbfl_filt_conv_common_flush };
+
+
+static struct mbfl_convert_vtbl vtbl_koi8r_wchar = {
+	mbfl_no_encoding_koi8r,
+	mbfl_no_encoding_wchar,
+	mbfl_filt_conv_common_ctor,
+	mbfl_filt_conv_common_dtor,
+	mbfl_filt_conv_koi8r_wchar,
+	mbfl_filt_conv_common_flush };
+#endif /* HAVE_MBSTR_RU */
 
 static struct mbfl_convert_vtbl vtbl_cp1252_wchar = {
 	mbfl_no_encoding_cp1252,
@@ -2015,8 +2098,14 @@ static struct mbfl_convert_vtbl *mbfl_convert_filter_list[] = {
 	&vtbl_wchar_euckr,
 	&vtbl_uhc_wchar,
 	&vtbl_wchar_uhc,
-	&vtbl_2022kr_wchar,
-	&vtbl_wchar_2022kr,
+#endif
+#if defined(HAVE_MBSTR_RU)
+	&vtbl_cp1251_wchar,
+	&vtbl_wchar_cp1251,
+	&vtbl_cp866_wchar,
+	&vtbl_wchar_cp866,
+	&vtbl_koi8r_wchar,
+	&vtbl_wchar_koi8r,
 #endif
 	&vtbl_cp1252_wchar,
 	&vtbl_wchar_cp1252,
@@ -2200,14 +2289,27 @@ static struct mbfl_identify_vtbl vtbl_identify_uhc = {
 	mbfl_filt_ident_common_ctor,
 	mbfl_filt_ident_common_dtor,
 	mbfl_filt_ident_uhc };
+#endif /* HAVE_MBSTR_KR */
 
-static struct mbfl_identify_vtbl vtbl_identify_2022kr = {
-	mbfl_no_encoding_2022kr,
+#if defined(HAVE_MBSTR_RU)
+static struct mbfl_identify_vtbl vtbl_identify_cp1251 = {
+	mbfl_no_encoding_cp1251,
 	mbfl_filt_ident_common_ctor,
 	mbfl_filt_ident_common_dtor,
-	mbfl_filt_ident_2022kr };
+	mbfl_filt_ident_cp1251 };
 
-#endif /* HAVE_MBSTR_KR */
+static struct mbfl_identify_vtbl vtbl_identify_cp866 = {
+	mbfl_no_encoding_cp866,
+	mbfl_filt_ident_common_ctor,
+	mbfl_filt_ident_common_dtor,
+	mbfl_filt_ident_cp866 };
+
+static struct mbfl_identify_vtbl vtbl_identify_koi8r = {
+	mbfl_no_encoding_koi8r,
+	mbfl_filt_ident_common_ctor,
+	mbfl_filt_ident_common_dtor,
+	mbfl_filt_ident_koi8r };
+#endif /* HAVE_MBSTR_RU */
 
 static struct mbfl_identify_vtbl vtbl_identify_cp1252 = {
 	mbfl_no_encoding_cp1252,
@@ -2323,7 +2425,11 @@ static struct mbfl_identify_vtbl *mbfl_identify_filter_list[] = {
 #if defined(HAVE_MBSTR_KR)
 	&vtbl_identify_euckr,
 	&vtbl_identify_uhc,
-	&vtbl_identify_2022kr,
+#endif
+#if defined(HAVE_MBSTR_RU)
+	&vtbl_identify_cp1251,
+	&vtbl_identify_cp866,
+	&vtbl_identify_koi8r,
 #endif
 	&vtbl_identify_cp1252,
 	&vtbl_identify_8859_1,
@@ -6049,77 +6155,6 @@ mbfl_filt_ident_uhc(int c, mbfl_identify_filter *filter TSRMLS_DC)
 	return c;
 }
 
-static int
-mbfl_filt_ident_2022kr(int c, mbfl_identify_filter *filter TSRMLS_DC)
-{
-retry:
-	switch (filter->status & 0xf) {
-/*	case 0x00:	 ASCII */
-/*	case 0x10:	 KSC5601 mode */
-/*	case 0x20:	 KSC5601 DBCS */
-/*	case 0x40:	 KSC5601 SBCS */
-	case 0:
-		if (!(filter->status & 0x10)) {
-			if (c == 0x1b)
-				filter->status += 2;
-		} else if (filter->status == 0x20 && c > 0x20 && c < 0x7f) {		/* kanji first char */
-			filter->status += 1;
-		} else if (c >= 0 && c < 0x80) {		/* latin, CTLs */
-			;
-		} else {
-			filter->flag = 1;	/* bad */
-		}
-		break;
-
-/*	case 0x21:	 KSC5601 second char */
-	case 1:
-		filter->status &= ~0xf;
-		if (c < 0x21 || c > 0x7e) {		/* bad */
-			filter->flag = 1;
-		}
-		break;
-
-	/* ESC */
-	case 2:
-		if (c == 0x24) {		/* '$' */
-			filter->status++;
-		} else {
-			filter->flag = 1;	/* bad */
-			filter->status &= ~0xf;
-			goto retry;
-		}
-		break;
-
-	/* ESC $ */
-	case 3:
-		if (c == 0x29) {		/* ')' */
-			filter->status++;
-		} else {
-			filter->flag = 1;	/* bad */
-			filter->status &= ~0xf;
-			goto retry;
-		}
-		break;
-
-	/* ESC $) */
-	case 5:
-		if (c == 0x43) {		/* 'C' */
-			filter->status = 0x10;
-		} else {
-			filter->flag = 1;	/* bad */
-			filter->status &= ~0xf;
-			goto retry;
-		}
-		break;
-
-	default:
-		filter->status = 0;
-		break;
-	}
-
-	return c;
-}
-
 #endif /* HAVE_MBSTR_KR */
 
 
@@ -6139,6 +6174,39 @@ mbfl_filt_ident_cp1252(int c, mbfl_identify_filter *filter TSRMLS_DC)
 	return c;
 }
 
+#if defined(HAVE_MBSTR_RU)
+/* identify filters for cp1251, cp866 and koi8-r: flag is cleared for 0x80..0xfe, set otherwise */
+static int
+mbfl_filt_ident_cp1251(int c, mbfl_identify_filter *filter TSRMLS_DC)
+{
+	if (c >= 0x80 && c < 0xff)
+		filter->flag = 0;
+	else
+		filter->flag = 1;	/* not it */
+	return c;
+}
+
+static int
+mbfl_filt_ident_cp866(int c, mbfl_identify_filter *filter TSRMLS_DC)
+{
+	if (c >= 0x80 && c < 0xff)
+		filter->flag = 0;
+	else
+		filter->flag = 1;	/* not it */
+	return c;
+}
+
+static int
+mbfl_filt_ident_koi8r(int c, mbfl_identify_filter *filter TSRMLS_DC)
+{
+	if (c >= 0x80 && c < 0xff)
+		filter->flag = 0;
+	else
+		filter->flag = 1;	/* not it */
+	return c;
+}
+#endif /* HAVE_MBSTR_RU */
+
 static int
 mbfl_filt_ident_2022jp(int c, mbfl_identify_filter *filter TSRMLS_DC)
 {
diff --git a/ext/mbstring/mbfilter.h b/ext/mbstring/mbfilter.h
index c63e6273ec..1b18f78a6b 100644
--- a/ext/mbstring/mbfilter.h
+++ b/ext/mbstring/mbfilter.h
@@ -113,6 +113,7 @@ enum mbfl_no_language {
 	mbfl_no_language_swedish,		/* sv */
 	mbfl_no_language_simplified_chinese,		/* zh-cn */
 	mbfl_no_language_traditional_chinese,		/* zh-tw */
+	mbfl_no_language_russian,		/* ru */
 	mbfl_no_language_max
 };
 
@@ -176,6 +177,9 @@ enum mbfl_no_encoding {
 	mbfl_no_encoding_2022kr,
 	mbfl_no_encoding_uhc,
 	mbfl_no_encoding_hz,
+	mbfl_no_encoding_cp1251,
+	mbfl_no_encoding_cp866,
+	mbfl_no_encoding_koi8r,
 	mbfl_no_encoding_charset_max
 };
 
@@ -246,6 +250,9 @@ typedef struct _mbfl_encoding {
 #define MBFL_WCSPLANE_BIG5			0x70f40000		/*  2121h - 9898h */
 #define MBFL_WCSPLANE_CNS11643		0x70f50000		/*  2121h - 9898h */
 #define MBFL_WCSPLANE_UHC			0x70f60000		/*  8141h - fefeh */
+#define MBFL_WCSPLANE_CP1251		0x70f70000
+#define MBFL_WCSPLANE_CP866			0x70f80000
+#define MBFL_WCSPLANE_KOI8R			0x70f90000
 #define MBFL_WCSGROUP_MASK                0xffffff
 #define MBFL_WCSGROUP_UCS4MAX		0x70000000
 #define MBFL_WCSGROUP_WCHARMAX		0x78000000
diff --git a/ext/mbstring/mbfilter_ru.c b/ext/mbstring/mbfilter_ru.c
new file mode 100644
index 0000000000..2d15fff576
--- /dev/null
+++ b/ext/mbstring/mbfilter_ru.c
@@ -0,0 +1,213 @@
+/*
+ * "russian code filter and converter"
+ */
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "php.h"
+#include "php_globals.h"
+
+#if defined(HAVE_MBSTR_RU)
+#include "mbfilter.h"
+#include "unicode_table_ru.h"
+
+/*
+ * encoding filter
+ */
+#define CK(statement)	do { if ((statement) < 0) return (-1); } while (0)
+
+/*
+ * cp1251 => wchar
+ */
+int
+mbfl_filt_conv_cp1251_wchar(int c, mbfl_convert_filter *filter TSRMLS_DC)
+{
+	int s;
+
+	if (c >= 0 && c < cp1251_ucs_table_min) {
+		s = c;
+	} else if (c >= cp1251_ucs_table_min && c < 0x100) {
+		s = cp1251_ucs_table[c - cp1251_ucs_table_min];
+		if (s <= 0) {
+			s = c;
+			s &= MBFL_WCSPLANE_MASK;
+			s |= MBFL_WCSPLANE_CP1251;
+		}
+	} else {
+		s = c;
+		s &= MBFL_WCSGROUP_MASK;
+		s |= MBFL_WCSGROUP_THROUGH;
+	}
+
+	CK((*filter->output_function)(s, filter->data));
+
+	return c;
+}
+
+/*
+ * wchar => cp1251
+ */
+int
+mbfl_filt_conv_wchar_cp1251(int c, mbfl_convert_filter *filter TSRMLS_DC)
+{
+	int s, n;
+
+	if (c >= 0 && c < cp1251_ucs_table_min) {
+		s = c;
+	} else {
+		s = -1;
+		n = cp1251_ucs_table_len-1;
+		while (n >= 0) {
+			if (c == cp1251_ucs_table[n]) {
+				s = cp1251_ucs_table_min + n;
+				break;
+			}
+			n--;
+		}
+		if (s <= 0 && (c & ~MBFL_WCSPLANE_MASK) == MBFL_WCSPLANE_CP1251) {
+			s = c & MBFL_WCSPLANE_MASK;
+		}
+	}
+
+	if (s >= 0) {
+		CK((*filter->output_function)(s, filter->data));
+	} else {
+		if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
+			CK(mbfl_filt_conv_illegal_output(c, filter));
+		}
+	}
+
+	return c;
+}
+
+/*
+ * cp866 => wchar
+ */
+int
+mbfl_filt_conv_cp866_wchar(int c, mbfl_convert_filter *filter TSRMLS_DC)
+{
+	int s;
+
+	if (c >= 0 && c < cp866_ucs_table_min) {
+		s = c;
+	} else if (c >= cp866_ucs_table_min && c < 0x100) {
+		s = cp866_ucs_table[c - cp866_ucs_table_min];
+		if (s <= 0) {
+			s = c;
+			s &= MBFL_WCSPLANE_MASK;
+			s |= MBFL_WCSPLANE_CP866;
+		}
+	} else {
+		s = c;
+		s &= MBFL_WCSGROUP_MASK;
+		s |= MBFL_WCSGROUP_THROUGH;
+	}
+
+	CK((*filter->output_function)(s, filter->data));
+
+	return c;
+}
+
+/*
+ * wchar => cp866
+ */
+int
+mbfl_filt_conv_wchar_cp866(int c, mbfl_convert_filter *filter TSRMLS_DC)
+{
+	int s, n;
+
+	if (c >= 0 && c < cp866_ucs_table_min) {
+		s = c;
+	} else {
+		s = -1;
+		n = cp866_ucs_table_len-1;
+		while (n >= 0) {
+			if (c == cp866_ucs_table[n]) {
+				s = cp866_ucs_table_min + n;
+				break;
+			}
+			n--;
+		}
+		if (s <= 0 && (c & ~MBFL_WCSPLANE_MASK) == MBFL_WCSPLANE_CP866) {
+			s = c & MBFL_WCSPLANE_MASK;
+		}
+	}
+
+	if (s >= 0) {
+		CK((*filter->output_function)(s, filter->data));
+	} else {
+		if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
+			CK(mbfl_filt_conv_illegal_output(c, filter));
+		}
+	}
+
+	return c;
+}
+
+/*
+ * koi8r => wchar
+ */
+int
+mbfl_filt_conv_koi8r_wchar(int c, mbfl_convert_filter *filter TSRMLS_DC)
+{
+	int s;
+
+	if (c >= 0 && c < koi8r_ucs_table_min) {
+		s = c;
+	} else if (c >= koi8r_ucs_table_min && c < 0x100) {
+		s = koi8r_ucs_table[c - koi8r_ucs_table_min];
+		if (s <= 0) {
+			s = c;
+			s &= MBFL_WCSPLANE_MASK;
+			s |= MBFL_WCSPLANE_KOI8R;
+		}
+	} else {
+		s = c;
+		s &= MBFL_WCSGROUP_MASK;
+		s |= MBFL_WCSGROUP_THROUGH;
+	}
+
+	CK((*filter->output_function)(s, filter->data));
+
+	return c;
+}
+
+/*
+ * wchar => koi8r
+ */
+int
+mbfl_filt_conv_wchar_koi8r(int c, mbfl_convert_filter *filter TSRMLS_DC)
+{
+	int s, n;
+
+	if (c >= 0 && c < koi8r_ucs_table_min) {
+		s = c;
+	} else {
+		s = -1;
+		n = koi8r_ucs_table_len-1;
+		while (n >= 0) {
+			if (c == koi8r_ucs_table[n]) {
+				s = koi8r_ucs_table_min + n;
+				break;
+			}
+			n--;
+		}
+		if (s <= 0 && (c & ~MBFL_WCSPLANE_MASK) == MBFL_WCSPLANE_KOI8R) {
+			s = c & MBFL_WCSPLANE_MASK;
+		}
+	}
+
+	if (s >= 0) {
+		CK((*filter->output_function)(s, filter->data));
+	} else {
+		if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
+			CK(mbfl_filt_conv_illegal_output(c, filter));
+		}
+	}
+
+	return c;
+}
+
+#endif /* HAVE_MBSTR_RU */
+
diff --git a/ext/mbstring/mbfilter_ru.h b/ext/mbstring/mbfilter_ru.h
new file mode 100644
index 0000000000..9bbe31aab1
--- /dev/null
+++ b/ext/mbstring/mbfilter_ru.h
@@ -0,0 +1,11 @@
+#ifndef MBFL_MBFILTER_RU_H
+#define MBFL_MBFILTER_RU_H
+
+int mbfl_filt_conv_cp1251_wchar(int c, mbfl_convert_filter *filter TSRMLS_DC);
+int mbfl_filt_conv_wchar_cp1251(int c, mbfl_convert_filter *filter TSRMLS_DC);
+int mbfl_filt_conv_cp866_wchar(int c, mbfl_convert_filter *filter TSRMLS_DC);
+int mbfl_filt_conv_wchar_cp866(int c, mbfl_convert_filter *filter TSRMLS_DC);
+int mbfl_filt_conv_koi8r_wchar(int c, mbfl_convert_filter *filter TSRMLS_DC);
+int mbfl_filt_conv_wchar_koi8r(int c, mbfl_convert_filter *filter TSRMLS_DC);
+
+#endif /* MBFL_MBFILTER_RU_H */
diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
index 782474f403..5d8941dca1 100644
--- a/ext/mbstring/mbstring.c
+++ b/ext/mbstring/mbstring.c
@@ -65,7 +65,7 @@
 #include "php_content_types.h"
 #include "SAPI.h"
 
-#if ZEND_MULTIBYTE
+#ifdef ZEND_MULTIBYTE
 #include "zend_multibyte.h"
 #endif /* ZEND_MULTIBYTE */
 
@@ -113,6 +113,16 @@ static const enum mbfl_no_encoding php_mbstr_default_identify_list[] = {
 };
 #endif
 
+#if defined(HAVE_MBSTR_RU) && !defined(HAVE_MBSTR_JA) && !defined(HAVE_MBSTR_TW) && !defined(HAVE_MBSTR_KR)
+static const enum mbfl_no_encoding php_mbstr_default_identify_list[] = {
+	mbfl_no_encoding_ascii,
+	mbfl_no_encoding_utf8,
+	mbfl_no_encoding_koi8r,
+	mbfl_no_encoding_cp1251,
+	mbfl_no_encoding_cp866
+};
+#endif
+
 static const int php_mbstr_default_identify_list_size = sizeof(php_mbstr_default_identify_list)/sizeof(enum mbfl_no_encoding);
 
 static unsigned char third_and_rest_force_ref[] = { 3, BYREF_NONE, BYREF_NONE, BYREF_FORCE_REST };
@@ -2059,10 +2069,6 @@ PHP_FUNCTION(mb_strcut)
 		if (from < 0) {
 			from = 0;
 		}
-	}
-	if (Z_STRLEN_PP(arg1) < from) {
-		/* keep index within string */
-		from = Z_STRLEN_PP(arg1);
 	}
 
 	/* if "length" position is negative, set it to the length
@@ -2074,10 +2080,6 @@ PHP_FUNCTION(mb_strcut)
 			len = 0;
 		}
 	}
-	if (Z_STRLEN_PP(arg1) < (from + len)) {
-		/* limit span to characters in string */
-		len = Z_STRLEN_PP(arg1) - from;
-	}
 
 	ret = mbfl_strcut(&string, &result, from, len TSRMLS_CC);
 	if (ret != NULL) {
@@ -2269,7 +2271,7 @@ PHPAPI char * php_mb_convert_encoding(char *input, size_t length, char *_to_enco
 				string.no_encoding = from_encoding;
 			}
 		} else {
-			php_error(E_WARNING, "$s() illegal character encoding specified",
+			php_error(E_WARNING, "%s() illegal character encoding specified",
 					  get_active_function_name(TSRMLS_C));
 		}
 		if (list != NULL) {
diff --git a/ext/mbstring/unicode_table_ru.h b/ext/mbstring/unicode_table_ru.h
new file mode 100644
index 0000000000..74d02fef34
--- /dev/null
+++ b/ext/mbstring/unicode_table_ru.h
@@ -0,0 +1,69 @@
+/* cp1251 to Unicode table */
+static const unsigned short cp1251_ucs_table[] = {
+ 0x0402, 0x0403, 0x201a, 0x0453, 0x201e, 0x2026, 0x2020, 0x2021,
+ 0x20ac, 0x2030, 0x0409, 0x2039, 0x040a, 0x040c, 0x040b, 0x040f,
+ 0x0452, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
+ 0x003f, 0x2122, 0x0459, 0x203a, 0x045a, 0x045c, 0x045b, 0x045f,
+ 0x00a0, 0x040e, 0x045e, 0x0408, 0x00a4, 0x0490, 0x00a6, 0x00a7,
+ 0x0401, 0x00a9, 0x0404, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x0407,
+ 0x00b0, 0x00b1, 0x0406, 0x0456, 0x0491, 0x00b5, 0x00b6, 0x00b7,
+ 0x0451, 0x2116, 0x0454, 0x00bb, 0x0458, 0x0405, 0x0455, 0x0457,
+ 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
+ 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,
+ 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
+ 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,
+ 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
+ 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,
+ 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
+ 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f
+};
+static const int cp1251_ucs_table_min = 0x80;
+static const int cp1251_ucs_table_len = (sizeof (cp1251_ucs_table) / sizeof (unsigned short));
+static const int cp1251_ucs_table_max = 0x80 + (sizeof (cp1251_ucs_table) / sizeof (unsigned short));
+
+/* cp866_DOSCyrillicRussian to Unicode table */
+static const unsigned short cp866_ucs_table[] = {
+ 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
+ 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,
+ 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
+ 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,
+ 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
+ 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,
+ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,
+ 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510,
+ 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f,
+ 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567,
+ 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b,
+ 0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580,
+ 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
+ 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f,
+ 0x0401, 0x0451, 0x0404, 0x0454, 0x0407, 0x0457, 0x040e, 0x045e,
+ 0x00b0, 0x2219, 0x00b7, 0x221a, 0x2116, 0x00a4, 0x25a0, 0x00a0
+};
+static const int cp866_ucs_table_min = 0x80;
+static const int cp866_ucs_table_len = (sizeof (cp866_ucs_table) / sizeof (unsigned short));
+static const int cp866_ucs_table_max = 0x80 + (sizeof (cp866_ucs_table) / sizeof (unsigned short));
+
+/* KOI8-R (RFC1489) to Unicode */
+static const unsigned short koi8r_ucs_table[] = {
+ 0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 0x2518, 0x251c, 0x2524,
+ 0x252c, 0x2534, 0x253c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
+ 0x2591, 0x2592, 0x2593, 0x2320, 0x25a0, 0x2219, 0x221a, 0x2248,
+ 0x2264, 0x2265, 0x00a0, 0x2321, 0x00b0, 0x00b2, 0x00b7, 0x00f7,
+ 0x2550, 0x2551, 0x2552, 0x0451, 0x2553, 0x2554, 0x2555, 0x2556,
+ 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 0x255e,
+ 0x255f, 0x2560, 0x2561, 0x0401, 0x2562, 0x2563, 0x2564, 0x2565,
+ 0x2566, 0x2567, 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x00a9,
+ 0x044e, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433,
+ 0x0445, 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e,
+ 0x043f, 0x044f, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432,
+ 0x044c, 0x044b, 0x0437, 0x0448, 0x044d, 0x0449, 0x0447, 0x044a,
+ 0x042e, 0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413,
+ 0x0425, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e,
+ 0x041f, 0x042f, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412,
+ 0x042c, 0x042b, 0x0417, 0x0428, 0x042d, 0x0429, 0x0427, 0x042a
+};
+static const int koi8r_ucs_table_min = 0x80;
+static const int koi8r_ucs_table_len = (sizeof (koi8r_ucs_table) / sizeof (unsigned short));
+static const int koi8r_ucs_table_max = 0x80 + (sizeof (koi8r_ucs_table) / sizeof (unsigned short));
+
-- 
2.50.1