#if defined(HAVE_MBSTR_KR)
#include "mbfilter_kr.h"
#endif
#if defined(HAVE_MBSTR_RU)
#include "mbfilter_ru.h"
-#include "unicode_table_ru.h"
#endif
#include "zend.h"
#endif /* HAVE_MBSTR_KR */
#if defined(HAVE_MBSTR_RU)
-static int mbfl_filt_get_rating_cp1251(int c, mbfl_identify_filter *filter TSRMLS_DC);
-static int mbfl_filt_get_rating_cp866(int c, mbfl_identify_filter *filter TSRMLS_DC);
-static int mbfl_filt_get_rating_koi8r(int c, mbfl_identify_filter *filter TSRMLS_DC);
static int mbfl_filt_ident_cp1251(int c, mbfl_identify_filter *filter TSRMLS_DC);
static int mbfl_filt_ident_cp866(int c, mbfl_identify_filter *filter TSRMLS_DC);
static int mbfl_filt_ident_koi8r(int c, mbfl_identify_filter *filter TSRMLS_DC);
mbfl_no_encoding_ascii,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_ascii,
- NULL };
+ mbfl_filt_ident_ascii };
static const struct mbfl_identify_vtbl vtbl_identify_utf8 = {
mbfl_no_encoding_utf8,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_utf8,
- NULL };
+ mbfl_filt_ident_utf8 };
static const struct mbfl_identify_vtbl vtbl_identify_utf7 = {
mbfl_no_encoding_utf7,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_utf7,
- NULL };
+ mbfl_filt_ident_utf7 };
#if defined(HAVE_MBSTR_JA)
static const struct mbfl_identify_vtbl vtbl_identify_eucjp = {
mbfl_no_encoding_euc_jp,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_eucjp,
- NULL };
+ mbfl_filt_ident_eucjp };
static const struct mbfl_identify_vtbl vtbl_identify_eucjpwin = {
mbfl_no_encoding_eucjp_win,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_eucjp,
- NULL };
+ mbfl_filt_ident_eucjp };
static const struct mbfl_identify_vtbl vtbl_identify_sjis = {
mbfl_no_encoding_sjis,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_sjis,
- NULL };
+ mbfl_filt_ident_sjis };
static const struct mbfl_identify_vtbl vtbl_identify_sjiswin = {
mbfl_no_encoding_sjis_win,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_sjiswin,
- NULL };
+ mbfl_filt_ident_sjiswin };
static const struct mbfl_identify_vtbl vtbl_identify_jis = {
mbfl_no_encoding_jis,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_jis,
- NULL };
+ mbfl_filt_ident_jis };
static const struct mbfl_identify_vtbl vtbl_identify_2022jp = {
mbfl_no_encoding_2022jp,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_2022jp,
- NULL };
+ mbfl_filt_ident_2022jp };
#endif /* HAVE_MBSTR_JA */
#if defined(HAVE_MBSTR_CN)
mbfl_no_encoding_euc_cn,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_euccn,
- NULL };
+ mbfl_filt_ident_euccn };
static struct mbfl_identify_vtbl vtbl_identify_cp936 = {
mbfl_no_encoding_cp936,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_cp936,
- NULL };
+ mbfl_filt_ident_cp936 };
static struct mbfl_identify_vtbl vtbl_identify_hz = {
mbfl_no_encoding_hz,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_hz,
- NULL };
+ mbfl_filt_ident_hz };
#endif /* HAVE_MBSTR_CN */
mbfl_no_encoding_euc_tw,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_euctw,
- NULL };
+ mbfl_filt_ident_euctw };
static struct mbfl_identify_vtbl vtbl_identify_big5 = {
mbfl_no_encoding_big5,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_big5,
- NULL };
+ mbfl_filt_ident_big5 };
#endif /* HAVE_MBSTR_TW */
#if defined(HAVE_MBSTR_KR)
mbfl_no_encoding_euc_kr,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_euckr,
- NULL };
+ mbfl_filt_ident_euckr };
static struct mbfl_identify_vtbl vtbl_identify_uhc = {
mbfl_no_encoding_uhc,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_uhc,
- NULL };
+ mbfl_filt_ident_uhc };
static struct mbfl_identify_vtbl vtbl_identify_2022kr = {
mbfl_no_encoding_2022kr,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_2022kr,
- NULL };
+ mbfl_filt_ident_2022kr };
#endif /* HAVE_MBSTR_KR */
mbfl_no_encoding_cp1251,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_cp1251,
- mbfl_filt_get_rating_cp1251 };
+ mbfl_filt_ident_cp1251 };
static struct mbfl_identify_vtbl vtbl_identify_cp866 = {
mbfl_no_encoding_cp866,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_cp866,
- mbfl_filt_get_rating_cp866 };
+ mbfl_filt_ident_cp866 };
static struct mbfl_identify_vtbl vtbl_identify_koi8r = {
mbfl_no_encoding_koi8r,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_koi8r,
- mbfl_filt_get_rating_koi8r };
+ mbfl_filt_ident_koi8r };
#endif /* HAVE_MBSTR_RU */
static const struct mbfl_identify_vtbl vtbl_identify_cp1252 = {
mbfl_no_encoding_cp1252,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_cp1252,
- NULL };
+ mbfl_filt_ident_cp1252 };
static const struct mbfl_identify_vtbl vtbl_identify_8859_1 = {
mbfl_no_encoding_8859_1,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_true,
- NULL };
+ mbfl_filt_ident_true };
static const struct mbfl_identify_vtbl vtbl_identify_8859_2 = {
mbfl_no_encoding_8859_2,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_true,
- NULL };
+ mbfl_filt_ident_true };
static const struct mbfl_identify_vtbl vtbl_identify_8859_3 = {
mbfl_no_encoding_8859_3,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_true,
- NULL };
+ mbfl_filt_ident_true };
static const struct mbfl_identify_vtbl vtbl_identify_8859_4 = {
mbfl_no_encoding_8859_4,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_true,
- NULL };
+ mbfl_filt_ident_true };
static const struct mbfl_identify_vtbl vtbl_identify_8859_5 = {
mbfl_no_encoding_8859_5,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_true,
- NULL };
+ mbfl_filt_ident_true };
static const struct mbfl_identify_vtbl vtbl_identify_8859_6 = {
mbfl_no_encoding_8859_6,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_true,
- NULL };
+ mbfl_filt_ident_true };
static const struct mbfl_identify_vtbl vtbl_identify_8859_7 = {
mbfl_no_encoding_8859_7,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_true,
- NULL };
+ mbfl_filt_ident_true };
static const struct mbfl_identify_vtbl vtbl_identify_8859_8 = {
mbfl_no_encoding_8859_8,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_true,
- NULL };
+ mbfl_filt_ident_true };
static const struct mbfl_identify_vtbl vtbl_identify_8859_9 = {
mbfl_no_encoding_8859_9,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_true,
- NULL };
+ mbfl_filt_ident_true };
static const struct mbfl_identify_vtbl vtbl_identify_8859_10 = {
mbfl_no_encoding_8859_10,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_true,
- NULL };
+ mbfl_filt_ident_true };
static const struct mbfl_identify_vtbl vtbl_identify_8859_13 = {
mbfl_no_encoding_8859_13,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_true,
- NULL };
+ mbfl_filt_ident_true };
static const struct mbfl_identify_vtbl vtbl_identify_8859_14 = {
mbfl_no_encoding_8859_14,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_true,
- NULL };
+ mbfl_filt_ident_true };
static const struct mbfl_identify_vtbl vtbl_identify_8859_15 = {
mbfl_no_encoding_8859_15,
mbfl_filt_ident_common_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_true,
- NULL };
+ mbfl_filt_ident_true };
static const struct mbfl_identify_vtbl vtbl_identify_false = {
mbfl_no_encoding_pass,
mbfl_filt_ident_false_ctor,
mbfl_filt_ident_common_dtor,
- mbfl_filt_ident_false,
- NULL };
+ mbfl_filt_ident_false };
static const struct mbfl_identify_vtbl *mbfl_identify_filter_list[] = {
&vtbl_identify_utf8,
}
#if defined(HAVE_MBSTR_RU)
-static int
-mbfl_filt_get_rating_cp1251(int c, mbfl_identify_filter *filter TSRMLS_DC)
-{
- if (c >= cp1251_char_ratings_table_min && c < (cp1251_char_ratings_table_min + cp1251_char_ratings_table_len) )
- return cp1251_char_ratings_table[c - cp1251_char_ratings_table_min];
- return 0;
-}
-
-static int
-mbfl_filt_get_rating_cp866(int c, mbfl_identify_filter *filter TSRMLS_DC)
-{
- if (c >= cp866_char_ratings_table_min && c < (cp866_char_ratings_table_min + cp866_char_ratings_table_len) )
- return cp866_char_ratings_table[c - cp866_char_ratings_table_min];
- return 0;
-}
-
-static int
-mbfl_filt_get_rating_koi8r(int c, mbfl_identify_filter *filter TSRMLS_DC)
-{
- if (c >= koi8r_char_ratings_table_min && c < (koi8r_char_ratings_table_min + koi8r_char_ratings_table_len) )
- return koi8r_char_ratings_table[c - koi8r_char_ratings_table_min];
- return 0;
-}
-
// all of this is so ugly now!
static int
mbfl_filt_ident_cp1251(int c, mbfl_identify_filter *filter TSRMLS_DC)
filter->filter_ctor = vtbl->filter_ctor;
filter->filter_dtor = vtbl->filter_dtor;
filter->filter_function = vtbl->filter_function;
- filter->get_rating_function = vtbl->get_rating_function;
}
}
}
}
-/*
- * guess encoding - uses another algorithm for charset detection based on symbols rating
- */
-const mbfl_encoding *
-mbfl_guess_encoding(mbfl_string *string, enum mbfl_no_encoding *elist, int eliztsz TSRMLS_DC)
-{
- int i, n, num, num_actual, bad, overflow;
- unsigned char *p;
- const struct mbfl_identify_vtbl *vtbl;
- mbfl_identify_filter *flist, *filter;
- const mbfl_encoding *encoding;
- unsigned long *ratings,add_rating,max_rating;
-
- if (elist == NULL)
- return NULL;
-
- /* initialize */
- flist = (mbfl_identify_filter *)mbfl_calloc(eliztsz, sizeof(mbfl_identify_filter));
- if (flist == NULL) {
- return NULL;
- }
- i = 0;
- num = 0;
- num_actual = 0;
- if (elist != NULL) {
- while (i < eliztsz) {
- vtbl = mbfl_identify_filter_get_vtbl(elist[i]);
- if (vtbl != NULL) {
- filter = &flist[num];
- mbfl_identify_filter_set_vtbl(filter, vtbl);
- filter->encoding = mbfl_no2encoding(vtbl->encoding);
- (*filter->filter_ctor)(filter TSRMLS_CC);
- num++;
- if (filter->get_rating_function)
- num_actual++;
- }
- i++;
- }
- }
- if (num_actual == 0) {
- /* no filters with character rating routines - exit */
- mbfl_free((void *)flist);
- return NULL;
- }
-
- ratings = (unsigned long *)mbfl_calloc(eliztsz, sizeof(unsigned long));
- if (ratings == NULL) {
- mbfl_free((void *)flist);
- return NULL;
- }
-
-
- /* feed data */
- n = string->len;
- p = string->val;
- if (p != NULL) {
- while (n > 0) {
- i = 0;
- bad = 0;
- overflow = 0;
- while (i < num) {
- filter = &flist[i];
- add_rating=(*filter->get_rating_function)(*p, filter TSRMLS_CC);
- if ( (ratings[i] + add_rating) < ratings[i] )
- overflow = 1;
- ratings[i] += add_rating;
- i++;
- }
- if (overflow)
- // overflow - enough data now - exit
- break;
- p++;
- n--;
- }
- }
-
- /* judge */
- max_rating = 0;
- i = 0;
- encoding = NULL;
- while (i < num) {
- filter = &flist[i];
- if (ratings[i] > max_rating) {
- max_rating = ratings[i];
- encoding = filter->encoding;
- }
- (*filter->filter_dtor)(filter TSRMLS_CC);
- i++;
- }
- mbfl_free((void *)ratings);
- mbfl_free((void *)flist);
-
- return encoding;
-}
-
-const char*
-mbfl_guess_encoding_name(mbfl_string *string, enum mbfl_no_encoding *elist, int eliztsz TSRMLS_DC)
-{
- const mbfl_encoding *encoding;
-
- encoding = mbfl_guess_encoding(string, elist, eliztsz TSRMLS_CC);
- if (encoding != NULL &&
- encoding->no_encoding > mbfl_no_encoding_charset_min &&
- encoding->no_encoding < mbfl_no_encoding_charset_max) {
- return encoding->name;
- } else {
- return NULL;
- }
-}
-
const enum mbfl_no_encoding
mbfl_identify_encoding_no(mbfl_string *string, enum mbfl_no_encoding *elist, int eliztsz TSRMLS_DC)
{
static const int koi8r_ucs_table_len = (sizeof (koi8r_ucs_table) / sizeof (unsigned short));
static const int koi8r_ucs_table_max = 0x80 + (sizeof (koi8r_ucs_table) / sizeof (unsigned short));
-
-
-static const unsigned int cp1251_char_ratings_table[] = {
- 14985, 3207, 9044, 2847, 6015,18094, 2305, 3456,
- 15786, 2472, 6531, 7803, 6341,13494,21800, 6267,
- 10139,10398,13877, 5094, 536, 2201, 855, 2665,
- 1127, 981, 99, 4460, 3805, 426, 1516, 4341
-};
-static const int cp1251_char_ratings_table_min = 0xe0;
-static const int cp1251_char_ratings_table_len = (sizeof (cp1251_char_ratings_table) / sizeof (unsigned int));
-
-
-static const unsigned int cp866_char_ratings_table[] = {
- 99, 0, 1516,14985, 2305, 536, 855, 6015,
- 5094, 4460, 2201,15786, 2472, 6531, 7803, 6341,
- 13494,21800, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 4341,10139,10398,13877,18094, 2847,
- 3207, 981, 3456, 426, 2665, 3805, 9044, 1127,
- 0, 6267
-};
-static const int cp866_char_ratings_table_min = 0x9e;
-static const int cp866_char_ratings_table_len = (sizeof (cp866_char_ratings_table) / sizeof (unsigned int));
-
-
-static const unsigned int koi8r_char_ratings_table[] = {
- 1516,14985, 3207, 855, 6015,18094, 536, 2847,
- 2201,15786, 2472, 6531, 7803, 6341,13494,21800,
- 6267, 4341,10139,10398,13877, 5094, 2305, 9044,
- 3805, 4460, 3456, 1127, 426, 981, 2665, 99
-};
-static const int koi8r_char_ratings_table_min = 0xc0;
-static const int koi8r_char_ratings_table_len = (sizeof (koi8r_char_ratings_table) / sizeof (unsigned int));