]> granicus.if.org Git - php/commitdiff
Added new function mb_guess_encoding, based on symbols rating
author Den V. Tsopa <dets@php.net>
Wed, 14 Aug 2002 05:41:40 +0000 (05:41 +0000)
committer Den V. Tsopa <dets@php.net>
Wed, 14 Aug 2002 05:41:40 +0000 (05:41 +0000)
ext/mbstring/mbfilter.c
ext/mbstring/mbfilter.h
ext/mbstring/mbstring.c
ext/mbstring/mbstring.h
ext/mbstring/unicode_table_ru.h

index 2bf816279a1752c856f64ffa8908db94281183d4..4e7c942afc701e6a47159b1cb700c29ef61e2d57 100644 (file)
 #if defined(HAVE_MBSTR_KR)
 #include "mbfilter_kr.h"
 #endif
-#if defined(HAVE_MBSTR_KR)
+#if defined(HAVE_MBSTR_RU)
 #include "mbfilter_ru.h"
+#include "unicode_table_ru.h"
 #endif
 
 #include "zend.h"
@@ -1242,6 +1243,9 @@ static int mbfl_filt_ident_2022kr(int c, mbfl_identify_filter *filter TSRMLS_DC)
 #endif /* HAVE_MBSTR_KR */
 
 #if defined(HAVE_MBSTR_RU)
+static int mbfl_filt_get_rating_cp1251(int c, mbfl_identify_filter *filter TSRMLS_DC);
+static int mbfl_filt_get_rating_cp866(int c, mbfl_identify_filter *filter TSRMLS_DC);
+static int mbfl_filt_get_rating_koi8r(int c, mbfl_identify_filter *filter TSRMLS_DC);
 static int mbfl_filt_ident_cp1251(int c, mbfl_identify_filter *filter TSRMLS_DC);
 static int mbfl_filt_ident_cp866(int c, mbfl_identify_filter *filter TSRMLS_DC);
 static int mbfl_filt_ident_koi8r(int c, mbfl_identify_filter *filter TSRMLS_DC);
@@ -2294,56 +2298,65 @@ static const struct mbfl_identify_vtbl vtbl_identify_ascii = {
        mbfl_no_encoding_ascii,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_ascii };
+       mbfl_filt_ident_ascii,
+       NULL };
 
 static const struct mbfl_identify_vtbl vtbl_identify_utf8 = {
        mbfl_no_encoding_utf8,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_utf8 };
+       mbfl_filt_ident_utf8,
+       NULL };
 
 static const struct mbfl_identify_vtbl vtbl_identify_utf7 = {
        mbfl_no_encoding_utf7,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_utf7 };
+       mbfl_filt_ident_utf7,
+       NULL };
 
 #if defined(HAVE_MBSTR_JA)
 static const struct mbfl_identify_vtbl vtbl_identify_eucjp = {
        mbfl_no_encoding_euc_jp,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_eucjp };
+       mbfl_filt_ident_eucjp,
+       NULL };
 
 static const struct mbfl_identify_vtbl vtbl_identify_eucjpwin = {
        mbfl_no_encoding_eucjp_win,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_eucjp };
+       mbfl_filt_ident_eucjp,
+       NULL };
 
 static const struct mbfl_identify_vtbl vtbl_identify_sjis = {
        mbfl_no_encoding_sjis,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_sjis };
+       mbfl_filt_ident_sjis,
+       NULL };
 
 static const struct mbfl_identify_vtbl vtbl_identify_sjiswin = {
        mbfl_no_encoding_sjis_win,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_sjiswin };
+       mbfl_filt_ident_sjiswin,
+       NULL };
 
 static const struct mbfl_identify_vtbl vtbl_identify_jis = {
        mbfl_no_encoding_jis,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_jis };
+       mbfl_filt_ident_jis,
+       NULL };
 
 static const struct mbfl_identify_vtbl vtbl_identify_2022jp = {
        mbfl_no_encoding_2022jp,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_2022jp };
+       mbfl_filt_ident_2022jp,
+       NULL };
 #endif /* HAVE_MBSTR_JA */
 
 #if defined(HAVE_MBSTR_CN)
@@ -2351,19 +2364,22 @@ static struct mbfl_identify_vtbl vtbl_identify_euccn = {
        mbfl_no_encoding_euc_cn,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_euccn };
+       mbfl_filt_ident_euccn,
+       NULL };
 
 static struct mbfl_identify_vtbl vtbl_identify_cp936 = {
        mbfl_no_encoding_cp936,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_cp936 };
+       mbfl_filt_ident_cp936,
+       NULL };
 
 static struct mbfl_identify_vtbl vtbl_identify_hz = {
        mbfl_no_encoding_hz,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_hz };
+       mbfl_filt_ident_hz,
+       NULL };
 
 #endif /* HAVE_MBSTR_CN */
 
@@ -2372,13 +2388,15 @@ static struct mbfl_identify_vtbl vtbl_identify_euctw = {
        mbfl_no_encoding_euc_tw,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_euctw };
+       mbfl_filt_ident_euctw,
+       NULL };
 
 static struct mbfl_identify_vtbl vtbl_identify_big5 = {
        mbfl_no_encoding_big5,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_big5 };
+       mbfl_filt_ident_big5,
+       NULL };
 #endif /* HAVE_MBSTR_TW */
 
 #if defined(HAVE_MBSTR_KR)
@@ -2386,19 +2404,22 @@ static struct mbfl_identify_vtbl vtbl_identify_euckr = {
        mbfl_no_encoding_euc_kr,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_euckr };
+       mbfl_filt_ident_euckr,
+       NULL };
 
 static struct mbfl_identify_vtbl vtbl_identify_uhc = {
        mbfl_no_encoding_uhc,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_uhc };
+       mbfl_filt_ident_uhc,
+       NULL };
 
 static struct mbfl_identify_vtbl vtbl_identify_2022kr = {
        mbfl_no_encoding_2022kr,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_2022kr };
+       mbfl_filt_ident_2022kr,
+       NULL };
 
 #endif /* HAVE_MBSTR_KR */
 
@@ -2407,110 +2428,128 @@ static struct mbfl_identify_vtbl vtbl_identify_cp1251 = {
        mbfl_no_encoding_cp1251,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_cp1251 };
+       mbfl_filt_ident_cp1251,
+       mbfl_filt_get_rating_cp1251 };
 
 static struct mbfl_identify_vtbl vtbl_identify_cp866 = {
        mbfl_no_encoding_cp866,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_cp866 };
+       mbfl_filt_ident_cp866,
+       mbfl_filt_get_rating_cp866 };
 
 static struct mbfl_identify_vtbl vtbl_identify_koi8r = {
        mbfl_no_encoding_koi8r,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_koi8r };
+       mbfl_filt_ident_koi8r, 
+       mbfl_filt_get_rating_koi8r };
 #endif /* HAVE_MBSTR_RU */
 
 static const struct mbfl_identify_vtbl vtbl_identify_cp1252 = {
        mbfl_no_encoding_cp1252,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_cp1252 };
+       mbfl_filt_ident_cp1252,
+       NULL };
 
 static const struct mbfl_identify_vtbl vtbl_identify_8859_1 = {
        mbfl_no_encoding_8859_1,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_true };
+       mbfl_filt_ident_true,
+       NULL };
 
 static const struct mbfl_identify_vtbl vtbl_identify_8859_2 = {
        mbfl_no_encoding_8859_2,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_true };
+       mbfl_filt_ident_true,
+       NULL };
 
 static const struct mbfl_identify_vtbl vtbl_identify_8859_3 = {
        mbfl_no_encoding_8859_3,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_true };
+       mbfl_filt_ident_true,
+       NULL };
 
 static const struct mbfl_identify_vtbl vtbl_identify_8859_4 = {
        mbfl_no_encoding_8859_4,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_true };
+       mbfl_filt_ident_true,
+       NULL };
 
 static const struct mbfl_identify_vtbl vtbl_identify_8859_5 = {
        mbfl_no_encoding_8859_5,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_true };
+       mbfl_filt_ident_true,
+       NULL };
 
 static const struct mbfl_identify_vtbl vtbl_identify_8859_6 = {
        mbfl_no_encoding_8859_6,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_true };
+       mbfl_filt_ident_true,
+       NULL };
 
 static const struct mbfl_identify_vtbl vtbl_identify_8859_7 = {
        mbfl_no_encoding_8859_7,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_true };
+       mbfl_filt_ident_true,
+       NULL };
 
 static const struct mbfl_identify_vtbl vtbl_identify_8859_8 = {
        mbfl_no_encoding_8859_8,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_true };
+       mbfl_filt_ident_true,
+       NULL };
 
 static const struct mbfl_identify_vtbl vtbl_identify_8859_9 = {
        mbfl_no_encoding_8859_9,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_true };
+       mbfl_filt_ident_true,
+       NULL };
 
 static const struct mbfl_identify_vtbl vtbl_identify_8859_10 = {
        mbfl_no_encoding_8859_10,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_true };
+       mbfl_filt_ident_true,
+       NULL };
 
 static const struct mbfl_identify_vtbl vtbl_identify_8859_13 = {
        mbfl_no_encoding_8859_13,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_true };
+       mbfl_filt_ident_true,
+       NULL };
 
 static const struct mbfl_identify_vtbl vtbl_identify_8859_14 = {
        mbfl_no_encoding_8859_14,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_true };
+       mbfl_filt_ident_true,
+       NULL };
 
 static const struct mbfl_identify_vtbl vtbl_identify_8859_15 = {
        mbfl_no_encoding_8859_15,
        mbfl_filt_ident_common_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_true };
+       mbfl_filt_ident_true,
+       NULL };
 
 static const struct mbfl_identify_vtbl vtbl_identify_false = {
        mbfl_no_encoding_pass,
        mbfl_filt_ident_false_ctor,
        mbfl_filt_ident_common_dtor,
-       mbfl_filt_ident_false };
+       mbfl_filt_ident_false,
+       NULL };
 
 static const struct mbfl_identify_vtbl *mbfl_identify_filter_list[] = {
        &vtbl_identify_utf8,
@@ -6537,6 +6576,30 @@ mbfl_filt_ident_cp1252(int c, mbfl_identify_filter *filter TSRMLS_DC)
 }
 
 #if defined(HAVE_MBSTR_RU)
+/*
+ * Rating of a single CP1251 byte.
+ *
+ * Looks c up in cp1251_char_ratings_table (indexed from
+ * cp1251_char_ratings_table_min) and returns the stored rating; any value
+ * outside the table's range is rated 0.  The filter argument is not used.
+ */
+static int
+mbfl_filt_get_rating_cp1251(int c, mbfl_identify_filter *filter TSRMLS_DC)
+{
+       if (c < cp1251_char_ratings_table_min ||
+           c >= (cp1251_char_ratings_table_min + cp1251_char_ratings_table_len)) {
+               /* not covered by the table */
+               return 0;
+       }
+       return cp1251_char_ratings_table[c - cp1251_char_ratings_table_min];
+}
+
+/*
+ * Rating of a single CP866 byte.
+ *
+ * Looks c up in cp866_char_ratings_table (indexed from
+ * cp866_char_ratings_table_min) and returns the stored rating; any value
+ * outside the table's range is rated 0.  The filter argument is not used.
+ */
+static int
+mbfl_filt_get_rating_cp866(int c, mbfl_identify_filter *filter TSRMLS_DC)
+{
+       if (c < cp866_char_ratings_table_min ||
+           c >= (cp866_char_ratings_table_min + cp866_char_ratings_table_len)) {
+               /* not covered by the table */
+               return 0;
+       }
+       return cp866_char_ratings_table[c - cp866_char_ratings_table_min];
+}
+
+/*
+ * Rating of a single KOI8-R byte.
+ *
+ * Looks c up in koi8r_char_ratings_table (indexed from
+ * koi8r_char_ratings_table_min) and returns the stored rating; any value
+ * outside the table's range is rated 0.  The filter argument is not used.
+ */
+static int
+mbfl_filt_get_rating_koi8r(int c, mbfl_identify_filter *filter TSRMLS_DC)
+{
+       if (c < koi8r_char_ratings_table_min ||
+           c >= (koi8r_char_ratings_table_min + koi8r_char_ratings_table_len)) {
+               /* not covered by the table */
+               return 0;
+       }
+       return koi8r_char_ratings_table[c - koi8r_char_ratings_table_min];
+}
+
 // all of this is so ugly now!
 static int
 mbfl_filt_ident_cp1251(int c, mbfl_identify_filter *filter TSRMLS_DC)
@@ -6981,6 +7044,7 @@ mbfl_identify_filter_set_vtbl(mbfl_identify_filter *filter, const struct mbfl_id
                filter->filter_ctor = vtbl->filter_ctor;
                filter->filter_dtor = vtbl->filter_dtor;
                filter->filter_function = vtbl->filter_function;
+               filter->get_rating_function = vtbl->get_rating_function;
        }
 }
 
@@ -7552,6 +7616,116 @@ mbfl_identify_encoding_name(mbfl_string *string, enum mbfl_no_encoding *elist, i
        }
 }
 
+/*
+ * guess encoding - alternative charset detection based on per-character ratings.
+ *
+ * A filter is set up for every entry of elist that has an identify vtbl.
+ * Each input byte is handed to the get_rating_function of every filter that
+ * provides one, and the returned ratings are summed per encoding.  The
+ * encoding with the largest non-zero sum wins; on a tie the one listed first
+ * in elist is kept.
+ *
+ * Returns NULL when string or elist is NULL, when eliztsz is not positive,
+ * when an allocation fails, when none of the listed encodings provides a
+ * rating function, or when no input byte received a non-zero rating.
+ */
+const mbfl_encoding *
+mbfl_guess_encoding(mbfl_string *string, enum mbfl_no_encoding *elist, int eliztsz TSRMLS_DC)
+{
+       int i, n, num, num_actual, overflow;
+       unsigned char *p;
+       const struct mbfl_identify_vtbl *vtbl;
+       mbfl_identify_filter *flist, *filter;
+       const mbfl_encoding *encoding;
+       unsigned long *ratings, add_rating, max_rating;
+       /* largest value a rating sum can hold; sums saturate here */
+       const unsigned long rating_limit = ~(unsigned long)0;
+
+       if (string == NULL || elist == NULL || eliztsz <= 0) {
+               return NULL;
+       }
+
+       /* initialize */
+       flist = (mbfl_identify_filter *)mbfl_calloc(eliztsz, sizeof(mbfl_identify_filter));
+       if (flist == NULL) {
+               return NULL;
+       }
+       num = 0;
+       num_actual = 0;
+       for (i = 0; i < eliztsz; i++) {
+               vtbl = mbfl_identify_filter_get_vtbl(elist[i]);
+               if (vtbl == NULL) {
+                       continue;
+               }
+               filter = &flist[num];
+               mbfl_identify_filter_set_vtbl(filter, vtbl);
+               filter->encoding = mbfl_no2encoding(vtbl->encoding);
+               (*filter->filter_ctor)(filter TSRMLS_CC);
+               num++;
+               if (filter->get_rating_function) {
+                       num_actual++;
+               }
+       }
+
+       encoding = NULL;
+       ratings = NULL;
+       if (num_actual > 0) {
+               /* only worth rating when at least one filter can rate characters */
+               ratings = (unsigned long *)mbfl_calloc(num, sizeof(unsigned long));
+       }
+
+       if (ratings != NULL) {
+               /* feed data */
+               n = string->len;
+               p = string->val;
+               overflow = 0;
+               while (p != NULL && n > 0 && !overflow) {
+                       for (i = 0; i < num; i++) {
+                               filter = &flist[i];
+                               if (filter->get_rating_function == NULL) {
+                                       /* no rating routine for this encoding: its sum stays 0 */
+                                       continue;
+                               }
+                               add_rating = (*filter->get_rating_function)(*p, filter TSRMLS_CC);
+                               if (add_rating > rating_limit - ratings[i]) {
+                                       /*
+                                        * The sum would wrap around.  Saturate instead, so the
+                                        * leading encoding does not end up with a tiny rating,
+                                        * and stop after this byte - enough data now.
+                                        */
+                                       ratings[i] = rating_limit;
+                                       overflow = 1;
+                               } else {
+                                       ratings[i] += add_rating;
+                               }
+                       }
+                       p++;
+                       n--;
+               }
+
+               /* judge */
+               max_rating = 0;
+               for (i = 0; i < num; i++) {
+                       if (ratings[i] > max_rating) {
+                               max_rating = ratings[i];
+                               encoding = flist[i].encoding;
+                       }
+               }
+               mbfl_free((void *)ratings);
+       }
+
+       /* cleanup - every constructed filter is destructed on every path */
+       for (i = 0; i < num; i++) {
+               filter = &flist[i];
+               (*filter->filter_dtor)(filter TSRMLS_CC);
+       }
+       mbfl_free((void *)flist);
+
+       return encoding;
+}
+
+/*
+ * Name-returning wrapper around mbfl_guess_encoding().
+ *
+ * Returns the name of the guessed encoding, or NULL when nothing was guessed
+ * or the guessed encoding number is not strictly between
+ * mbfl_no_encoding_charset_min and mbfl_no_encoding_charset_max.
+ */
+const char*
+mbfl_guess_encoding_name(mbfl_string *string, enum mbfl_no_encoding *elist, int eliztsz TSRMLS_DC)
+{
+       const mbfl_encoding *guessed;
+
+       guessed = mbfl_guess_encoding(string, elist, eliztsz TSRMLS_CC);
+       if (guessed == NULL) {
+               return NULL;
+       }
+       if (guessed->no_encoding <= mbfl_no_encoding_charset_min ||
+           guessed->no_encoding >= mbfl_no_encoding_charset_max) {
+               /* outside the range of real charsets */
+               return NULL;
+       }
+       return guessed->name;
+}
+
 const enum mbfl_no_encoding
 mbfl_identify_encoding_no(mbfl_string *string, enum mbfl_no_encoding *elist, int eliztsz TSRMLS_DC)
 {
index 563a11ea89ab9089dd820bfe226be83ff152af17..58da85c9ad2d8bd3fde30cc6384c889c7dc7ebcf 100644 (file)
@@ -389,6 +389,7 @@ struct _mbfl_identify_filter {
        void (*filter_ctor)(mbfl_identify_filter *filter TSRMLS_DC);
        void (*filter_dtor)(mbfl_identify_filter *filter TSRMLS_DC);
        int (*filter_function)(int c, mbfl_identify_filter *filter TSRMLS_DC);
+       int (*get_rating_function)(int c, mbfl_identify_filter *filter TSRMLS_DC);
        int status;
        int flag;
        int score;
@@ -400,6 +401,7 @@ struct mbfl_identify_vtbl {
        void (*filter_ctor)(mbfl_identify_filter *filter TSRMLS_DC);
        void (*filter_dtor)(mbfl_identify_filter *filter TSRMLS_DC);
        int (*filter_function)(int c, mbfl_identify_filter *filter TSRMLS_DC);
+       int (*get_rating_function)(int c, mbfl_identify_filter *filter TSRMLS_DC);
 };
 
 mbfl_identify_filter * mbfl_identify_filter_new(enum mbfl_no_encoding encoding TSRMLS_DC);
@@ -458,6 +460,12 @@ mbfl_convert_encoding(mbfl_string *string, mbfl_string *result, enum mbfl_no_enc
 /*
  * identify encoding
  */
+/*
+ * Guess the encoding of string by summing per-character ratings for each
+ * encoding listed in elist (eliztsz entries) and returning the encoding with
+ * the largest non-zero sum.  Returns NULL when elist is NULL, when none of
+ * the listed encodings provides a rating function, or when nothing in the
+ * input was rated above zero.
+ */
+const mbfl_encoding *
+mbfl_guess_encoding(mbfl_string *string, enum mbfl_no_encoding *elist, int eliztsz TSRMLS_DC);
+
+/*
+ * Same as mbfl_guess_encoding(), but returns the name of the guessed
+ * encoding, or NULL when no encoding could be guessed.
+ */
+const char *
+mbfl_guess_encoding_name(mbfl_string *string, enum mbfl_no_encoding *elist, int eliztsz TSRMLS_DC);
+
 const mbfl_encoding *
 mbfl_identify_encoding(mbfl_string *string, enum mbfl_no_encoding *elist, int eliztsz TSRMLS_DC);
 
index 197ad3bc57ba8b6c35fabd947ef4092f0e75cd76..c0c82e759afdb6e64aec57891d903946dc6bb007 100644 (file)
@@ -194,6 +194,7 @@ function_entry mbstring_functions[] = {
        PHP_FE(mb_strimwidth,                           NULL)
        PHP_FE(mb_convert_encoding,             NULL)
        PHP_FE(mb_detect_encoding,              NULL)
+       PHP_FE(mb_guess_encoding,               NULL)
        PHP_FE(mb_convert_kana,                 NULL)
        PHP_FE(mb_encode_mimeheader,            NULL)
        PHP_FE(mb_decode_mimeheader,            NULL)
@@ -2607,6 +2608,83 @@ PHP_FUNCTION(mb_detect_encoding)
 
 
 
+/* {{{ proto string mb_guess_encoding(string str [, mixed encoding_list])
+   Returns the name of the encoding guessed for the given string, or false if no encoding could be guessed */
+PHP_FUNCTION(mb_guess_encoding)
+{
+       pval **arg_str, **arg_list;
+       mbfl_string string;
+       const char *ret;
+       enum mbfl_no_encoding *elist;
+       int size, *list, parsed;
+
+       if (ZEND_NUM_ARGS() == 1) {
+               if (zend_get_parameters_ex(1, &arg_str) == FAILURE) {
+                       WRONG_PARAM_COUNT;
+               }
+       } else if (ZEND_NUM_ARGS() == 2) {
+               if (zend_get_parameters_ex(2, &arg_str, &arg_list) == FAILURE) {
+                       WRONG_PARAM_COUNT;
+               }
+       } else {
+               WRONG_PARAM_COUNT;
+       }
+
+       /* make encoding list */
+       list = NULL;
+       size = 0;
+       if (ZEND_NUM_ARGS() >= 2) {
+               switch (Z_TYPE_PP(arg_list)) {
+               case IS_ARRAY:
+                       parsed = php_mbstring_parse_encoding_array(*arg_list, &list, &size, 0);
+                       break;
+               default:
+                       convert_to_string_ex(arg_list);
+                       parsed = php_mbstring_parse_encoding_list(Z_STRVAL_PP(arg_list), Z_STRLEN_PP(arg_list), &list, &size, 0);
+                       break;
+               }
+               if (!parsed && list != NULL) {
+                       /*
+                        * Discard a partially built list.  The pointer must be reset,
+                        * otherwise the efree() at the end of this function would free
+                        * the same block a second time.
+                        */
+                       efree(list);
+                       list = NULL;
+                       size = 0;
+               }
+               if (size <= 0) {
+                       php_error(E_WARNING, "%s() illegal argument",
+                                         get_active_function_name(TSRMLS_C));
+               }
+       }
+
+       /* use the caller's list when it is usable, the configured detect order otherwise */
+       if (size > 0 && list != NULL) {
+               elist = list;
+       } else {
+               elist = MBSTRG(current_detect_order_list);
+               size = MBSTRG(current_detect_order_list_size);
+       }
+
+       convert_to_string_ex(arg_str);
+       mbfl_string_init(&string);
+       string.no_language = MBSTRG(current_language);
+       string.val = Z_STRVAL_PP(arg_str);
+       string.len = Z_STRLEN_PP(arg_str);
+       ret = mbfl_guess_encoding_name(&string, elist, size TSRMLS_CC);
+       if (list != NULL) {
+               efree((void *)list);
+       }
+       if (ret != NULL) {
+               RETVAL_STRING((char *)ret, 1);
+       } else {
+               RETVAL_FALSE;
+       }
+}
+/* }}} */
+
+
+
 /* {{{ proto string mb_encode_mimeheader(string str [, string charset [, string transfer-encoding [, string linefeed]]])
    Converts the string to MIME "encoded-word" in the format of =?charset?(B|Q)?encoded_string?= */
 PHP_FUNCTION(mb_encode_mimeheader)
index 094ecc4cb323beb8459c3562e7a3b42e1c6d9fa7..e2d51501ffbd4e50b1de56da83a9ffe6c197a6c2 100644 (file)
@@ -94,6 +94,7 @@ PHP_FUNCTION(mb_strcut);
 PHP_FUNCTION(mb_strwidth);
 PHP_FUNCTION(mb_strimwidth);
 PHP_FUNCTION(mb_convert_encoding);
+PHP_FUNCTION(mb_guess_encoding);
 PHP_FUNCTION(mb_detect_encoding);
 PHP_FUNCTION(mb_convert_kana);
 PHP_FUNCTION(mb_encode_mimeheader);
index 74d02fef3466675f9285f77eae1dde8f393d5996..270032ea930358c15dbadf7c47455303e90cf5ed 100644 (file)
@@ -67,3 +67,40 @@ static const int koi8r_ucs_table_min = 0x80;
 static const int koi8r_ucs_table_len = (sizeof (koi8r_ucs_table) / sizeof (unsigned short));
 static const int koi8r_ucs_table_max = 0x80 + (sizeof (koi8r_ucs_table) / sizeof (unsigned short));
 
+
+
+/*
+ * Per-byte ratings for CP1251, used by mbfl_filt_get_rating_cp1251().
+ * Entry i is the rating of byte (cp1251_char_ratings_table_min + i); the
+ * 32 entries cover bytes 0xe0..0xff.  Bytes outside the table are rated 0
+ * by the lookup function.
+ * NOTE(review): the values are presumably relative frequencies of the
+ * lowercase Cyrillic letters in Russian text - confirm with the author.
+ */
+static const unsigned int cp1251_char_ratings_table[] = {
+       14985, 3207, 9044, 2847, 6015,18094, 2305, 3456,
+       15786, 2472, 6531, 7803, 6341,13494,21800, 6267,
+       10139,10398,13877, 5094,  536, 2201,  855, 2665, 
+        1127,  981,   99, 4460, 3805,  426, 1516, 4341
+};
+/* byte value that corresponds to table index 0 */
+static const int cp1251_char_ratings_table_min = 0xe0;
+/* number of entries in the table */
+static const int cp1251_char_ratings_table_len = (sizeof (cp1251_char_ratings_table) / sizeof (unsigned int));
+
+
+/*
+ * Per-byte ratings for CP866, used by mbfl_filt_get_rating_cp866().
+ * Entry i is the rating of byte (cp866_char_ratings_table_min + i).
+ *
+ * CP866 stores the lowercase Cyrillic letters in two runs: 0xa0..0xaf and
+ * 0xe0..0xef, with 0xb0..0xdf (not letters) in between.  The ratings are the
+ * same 32 values, in the same letter order, as cp1251_char_ratings_table
+ * (whose 0xe0..0xff run is contiguous), split across the two CP866 runs so
+ * that a given letter gets the same rating in every encoding.
+ * NOTE(review): the values are presumably relative frequencies of the
+ * lowercase Cyrillic letters in Russian text - confirm with the author.
+ */
+static const unsigned int cp866_char_ratings_table[] = {
+       /* 0xa0..0xaf */
+       14985, 3207, 9044, 2847, 6015,18094, 2305, 3456,
+       15786, 2472, 6531, 7803, 6341,13494,21800, 6267,
+       /* 0xb0..0xdf: not letters, rated 0 */
+           0,    0,    0,    0,    0,    0,    0,    0,
+           0,    0,    0,    0,    0,    0,    0,    0,
+           0,    0,    0,    0,    0,    0,    0,    0,
+           0,    0,    0,    0,    0,    0,    0,    0,
+           0,    0,    0,    0,    0,    0,    0,    0,
+           0,    0,    0,    0,    0,    0,    0,    0,
+       /* 0xe0..0xef */
+       10139,10398,13877, 5094,  536, 2201,  855, 2665,
+        1127,  981,   99, 4460, 3805,  426, 1516, 4341
+};
+/* byte value that corresponds to table index 0 */
+static const int cp866_char_ratings_table_min = 0xa0;
+/* number of entries in the table */
+static const int cp866_char_ratings_table_len = (sizeof (cp866_char_ratings_table) / sizeof (cp866_char_ratings_table[0]));
+
+
+/*
+ * Per-byte ratings for KOI8-R, used by mbfl_filt_get_rating_koi8r().
+ * Entry i is the rating of byte (koi8r_char_ratings_table_min + i); the
+ * 32 entries cover bytes 0xc0..0xdf.  Bytes outside the table are rated 0
+ * by the lookup function.
+ * NOTE(review): the values are presumably relative frequencies of the
+ * lowercase Cyrillic letters in Russian text - confirm with the author.
+ */
+static const unsigned int koi8r_char_ratings_table[] = {
+        1516,14985, 3207,  855, 6015,18094,  536, 2847, 
+     2201,15786, 2472, 6531, 7803, 6341,13494,21800,
+     6267, 4341,10139,10398,13877, 5094, 2305, 9044,
+     3805, 4460, 3456, 1127,  426,  981, 2665,   99
+};
+/* byte value that corresponds to table index 0 */
+static const int koi8r_char_ratings_table_min = 0xc0;
+/* number of entries in the table */
+static const int koi8r_char_ratings_table_len = (sizeof (koi8r_char_ratings_table) / sizeof (unsigned int));