From: Anatol Belski Date: Fri, 6 Apr 2018 16:14:11 +0000 (+0200) Subject: Expose functionality for NFKC_Casefold normalization X-Git-Tag: php-7.3.0alpha1~83 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=39301abbe19973e60ac4dbdb1d98da77fe7dfc52;p=php Expose functionality for NFKC_Casefold normalization --- diff --git a/UPGRADING b/UPGRADING index 9590be374f..0b3ed6176a 100644 --- a/UPGRADING +++ b/UPGRADING @@ -137,7 +137,6 @@ Intl: http://icu-project.org/apiref/icu4c/uspoof_8h.html . Added Normalizer::getRawDecomposition() and normalizer_get_raw_decomposition(), to retrieve the Decomposition_Mapping property of a character. - . Normalizer::NONE is deprecated, when PHP is linked with ICU 56 and above Standard: . Added is_countable() function, to check whether a value may be passed to @@ -170,6 +169,11 @@ JSON: FTP: . Set default transfer mode to binary + Intl: + . Normalizer::NONE is deprecated, when PHP is linked with ICU >= 56 + . Introduced Normalizer::FORM_KC_CF as Normalizer::normalize() argument + for NFKC_Casefold normalization, available when linked with ICU >= 56 + MBString: . The configuration option --with-libmbfl is no longer available. diff --git a/ext/intl/normalizer/normalizer.c b/ext/intl/normalizer/normalizer.c index 3a0d526b77..2f7555388e 100644 --- a/ext/intl/normalizer/normalizer.c +++ b/ext/intl/normalizer/normalizer.c @@ -51,6 +51,10 @@ void normalizer_register_constants( INIT_FUNC_ARGS ) NORMALIZER_EXPOSE_CLASS_CONST( NFC ); NORMALIZER_EXPOSE_CLASS_CONST( FORM_KC ); NORMALIZER_EXPOSE_CLASS_CONST( NFKC ); +#if U_ICU_VERSION_MAJOR_NUM >= 49 + NORMALIZER_EXPOSE_CLASS_CONST( FORM_KC_CF ); + NORMALIZER_EXPOSE_CLASS_CONST( NFKC_CF ); +#endif #undef NORMALIZER_EXPOSE_CUSTOM_CLASS_CONST #undef NORMALIZER_EXPOSE_CLASS_CONST diff --git a/ext/intl/normalizer/normalizer.h b/ext/intl/normalizer/normalizer.h index ddd2c38a94..8150f244f6 100644 --- a/ext/intl/normalizer/normalizer.h +++ b/ext/intl/normalizer/normalizer.h @@ -19,7 +19,7 @@ #include #include -#if U_ICU_VERSION_MAJOR_NUM < 56 +#if U_ICU_VERSION_MAJOR_NUM < 49 #include #define NORMALIZER_NONE UNORM_NONE @@ -44,6 +44,8 @@ #define NORMALIZER_NFC NORMALIZER_FORM_C #define NORMALIZER_FORM_KC 0x20 #define NORMALIZER_NFKC NORMALIZER_FORM_KC +#define NORMALIZER_FORM_KC_CF 0x30 +#define NORMALIZER_NFKC_CF NORMALIZER_FORM_KC_CF #define NORMALIZER_DEFAULT NORMALIZER_FORM_C #endif diff --git a/ext/intl/normalizer/normalizer_normalize.c b/ext/intl/normalizer/normalizer_normalize.c index 8fbe7d40c8..e9431f2372 100644 --- a/ext/intl/normalizer/normalizer_normalize.c +++ b/ext/intl/normalizer/normalizer_normalize.c @@ -51,6 +51,9 @@ static const UNormalizer2 *intl_get_normalizer(zend_long form, UErrorCode *err) case NORMALIZER_FORM_KD: return unorm2_getNFKDInstance(err); break; + case NORMALIZER_FORM_KC_CF: + return unorm2_getNFKCCasefoldInstance(err); + break; } *err = U_ILLEGAL_ARGUMENT_ERROR; @@ -146,6 +149,9 @@ PHP_FUNCTION( normalizer_normalize ) break; case NORMALIZER_FORM_C: case NORMALIZER_FORM_KC: +#if U_ICU_VERSION_MAJOR_NUM >= 56 + case NORMALIZER_FORM_KC_CF: +#endif break; default: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, @@ -279,6 +285,9 @@ PHP_FUNCTION( normalizer_is_normalized ) case NORMALIZER_FORM_KD: case NORMALIZER_FORM_C: case NORMALIZER_FORM_KC: +#if U_ICU_VERSION_MAJOR_NUM >= 56 + case NORMALIZER_FORM_KC_CF: +#endif break; default: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, diff --git a/ext/intl/tests/normalizer_normalize_kc_cf.phpt b/ext/intl/tests/normalizer_normalize_kc_cf.phpt new file mode 100644 index 0000000000..040cd29e0b --- /dev/null +++ b/ext/intl/tests/normalizer_normalize_kc_cf.phpt @@ -0,0 +1,107 @@ +--TEST-- +normalize() NFKC_Casefold +--SKIPIF-- + + +--FILE-- + 'UNORM_FORM_KC_CF', + ); + + /* just make sure all the form constants are defined as in the api spec */ + if (Normalizer::FORM_C != Normalizer::NFC) { + $res_str .= "Invalid normalization form declarations!\n"; + } + + $char_a_diaeresis = "\xC3\xA4"; // 'LATIN SMALL LETTER A WITH DIAERESIS' (U+00E4) + $char_a_ring = "\xC3\xA5"; // 'LATIN SMALL LETTER A WITH RING ABOVE' (U+00E5) + $char_o_diaeresis = "\xC3\xB6"; // 'LATIN SMALL LETTER O WITH DIAERESIS' (U+00F6) + + $char_angstrom_sign = "\xE2\x84\xAB"; // 'ANGSTROM SIGN' (U+212B) + $char_A_ring = "\xC3\x85"; // 'LATIN CAPITAL LETTER A WITH RING ABOVE' (U+00C5) + + $char_ohm_sign = "\xE2\x84\xA6"; // 'OHM SIGN' (U+2126) + $char_omega = "\xCE\xA9"; // 'GREEK CAPITAL LETTER OMEGA' (U+03A9) + + $char_combining_ring_above = "\xCC\x8A"; // 'COMBINING RING ABOVE' (U+030A) + + $char_fi_ligature = "\xEF\xAC\x81"; // 'LATIN SMALL LIGATURE FI' (U+FB01) + + $char_long_s_dot = "\xE1\xBA\x9B"; // 'LATIN SMALL LETTER LONG S WITH DOT ABOVE' (U+1E9B) + + $strs = array( + 'ABC', + 'abc', + $char_a_diaeresis . '||' . $char_a_ring . '||' . $char_o_diaeresis, + $char_angstrom_sign . '||' . $char_A_ring . '||' . 'A' . $char_combining_ring_above, + $char_ohm_sign . '||' . $char_omega, + $char_fi_ligature, + $char_long_s_dot, + ); + + foreach( $forms as $form ) + { + foreach( $strs as $str ) + { + if (Normalizer::NONE == $form) { + /* Hide deprecation warning. */ + $str_norm = @ut_norm_normalize( $str, $form ); + } else { + $str_norm = ut_norm_normalize( $str, $form ); + } + $error_code = intl_get_error_code(); + $error_message = intl_get_error_message(); + + $str_hex = urlencode($str); + $str_norm_hex = urlencode($str_norm); + $res_str .= "'$str_hex' normalized to form '{$forms_str[$form]}' is '$str_norm_hex'" + . "\terror info: '$error_message' ($error_code)\n" + . ""; + + $is_norm = ut_norm_is_normalized( $str, $form ); + $error_code = intl_get_error_code(); + $error_message = intl_get_error_message(); + + $res_str .= " is in form '{$forms_str[$form]}'? = " . ($is_norm ? "yes" : "no") + . "\terror info: '$error_message' ($error_code)\n" + . ""; + } + } + + return $res_str; +} + +include_once( 'ut_common.inc' ); +ut_run(); + +?> +--EXPECT-- +'ABC' normalized to form 'UNORM_FORM_KC_CF' is 'abc' error info: 'U_ZERO_ERROR' (0) + is in form 'UNORM_FORM_KC_CF'? = no error info: 'U_ZERO_ERROR' (0) +'abc' normalized to form 'UNORM_FORM_KC_CF' is 'abc' error info: 'U_ZERO_ERROR' (0) + is in form 'UNORM_FORM_KC_CF'? = yes error info: 'U_ZERO_ERROR' (0) +'%C3%A4%7C%7C%C3%A5%7C%7C%C3%B6' normalized to form 'UNORM_FORM_KC_CF' is '%C3%A4%7C%7C%C3%A5%7C%7C%C3%B6' error info: 'U_ZERO_ERROR' (0) + is in form 'UNORM_FORM_KC_CF'? = yes error info: 'U_ZERO_ERROR' (0) +'%E2%84%AB%7C%7C%C3%85%7C%7CA%CC%8A' normalized to form 'UNORM_FORM_KC_CF' is '%C3%A5%7C%7C%C3%A5%7C%7C%C3%A5' error info: 'U_ZERO_ERROR' (0) + is in form 'UNORM_FORM_KC_CF'? = no error info: 'U_ZERO_ERROR' (0) +'%E2%84%A6%7C%7C%CE%A9' normalized to form 'UNORM_FORM_KC_CF' is '%CF%89%7C%7C%CF%89' error info: 'U_ZERO_ERROR' (0) + is in form 'UNORM_FORM_KC_CF'? = no error info: 'U_ZERO_ERROR' (0) +'%EF%AC%81' normalized to form 'UNORM_FORM_KC_CF' is 'fi' error info: 'U_ZERO_ERROR' (0) + is in form 'UNORM_FORM_KC_CF'? = no error info: 'U_ZERO_ERROR' (0) +'%E1%BA%9B' normalized to form 'UNORM_FORM_KC_CF' is '%E1%B9%A1' error info: 'U_ZERO_ERROR' (0) + is in form 'UNORM_FORM_KC_CF'? = no error info: 'U_ZERO_ERROR' (0)