Expose functionality for NFKC_Casefold normalization

author Anatol Belski <ab@php.net>

Fri, 6 Apr 2018 16:14:11 +0000 (18:14 +0200)

committer Anatol Belski <ab@php.net>

Fri, 6 Apr 2018 16:14:11 +0000 (18:14 +0200)
author Anatol Belski <ab@php.net>
Fri, 6 Apr 2018 16:14:11 +0000 (18:14 +0200)
committer Anatol Belski <ab@php.net>
Fri, 6 Apr 2018 16:14:11 +0000 (18:14 +0200)
diff --git a/UPGRADING b/UPGRADING

index 9590be374f17ae95856a3142a474a7c924550462..0b3ed6176a49e57f283fdd219527219c8e8debf5 100644 (file)
--- a/UPGRADING
+++ b/UPGRADING
@@ -137,7 +137,6 @@ Intl:
      http://icu-project.org/apiref/icu4c/uspoof_8h.html
    . Added Normalizer::getRawDecomposition() and normalizer_get_raw_decomposition(),
      to retrieve the Decomposition_Mapping property of a character.
-  . Normalizer::NONE is deprecated, when PHP is linked with ICU 56 and above
  
  Standard:
    . Added is_countable() function, to check whether a value may be passed to
@@ -170,6 +169,11 @@ JSON:
   FTP:
    . Set default transfer mode to binary
  
+ Intl:
+  . Normalizer::NONE is deprecated, when PHP is linked with ICU >= 56
+  . Introduced Normalizer::FORM_KC_CF as Normalizer::normalize() argument
+    for NFKC_Casefold normalization, available when linked with ICU >= 56
+
   MBString:
    . The configuration option --with-libmbfl is no longer available.
  
diff --git a/ext/intl/normalizer/normalizer.c b/ext/intl/normalizer/normalizer.c

index 3a0d526b778b1580969055e96d0e94f7a7423dc9..2f7555388ef0188bd7987368c9f55c729c3d1542 100644 (file)
--- a/ext/intl/normalizer/normalizer.c
+++ b/ext/intl/normalizer/normalizer.c
@@ -51,6 +51,10 @@ void normalizer_register_constants( INIT_FUNC_ARGS )
         NORMALIZER_EXPOSE_CLASS_CONST( NFC );
         NORMALIZER_EXPOSE_CLASS_CONST( FORM_KC );
         NORMALIZER_EXPOSE_CLASS_CONST( NFKC );
+#if U_ICU_VERSION_MAJOR_NUM >= 49
+       NORMALIZER_EXPOSE_CLASS_CONST( FORM_KC_CF );
+       NORMALIZER_EXPOSE_CLASS_CONST( NFKC_CF );
+#endif
  
         #undef NORMALIZER_EXPOSE_CUSTOM_CLASS_CONST
         #undef NORMALIZER_EXPOSE_CLASS_CONST
diff --git a/ext/intl/normalizer/normalizer.h b/ext/intl/normalizer/normalizer.h

index ddd2c38a946e3c48c729d8ed1159c2373189ab20..8150f244f60cc1f94208b9aedce1160bf7523220 100644 (file)
--- a/ext/intl/normalizer/normalizer.h
+++ b/ext/intl/normalizer/normalizer.h
@@ -19,7 +19,7 @@
  
  #include <php.h>
  #include <unicode/utypes.h>
-#if U_ICU_VERSION_MAJOR_NUM < 56
+#if U_ICU_VERSION_MAJOR_NUM < 49
  #include <unicode/unorm.h>
  
  #define NORMALIZER_NONE UNORM_NONE
@@ -44,6 +44,8 @@
  #define NORMALIZER_NFC NORMALIZER_FORM_C
  #define NORMALIZER_FORM_KC 0x20
  #define NORMALIZER_NFKC NORMALIZER_FORM_KC
+#define NORMALIZER_FORM_KC_CF 0x30
+#define NORMALIZER_NFKC_CF NORMALIZER_FORM_KC_CF
  #define NORMALIZER_DEFAULT NORMALIZER_FORM_C
  #endif
  
diff --git a/ext/intl/normalizer/normalizer_normalize.c b/ext/intl/normalizer/normalizer_normalize.c

index 8fbe7d40c89156071528c689e44173858c2062de..e9431f2372cf818afce7b2e894c68bb77aa6c872 100644 (file)
--- a/ext/intl/normalizer/normalizer_normalize.c
+++ b/ext/intl/normalizer/normalizer_normalize.c
@@ -51,6 +51,9 @@ static const UNormalizer2 *intl_get_normalizer(zend_long form, UErrorCode *err)
                 case NORMALIZER_FORM_KD:
                         return unorm2_getNFKDInstance(err);
                         break;
+               case NORMALIZER_FORM_KC_CF:
+                       return unorm2_getNFKCCasefoldInstance(err);
+                       break;
         }
  
         *err = U_ILLEGAL_ARGUMENT_ERROR;
@@ -146,6 +149,9 @@ PHP_FUNCTION( normalizer_normalize )
                         break;
                 case NORMALIZER_FORM_C:
                 case NORMALIZER_FORM_KC:
+#if U_ICU_VERSION_MAJOR_NUM >= 56
+               case NORMALIZER_FORM_KC_CF:
+#endif
                         break;
                 default:
                         intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
@@ -279,6 +285,9 @@ PHP_FUNCTION( normalizer_is_normalized )
                 case NORMALIZER_FORM_KD:
                 case NORMALIZER_FORM_C:
                 case NORMALIZER_FORM_KC:
+#if U_ICU_VERSION_MAJOR_NUM >= 56
+               case NORMALIZER_FORM_KC_CF:
+#endif
                         break;
                 default:
                         intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
diff --git a/ext/intl/tests/normalizer_normalize_kc_cf.phpt b/ext/intl/tests/normalizer_normalize_kc_cf.phpt

new file mode 100644 (file)

index 0000000..040cd29
--- /dev/null
+++ b/ext/intl/tests/normalizer_normalize_kc_cf.phpt
@@ -0,0 +1,107 @@
+--TEST--
+normalize() NFKC_Casefold
+--SKIPIF--
+<?php if (!extension_loaded('intl')) print 'skip'; ?>
+<?php if (!defined('Normalizer::FORM_KC_CF')) print 'skip'; ?>
+--FILE--
+<?php
+
+/*
+ * Normalize each sample string with Normalizer::FORM_KC_CF and ask whether the
+ * input was already in that form, going through the ut_norm_*() test helpers
+ * (per the original note: exercised with procedural and object methods).
+ */
+function ut_main()
+{
+       $res_str = '';
+       /* Forms under test; only NFKC_Casefold is listed here. */
+       $forms = array(
+               Normalizer::FORM_KC_CF,
+       );
+       /* Label printed for each form in the report lines. */
+       $forms_str = array (
+               Normalizer::FORM_KC_CF => 'UNORM_FORM_KC_CF',
+       );
+
+       /* Sanity check on the constant aliases; only the FORM_C / NFC pair is compared. */
+       if (Normalizer::FORM_C != Normalizer::NFC) {
+                       $res_str .= "Invalid normalization form declarations!\n";
+       }
+       /* Sample characters, written as raw UTF-8 byte sequences. */
+       $char_a_diaeresis = "\xC3\xA4"; // 'LATIN SMALL LETTER A WITH DIAERESIS' (U+00E4)
+       $char_a_ring = "\xC3\xA5";              // 'LATIN SMALL LETTER A WITH RING ABOVE' (U+00E5)
+       $char_o_diaeresis = "\xC3\xB6";    // 'LATIN SMALL LETTER O WITH DIAERESIS' (U+00F6)
+
+       $char_angstrom_sign = "\xE2\x84\xAB"; // 'ANGSTROM SIGN' (U+212B)
+       $char_A_ring = "\xC3\x85";      // 'LATIN CAPITAL LETTER A WITH RING ABOVE' (U+00C5)
+
+       $char_ohm_sign = "\xE2\x84\xA6";        // 'OHM SIGN' (U+2126)
+       $char_omega = "\xCE\xA9";  // 'GREEK CAPITAL LETTER OMEGA' (U+03A9)
+
+       $char_combining_ring_above = "\xCC\x8A";  // 'COMBINING RING ABOVE' (U+030A)
+
+       $char_fi_ligature = "\xEF\xAC\x81";  // 'LATIN SMALL LIGATURE FI' (U+FB01)
+
+       $char_long_s_dot = "\xE1\xBA\x9B";      // 'LATIN SMALL LETTER LONG S WITH DOT ABOVE' (U+1E9B)
+       /* Inputs: plain ASCII plus '||'-separated groups built from the characters above. */
+       $strs = array(
+               'ABC',
+               'abc',
+               $char_a_diaeresis . '||' . $char_a_ring . '||' . $char_o_diaeresis,
+               $char_angstrom_sign . '||' . $char_A_ring . '||' . 'A' . $char_combining_ring_above,
+               $char_ohm_sign . '||' . $char_omega,
+               $char_fi_ligature,
+               $char_long_s_dot,
+       );
+       
+       foreach( $forms as $form )
+       {
+               foreach( $strs as $str )
+               {
+                       if (Normalizer::NONE == $form) {
+                               /* Hide deprecation warning. NOTE(review): $forms holds only FORM_KC_CF, so this branch looks unreachable here - confirm. */
+                               $str_norm = @ut_norm_normalize( $str, $form );
+                       } else {
+                               $str_norm = ut_norm_normalize( $str, $form );
+                       }
+                       $error_code = intl_get_error_code();
+                       $error_message = intl_get_error_message();
+                       /* Despite the *_hex names these are urlencode()d, so non-ASCII bytes print as %XX. */
+                       $str_hex = urlencode($str);
+                       $str_norm_hex = urlencode($str_norm);
+                       $res_str .= "'$str_hex' normalized to form '{$forms_str[$form]}' is '$str_norm_hex'" 
+                                        .      "\terror info: '$error_message' ($error_code)\n" 
+                                        .      "";
+                       /* Query the original input ($str), not the normalized result. */
+                       $is_norm = ut_norm_is_normalized( $str, $form );
+                       $error_code = intl_get_error_code();
+                       $error_message = intl_get_error_message();
+
+                       $res_str .= "           is in form '{$forms_str[$form]}'? = " . ($is_norm ? "yes" : "no") 
+                                        .      "\terror info: '$error_message' ($error_code)\n"
+                                        .      "";
+               }
+       }
+
+       return $res_str;
+}
+/* ut_norm_*() and ut_run() are not defined in this file; presumably supplied by ut_common.inc. */
+include_once( 'ut_common.inc' );
+ut_run();
+
+?>
+--EXPECT--
+'ABC' normalized to form 'UNORM_FORM_KC_CF' is 'abc'   error info: 'U_ZERO_ERROR' (0)
+               is in form 'UNORM_FORM_KC_CF'? = no     error info: 'U_ZERO_ERROR' (0)
+'abc' normalized to form 'UNORM_FORM_KC_CF' is 'abc'   error info: 'U_ZERO_ERROR' (0)
+               is in form 'UNORM_FORM_KC_CF'? = yes    error info: 'U_ZERO_ERROR' (0)
+'%C3%A4%7C%7C%C3%A5%7C%7C%C3%B6' normalized to form 'UNORM_FORM_KC_CF' is '%C3%A4%7C%7C%C3%A5%7C%7C%C3%B6'     error info: 'U_ZERO_ERROR' (0)
+               is in form 'UNORM_FORM_KC_CF'? = yes    error info: 'U_ZERO_ERROR' (0)
+'%E2%84%AB%7C%7C%C3%85%7C%7CA%CC%8A' normalized to form 'UNORM_FORM_KC_CF' is '%C3%A5%7C%7C%C3%A5%7C%7C%C3%A5' error info: 'U_ZERO_ERROR' (0)
+               is in form 'UNORM_FORM_KC_CF'? = no     error info: 'U_ZERO_ERROR' (0)
+'%E2%84%A6%7C%7C%CE%A9' normalized to form 'UNORM_FORM_KC_CF' is '%CF%89%7C%7C%CF%89'  error info: 'U_ZERO_ERROR' (0)
+               is in form 'UNORM_FORM_KC_CF'? = no     error info: 'U_ZERO_ERROR' (0)
+'%EF%AC%81' normalized to form 'UNORM_FORM_KC_CF' is 'fi'      error info: 'U_ZERO_ERROR' (0)
+               is in form 'UNORM_FORM_KC_CF'? = no     error info: 'U_ZERO_ERROR' (0)
+'%E1%BA%9B' normalized to form 'UNORM_FORM_KC_CF' is '%E1%B9%A1'       error info: 'U_ZERO_ERROR' (0)
+               is in form 'UNORM_FORM_KC_CF'? = no     error info: 'U_ZERO_ERROR' (0)
author	Anatol Belski <ab@php.net>
	Fri, 6 Apr 2018 16:14:11 +0000 (18:14 +0200)
committer	Anatol Belski <ab@php.net>
	Fri, 6 Apr 2018 16:14:11 +0000 (18:14 +0200)
UPGRADING		patch | blob | history
ext/intl/normalizer/normalizer.c		patch | blob | history
ext/intl/normalizer/normalizer.h		patch | blob | history
ext/intl/normalizer/normalizer_normalize.c		patch | blob | history
ext/intl/tests/normalizer_normalize_kc_cf.phpt	[new file with mode: 0644]	patch | blob