Update with info from README.UNICODE.

author Andrei Zmievski <andrei@php.net>

Wed, 10 Jan 2007 23:09:28 +0000 (23:09 +0000)

committer Andrei Zmievski <andrei@php.net>

Wed, 10 Jan 2007 23:09:28 +0000 (23:09 +0000)
author Andrei Zmievski <andrei@php.net>
Wed, 10 Jan 2007 23:09:28 +0000 (23:09 +0000)
committer Andrei Zmievski <andrei@php.net>
Wed, 10 Jan 2007 23:09:28 +0000 (23:09 +0000)
diff --git a/README.UNICODE-UPGRADES b/README.UNICODE-UPGRADES

index bb9e6cf4c5b9c17b8f0c2990673938317e35b698..797abf89d508d2a7d248ffb665277c80999e7bbb 100644 (file)
--- a/README.UNICODE-UPGRADES
+++ b/README.UNICODE-UPGRADES
@@ -6,6 +6,151 @@ Your first stop should be README.UNICODE: it covers the general Unicode
  functionality and concepts without going into technical implementation
  details.
  
+Internal Encoding
+=================
+
+UTF-16 is the internal encoding used for Unicode strings. UTF-16 consumes
+two bytes for any Unicode character in the Basic Multilingual Plane, which
+is where most of the world's current languages are represented. While it is
+less memory efficient for basic ASCII text, it simplifies processing and
+makes interfacing with ICU easier, since ICU uses UTF-16 for its internal
+processing as well.
+
+
+Zval Structure Changes
+======================
+
+For IS_UNICODE type, we add another structure to the union:
+
+    union {
+    ....
+        struct {
+            UChar *val;            /* Unicode string value */
+            int len;               /* number of UChar's */
+        } ustr;
+    ....
+    } value;
+
+This cleanly separates the two types of strings and helps preserve backwards
+compatibility.
+
+To optimize access to IS_STRING and IS_UNICODE storage at runtime, we need yet
+another structure:
+
+    union {
+    ....
+        struct {                    /* Universal string type */
+            zstr val;
+            int len;
+        } uni;
+    ....
+    } value;
+
+Where zstr is a union of char*, UChar*, and void*.
+
+
+Parameter Parsing API Modifications
+===================================
+
+There are now six new specifiers: 'u', 't', 'T', 'U', 'S', 'x', and a new '&'
+modifier.
+
+  't' specifier
+  -------------
+  This specifier indicates that the caller requires the incoming parameter to be
+  string data (IS_STRING, IS_UNICODE). The caller has to provide the storage for
+  the string value, length, and type.
+
+    void *str;
+    int len;
+    zend_uchar type;
+
+    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "t", &str, &len, &type) == FAILURE) {
+        return;
+    }
+    if (type == IS_UNICODE) {
+       /* process Unicode string */
+    } else {
+       /* process binary string */
+    }
+
+  For IS_STRING type, the length represents the number of bytes, and for
+  IS_UNICODE the number of UChar's. When converting other types (numbers,
+  booleans, etc) to strings, the exact behavior depends on the Unicode semantics
+  switch: if on, they are converted to IS_UNICODE, otherwise to IS_STRING.
+
+
+  'u' specifier
+  -------------
+  This specifier indicates that the caller requires the incoming parameter
+  to be a Unicode encoded string. If a non-Unicode string is passed, the engine
+  creates a copy of the string and automatically converts it to Unicode type before
+  passing it to the internal function. No such conversion is necessary for Unicode
+  strings, obviously.
+
+    UChar *str;
+    int len;
+
+    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "u", &str, &len) == FAILURE) {
+        return;
+    }
+    /* process Unicode string */
+
+    
+  'T' specifier
+  -------------
+  This specifier is useful when the function takes two or more strings and
+  operates on them. Using 't' specifier for each one would be somewhat
+  problematic if the passed-in strings are of mixed types, and multiple
+  checks need to be performed in order to do anything. All parameters
+  marked by the 'T' specifier are promoted to the same type.
+  
+  If at least one of the 'T' parameters is of Unicode type, then the rest of
+  them are converted to IS_UNICODE. Otherwise all 'T' parameters are converted to
+  IS_STRING type.
+
+
+    void *str1, *str2;
+    int len1, len2;
+    zend_uchar type1, type2;
+
+    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "TT", &str1, &len1,
+                             &type1, &str2, &len2, &type2) == FAILURE) {
+       return;
+    }
+    if (type1 == IS_UNICODE) {
+       /* process as Unicode, str2 is guaranteed to be Unicode as well */
+    } else {
+       /* process as binary string, str2 is guaranteed to be the same */
+    }
+
+
+  'x' specifier
+  -------------
+  This specifier acts as either 'u' or 's', depending on the value of the
+  Unicode semantics switch. If UG(unicode) is on, it behaves as 'u', and as
+  's' otherwise.
+
+The existing 's' specifier has been modified as well. If a Unicode string is
+passed in, it automatically copies and converts the string to the runtime
+encoding, and issues a warning. If a binary type is passed in, no conversion
+is necessary. The '&' modifier can be used after the 's' specifier to force
+a different converter instead.
+
+    char *str;
+    int len;
+
+    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s&", &str, &len, UG(utf8_conv)) == FAILURE) {
+        return;
+    }
+    /* here str is in UTF-8, if a Unicode string was passed in */
+
+The 'U' and 'S' specifiers are similar to 'u' and 's' but they are more strict
+about the type of the passed-in parameter. If 'U' is specified and a binary
+string is passed in, the engine will issue a warning instead of doing automatic
+conversion. The converse applies to the 'S' specifier.
+
+
  Working in Unicode World
  ========================
author	Andrei Zmievski <andrei@php.net>
	Wed, 10 Jan 2007 23:09:28 +0000 (23:09 +0000)
committer	Andrei Zmievski <andrei@php.net>
	Wed, 10 Jan 2007 23:09:28 +0000 (23:09 +0000)