More unicode upgrading notes

author Sara Golemon <pollita@php.net>

Tue, 17 Oct 2006 21:42:28 +0000 (21:42 +0000)

committer Sara Golemon <pollita@php.net>

Tue, 17 Oct 2006 21:42:28 +0000 (21:42 +0000)
author Sara Golemon <pollita@php.net>
Tue, 17 Oct 2006 21:42:28 +0000 (21:42 +0000)
committer Sara Golemon <pollita@php.net>
Tue, 17 Oct 2006 21:42:28 +0000 (21:42 +0000)
diff --git a/README.UNICODE-UPGRADES b/README.UNICODE-UPGRADES

index e35a660df9074a3da11e538e0f489e1df934a761..c38cff1eb66a9ffb42622dced0ea3ea71c43aa15 100644 (file)
--- a/README.UNICODE-UPGRADES
+++ b/README.UNICODE-UPGRADES
@@ -407,8 +407,8 @@ substr()
  This functions returns part of a string based on offset and length
  parameters.
  
-    void *str;
-    int32_t str_len, cp_len;
+    zstr str;
+    int str_len, cp_len;
      zend_uchar str_type;
  
      if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "tl|l", &str, &str_len, &str_type, &f, &l) == FAILURE) {
@@ -417,11 +417,11 @@ parameters.
  
  The first thing we notice is that the incoming string specifier is 't',
  which means that we can accept all 3 string types. The 'str' variable is
-declared as void*, because it can point to either UChar* or char*.
+declared as zstr, because it can point to either UChar* or char*.
  The actual type of the incoming string is stored in 'str_type' variable.
  
      if (str_type == IS_UNICODE) {
-        cp_len = u_countChar32(str, str_len);
+        cp_len = u_countChar32(str.u, str_len);
      } else {
          cp_len = str_len;
      }
@@ -435,10 +435,10 @@ string. Nothing new here. Then we locate the appropriate segment.
  
      if (str_type == IS_UNICODE) {
          int32_t start = 0, end = 0;
-        U16_FWD_N((UChar*)str, end, str_len, f);
+        U16_FWD_N(str.u, end, str_len, f);
          start = end;
-        U16_FWD_N((UChar*)str, end, str_len, l);
-        RETURN_UNICODEL((UChar*)str + start, end-start, 1);
+        U16_FWD_N(str.u, end, str_len, l);
+        RETURN_UNICODEL(str.u + start, end-start, ZSTR_DUPLICATE);
  
  Since codepoint (character) #n is not necessarily at offset #n in Unicode
  strings, we start at the beginning and iterate forward until we have gone
@@ -448,10 +448,10 @@ of codepoints specified by the offset. Once that's done, we can return the
  segment as a Unicode string.
  
      } else {
-        RETURN_STRINGL((char*)str + f, l, 1);
+        RETURN_STRINGL(str.s + f, l, ZSTR_DUPLICATE);
      }
  
-For native and binary types, we can return the segment directly.
+For native strings, we can return the segment directly.
  
  
  strrev()
@@ -486,9 +486,9 @@ strrev() obtains its single argument, a string, and unless the string is of
  Unicode type, processes it exactly as before, simply swapping bytes around.
  For Unicode case, the magic is like this:
  
-       int32_t i, x1, x2;
-       UChar32 ch;
-       UChar *u_s, *u_n, *u_p;
+    int32_t i, x1, x2;
+    UChar32 ch;
+    UChar *u_s, *u_n, *u_p;
  
      u_n = eumalloc(Z_USTRLEN_PP(str)+1);
      u_p = u_n;
@@ -525,6 +525,98 @@ Note that the code uses zend_codepoint_to_uchar() to convert full Unicode
  characters (UChar32 type) to 1 or 2 UTF-16 code units (UChar type).
  
  
+realpath()
+----------
+
+Filenames use their own converter as it's not uncommon, for example,
+to need to access files on a filesystem with latin1 entries while outputting
+UTF8 runtime content.
+
+The most common approach to parsing filenames can be found in realpath():
+
+zval **ppfilename;
+char *filename;
+int filename_len;
+
+if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "Z", &ppfilename) == FAILURE ||
+       php_stream_path_param_encode(ppfilename, &filename, &filename_len, REPORT_ERRORS, FG(default_context)) == FAILURE) {
+       return;
+}
+
+Here, the filename is taken first as a generic zval**, then converted (separating if necessary)
+and populated into local char* and int storage.  The filename will be converted according to
+unicode.filesystem_encoding unless the wrapper specified overrides this with its own conversion
+function (The http:// wrapper, for example, enforces utf8 conversion).
+
+
+rmdir()
+-------
+
+If the function accepts a context parameter, then this context should be used in place of FG(default_context):
+
+zval **ppdir, *zcontext = NULL;
+char *dir;
+int dir_len;
+
+if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "Z|r", &ppdir, &zcontext) == FAILURE) {
+       return;
+}
+
+context = php_stream_context_from_zval(zcontext, 0);
+if (php_stream_path_param_encode(ppdir, &dir, &dir_len, REPORT_ERRORS, context) == FAILURE) {
+       return;
+}
+
+
+sqlite_query()
+--------------
+
+If the function's underlying library expects a particular encoding (e.g. UTF8), then the alternate form of
+the string parameter may be used with zend_parse_parameters().
+
+char *sql;
+int sql_len;
+
+if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s&", &sql, &sql_len, UG(utf8_conv)) == FAILURE) {
+    return;
+}
+
+Converters
+==========
+
+Standard Converters
+-------------------
+
+The following converters (UConverter*) are initialized by Zend and are always available (regardless of UG(unicode) mode):
+  UG(utf8_conv)
+  UG(ascii_conv)
+  UG(fallback_encoding_conv) - UTF8 unless overridden by INI setting unicode.fallback_encoding
+
+Additional converters will be optionally initialized depending on INI settings:
+  UG(runtime_encoding_conv) - unicode.runtime_encoding
+   . Unicode output generated by a script will be encoded using this converter
+
+  UG(script_encoding_conv) - unicode.script_encoding
+   . Scripts read from disk will be decoded using this converter
+
+  UG(http_input_encoding_conv) - unicode.http_input_encoding
+   . HTTP Request data ($_GET / $_POST) will be decoded using this converter
+
+  UG(filesystem_encoding_conv) - unicode.filesystem_encoding
+   . Filenames and paths will be encoded using this converter
+
+
+Since these additional converters may not be instantiated (because their INI value is not set), all uses of these converters must
+be wrapped in ZEND_U_CONVERTER() for safety.  If the converter hasn't been instantiated, then UG(fallback_encoding_conv) will be
+used instead.
+
+For example, RETURN_RT_STRING("foo", ZSTR_DUPLICATE); expands out to:
+  RETURN_U_STRING(ZEND_U_CONVERTER(UG(runtime_encoding_conv)), "foo", ZSTR_DUPLICATE);
+
+Which uses UG(runtime_encoding_conv) if it's been set, otherwise using UG(fallback_encoding_conv).
+
+Note that the INI setting unicode.stream_encoding does not instantiate a UConverter* automatically for use by the process/thread;
+instead, it stores the value as a string for use during fopen() style calls where a UConverter* is instantiated for that particular stream.
  
  References
  ==========
author	Sara Golemon <pollita@php.net>
	Tue, 17 Oct 2006 21:42:28 +0000 (21:42 +0000)
committer	Sara Golemon <pollita@php.net>
	Tue, 17 Oct 2006 21:42:28 +0000 (21:42 +0000)