granicus.if.org Git - python/commitdiff
Two changes to improve (I hope) Unicode support.
author: Guido van Rossum <guido@python.org>
Thu, 4 May 2000 15:07:16 +0000 (15:07 +0000)
committer: Guido van Rossum <guido@python.org>
Thu, 4 May 2000 15:07:16 +0000 (15:07 +0000)
1. In Tcl 8.2 and later, use Tcl_NewUnicodeObj() when passing a Python
Unicode object rather than going through UTF-8.  (This function
doesn't exist in Tcl 8.1, so there the original UTF-8 code is still
used; in Tcl 8.0 there is no support for Unicode.)  This assumes that
Tcl_UniChar is the same thing as Py_UNICODE; a run-time error is
issued if this is not the case.

2. In Tcl 8.1 and later (i.e., whenever Tcl supports Unicode), when a
string returned from Tcl contains bytes with the top bit set, we
assume it is encoded in UTF-8, and decode it into a Unicode string
object.

Notes:

- Passing Unicode strings to Tcl 8.0 does not do the right thing; this
isn't worth fixing.

- When passing an 8-bit string to Tcl 8.1 or later that has bytes with
the top bit set, Tcl tries to interpret it as UTF-8; it seems to fall
back on Latin-1 for non-UTF-8 bytes.  I'm not sure what to do about
this besides telling the user to disambiguate such strings by
converting them to Unicode (forcing the user to be explicit about the
encoding).

- Obviously it won't be possible to get binary data out of Tk this
way.  Do we need that ability?  How to do it?

Modules/_tkinter.c

index 15cc7e750a5efd3cc3b6078071c5527964551389..882715f12db574fc50e26ca1c3249084c95b49c1 100644 (file)
@@ -550,6 +550,8 @@ AsObj(value)
                return result;
        }
        else if (PyUnicode_Check(value)) {
+#if TKMAJORMINOR <= 8001
+               /* In Tcl 8.1 we must use UTF-8 */
                PyObject* utf8 = PyUnicode_AsUTF8String (value);
                if (!utf8)
                        return 0;
@@ -557,6 +559,17 @@ AsObj(value)
                                         PyString_GET_SIZE (utf8));
                Py_DECREF(utf8);
                return result;
+#else /* TKMAJORMINOR > 8001 */
+               /* In Tcl 8.2 and later, use Tcl_NewUnicodeObj() */
+               if (sizeof(Py_UNICODE) != sizeof(Tcl_UniChar)) {
+                       /* XXX Should really test this at compile time */
+                       PyErr_SetString(PyExc_SystemError,
+                                       "Py_UNICODE and Tcl_UniChar differ in size");
+                       return 0;
+               }
+               return Tcl_NewUnicodeObj(PyUnicode_AS_UNICODE(value),
+                                        PyUnicode_GET_SIZE(value));
+#endif /* TKMAJORMINOR > 8001 */
        }
        else {
                PyObject *v = PyObject_Str(value);
@@ -624,10 +637,26 @@ Tkapp_Call(self, args)
        ENTER_OVERLAP
        if (i == TCL_ERROR)
                Tkinter_Error(self);
-       else
+       else {
                /* We could request the object result here, but doing
                   so would confuse applications that expect a string. */
-               res = PyString_FromString(Tcl_GetStringResult(interp));
+               char *s = Tcl_GetStringResult(interp);
+               char *p = s;
+               /* If the result contains any bytes with the top bit set,
+                  it's UTF-8 and we should decode it to Unicode */
+               while (*p != '\0') {
+                       if (*p & 0x80)
+                               break;
+                       p++;
+               }
+               if (*p == '\0')
+                       res = PyString_FromStringAndSize(s, (int)(p-s));
+               else {
+                       /* Convert UTF-8 to Unicode string */
+                       p = strchr(p, '\0');
+                       res = PyUnicode_DecodeUTF8(s, (int)(p-s), "ignore");
+               }
+       }
 
        LEAVE_OVERLAP_TCL