Slightly revised version of patch #1538956:

author Marc-André Lemburg <mal@egenix.com>

Mon, 14 Aug 2006 10:55:19 +0000 (10:55 +0000)

committer Marc-André Lemburg <mal@egenix.com>

Mon, 14 Aug 2006 10:55:19 +0000 (10:55 +0000)
author Marc-André Lemburg <mal@egenix.com>
Mon, 14 Aug 2006 10:55:19 +0000 (10:55 +0000)
committer Marc-André Lemburg <mal@egenix.com>
Mon, 14 Aug 2006 10:55:19 +0000 (10:55 +0000)
diff --git a/Doc/api/concrete.tex b/Doc/api/concrete.tex

index 4c7487c4ffc8962c261787d7476ff6a9506ad476..cd9d8d561a32fc0765f12bc6c60356a15a4b98a7 100644 (file)
--- a/Doc/api/concrete.tex
+++ b/Doc/api/concrete.tex
@@ -1560,6 +1560,31 @@ They all return \NULL{} or \code{-1} if an exception occurs.
    greater than, respectively.
  \end{cfuncdesc}
  
+\begin{cfuncdesc}{int}{PyUnicode_RichCompare}{PyObject *left, 
+                                              PyObject *right, 
+                                              int op}
+
+% This entry could use some polishing - my TeX is too
+% rusty these days... (MAL)
+
+  Rich compare two strings and return one of the following:
+\begin{verbatim}
+   - NULL in case an exception was raised
+   - Py_True or Py_False for successful comparisons
+   - Py_NotImplemented in case the type combination is unknown
+\end{verbatim}
+
+   Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
+   case the conversion of the arguments to Unicode fails with a
+   UnicodeDecodeError.
+
+   Possible values for \var{op}:
+\begin{verbatim}
+     Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
+\end{verbatim}
+
+\end{cfuncdesc}
+
  \begin{cfuncdesc}{PyObject*}{PyUnicode_Format}{PyObject *format,
                                                PyObject *args}
    Return a new string object from \var{format} and \var{args}; this
diff --git a/Doc/api/exceptions.tex b/Doc/api/exceptions.tex

index cb75d5051b71257480dd64879ed7a014cb81366c..057c1da9254e87f1f08d03e3dd78813b90933951 100644 (file)
--- a/Doc/api/exceptions.tex
+++ b/Doc/api/exceptions.tex
@@ -288,10 +288,11 @@ for each thread.
    names are \samp{PyExc_} followed by the Python exception name.
    These have the type \ctype{PyObject*}; they are all class objects.
    Their names are \cdata{PyExc_Warning}, \cdata{PyExc_UserWarning},
-  \cdata{PyExc_DeprecationWarning}, \cdata{PyExc_SyntaxWarning},
-  \cdata{PyExc_RuntimeWarning}, and \cdata{PyExc_FutureWarning}.
-  \cdata{PyExc_Warning} is a subclass of \cdata{PyExc_Exception}; the
-  other warning categories are subclasses of \cdata{PyExc_Warning}.
+  \cdata{PyExc_UnicodeWarning}, \cdata{PyExc_DeprecationWarning},
+  \cdata{PyExc_SyntaxWarning}, \cdata{PyExc_RuntimeWarning}, and
+  \cdata{PyExc_FutureWarning}.  \cdata{PyExc_Warning} is a subclass of
+  \cdata{PyExc_Exception}; the other warning categories are subclasses
+  of \cdata{PyExc_Warning}.
  
    For information about warning control, see the documentation for the
    \module{warnings} module and the \programopt{-W} option in the
diff --git a/Doc/lib/libexcs.tex b/Doc/lib/libexcs.tex

index bef8bf1e6c8d8f415e8ab331c0a0704afd0d62aa..6d2a3c5d2f34070dce571d4692ffa29d5a4646da 100644 (file)
--- a/Doc/lib/libexcs.tex
+++ b/Doc/lib/libexcs.tex
@@ -456,6 +456,11 @@ Base class for warnings about probable mistakes in module imports.
  \versionadded{2.5}
  \end{excdesc}
  
+\begin{excdesc}{UnicodeWarning}
+Base class for warnings related to Unicode.
+\versionadded{2.5}
+\end{excdesc}
+
  The class hierarchy for built-in exceptions is:
  
  \verbatiminput{../../Lib/test/exception_hierarchy.txt}
diff --git a/Doc/lib/libwarnings.tex b/Doc/lib/libwarnings.tex

index 08c0340241aeb51beeee069a016188b7d5f7aa8e..a37a9f5334816684c8fc6ab3455238c9c7782bb5 100644 (file)
--- a/Doc/lib/libwarnings.tex
+++ b/Doc/lib/libwarnings.tex
@@ -76,6 +76,9 @@ features that will be deprecated in the future (ignored by default).}
  
  \lineii{ImportWarning}{Base category for warnings triggered during the
  process of importing a module (ignored by default).}
+
+\lineii{UnicodeWarning}{Base category for warnings related to Unicode.}
+
  \end{tableii}
  
  While these are technically built-in exceptions, they are documented
diff --git a/Include/pyerrors.h b/Include/pyerrors.h

index ae1d9905eb641e97bbbef51520f166ffef2f76f5..9532e32b469e8897a509fc6a3dedc5f8ba2f4c96 100644 (file)
--- a/Include/pyerrors.h
+++ b/Include/pyerrors.h
@@ -173,6 +173,7 @@ PyAPI_DATA(PyObject *) PyExc_SyntaxWarning;
  PyAPI_DATA(PyObject *) PyExc_RuntimeWarning;
  PyAPI_DATA(PyObject *) PyExc_FutureWarning;
  PyAPI_DATA(PyObject *) PyExc_ImportWarning;
+PyAPI_DATA(PyObject *) PyExc_UnicodeWarning;
  
  
  /* Convenience functions */
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h

index c7e07a86ef4bdcbb1f2e4484e5b1da832912b7dd..33aa185814abe39c7489c3238ba47bdb17f3e840 100644 (file)
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -189,6 +189,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
  # define PyUnicode_RSplit PyUnicodeUCS2_RSplit
  # define PyUnicode_Replace PyUnicodeUCS2_Replace
  # define PyUnicode_Resize PyUnicodeUCS2_Resize
+# define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare
  # define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding
  # define PyUnicode_Split PyUnicodeUCS2_Split
  # define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines
@@ -266,6 +267,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
  # define PyUnicode_RSplit PyUnicodeUCS4_RSplit
  # define PyUnicode_Replace PyUnicodeUCS4_Replace
  # define PyUnicode_Resize PyUnicodeUCS4_Resize
+# define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare
  # define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding
  # define PyUnicode_Split PyUnicodeUCS4_Split
  # define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines
@@ -1139,6 +1141,28 @@ PyAPI_FUNC(int) PyUnicode_Compare(
      PyObject *right            /* Right string */
      );
  
+/* Rich compare two strings and return one of the following:
+
+   - NULL in case an exception was raised
+   - Py_True or Py_False for successful comparisons
+   - Py_NotImplemented in case the type combination is unknown
+
+   Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
+   case the conversion of the arguments to Unicode fails with a
+   UnicodeDecodeError.
+
+   Possible values for op:
+
+     Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
+
+*/
+
+PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
+    PyObject *left,            /* Left string */ 
+    PyObject *right,           /* Right string */
+    int op                     /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
+    );
+
  /* Apply a argument tuple or dictionary to a format string and return
     the resulting Unicode string. */
  
diff --git a/Lib/test/exception_hierarchy.txt b/Lib/test/exception_hierarchy.txt

index 58131d7aaf7e21ce0f60873618c99358766d9954..a03f7bbd71876130ad6642844813e235614baeb0 100644 (file)
--- a/Lib/test/exception_hierarchy.txt
+++ b/Lib/test/exception_hierarchy.txt
@@ -45,3 +45,4 @@ BaseException
             +-- UserWarning
             +-- FutureWarning
            +-- ImportWarning
+          +-- UnicodeWarning
diff --git a/Misc/NEWS b/Misc/NEWS

index 5894c16be118e6e9a53c117d61268457e88c25a2..981c17bf594c2cc62ba76dbf5d3edef1059f94fd 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,18 +12,18 @@ What's New in Python 2.5 release candidate 1?
  Core and builtins
  -----------------
  
-- Fix segfault when doing string formatting on subclasses of long.
-
-- Fix bug related to __len__ functions using values > 2**32 on 64-bit machines
-  with new-style classes.
-  
-- Fix bug related to __len__ functions returning negative values with
-  classic classes.
-  
-- Patch #1538606, Fix __index__() clipping.  There were some problems
-  discovered with the API and how integers that didn't fit into Py_ssize_t
-  were handled.  This patch attempts to provide enough alternatives
-  to effectively use __index__.
+- Unicode objects will no longer raise an exception when an equal or
+  unequal comparison with a string causes a UnicodeDecodeError,
+  e.g. as the result of a decoding failure.
+
+  Instead, the equal (==) and unequal (!=) comparison operators will
+  now issue a UnicodeWarning and interpret the two objects as
+  unequal. The UnicodeWarning can be filtered as desired using
+  the warning framework, e.g. silenced completely, turned into an
+  exception, logged, etc.
+
+  Note that compare operators other than equal and unequal will still
+  raise UnicodeDecodeError exceptions as they've always done.
  
  - Bug #1536021: __hash__ may now return long int; the final hash
    value is obtained by invoking hash on the long int.
@@ -99,6 +99,8 @@ Build
  C API
  -----
  
+- New API for Unicode rich comparisons: PyUnicode_RichCompare()
+
  - Bug #1069160.  Internal correctness changes were made to
    ``PyThreadState_SetAsyncExc()``.  A test case was added, and
    the documentation was changed to state that the return value
diff --git a/Objects/exceptions.c b/Objects/exceptions.c

index be9627c4a60d45fe70115e6ac27ed96a3a10a2fd..c3ead698e81597f25e5d60dc242680a705a4aff1 100644 (file)
--- a/Objects/exceptions.c
+++ b/Objects/exceptions.c
@@ -1948,6 +1948,14 @@ SimpleExtendsException(PyExc_Warning, ImportWarning,
            "Base class for warnings about probable mistakes in module imports");
  
  
+/*
+ *    UnicodeWarning extends Warning
+ */
+SimpleExtendsException(PyExc_Warning, UnicodeWarning,
+    "Base class for warnings about Unicode related problems, mostly\n"
+    "related to conversion problems.");
+
+
  /* Pre-computed MemoryError instance.  Best to create this as early as
   * possible and not wait until a MemoryError is actually raised!
   */
@@ -2048,6 +2056,7 @@ _PyExc_Init(void)
      PRE_INIT(RuntimeWarning)
      PRE_INIT(FutureWarning)
      PRE_INIT(ImportWarning)
+    PRE_INIT(UnicodeWarning)
  
      m = Py_InitModule4("exceptions", functions, exceptions_doc,
          (PyObject *)NULL, PYTHON_API_VERSION);
@@ -2113,6 +2122,7 @@ _PyExc_Init(void)
      POST_INIT(RuntimeWarning)
      POST_INIT(FutureWarning)
      POST_INIT(ImportWarning)
+    POST_INIT(UnicodeWarning)
  
      PyExc_MemoryErrorInst = BaseException_new(&_PyExc_MemoryError, NULL, NULL);
      if (!PyExc_MemoryErrorInst)
diff --git a/Objects/object.c b/Objects/object.c

index 73c89417eb8c4f1dd92b895054a425048307ff59..b0672f30e65df0ac33a1187e9d0e6e3a899d0968 100644 (file)
--- a/Objects/object.c
+++ b/Objects/object.c
@@ -731,23 +731,6 @@ default_3way_compare(PyObject *v, PyObject *w)
                 return (vv < ww) ? -1 : (vv > ww) ? 1 : 0;
         }
  
-#ifdef Py_USING_UNICODE
-       /* Special case for Unicode */
-       if (PyUnicode_Check(v) || PyUnicode_Check(w)) {
-               c = PyUnicode_Compare(v, w);
-               if (!PyErr_Occurred())
-                       return c;
-               /* TypeErrors are ignored: if Unicode coercion fails due
-                  to one of the arguments not having the right type, we
-                  continue as defined by the coercion protocol (see
-                  above).  Luckily, decoding errors are reported as
-                  ValueErrors and are not masked by this technique. */
-               if (!PyErr_ExceptionMatches(PyExc_TypeError))
-                       return -2;
-               PyErr_Clear();
-       }
-#endif
-
         /* None is smaller than anything */
         if (v == Py_None)
                 return -1;
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index ababda1bb5b715609d5beaf6e14322bfc160579b..f4e37556e567a7fd785b9e2dc0042b904242506c 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -5405,6 +5405,82 @@ onError:
      return -1;
  }
  
+PyObject *PyUnicode_RichCompare(PyObject *left,
+                                PyObject *right,
+                                int op)
+{
+    int result;
+
+    result = PyUnicode_Compare(left, right);
+    if (result == -1 && PyErr_Occurred())
+        goto onError;
+
+    /* Convert the return value to a Boolean */
+    switch (op) {
+    case Py_EQ:
+        result = (result == 0);
+        break;
+    case Py_NE:
+        result = (result != 0);
+        break;
+    case Py_LE:
+        result = (result <= 0);
+        break;
+    case Py_GE:
+        result = (result >= 0);
+        break;
+    case Py_LT:
+        result = (result == -1);
+        break;
+    case Py_GT:
+        result = (result == 1);
+        break;
+    }
+    return PyBool_FromLong(result);
+
+ onError:
+
+    /* Standard case
+
+       Type errors mean that PyUnicode_FromObject() could not convert
+       one of the arguments (usually the right hand side) to Unicode,
+       ie. we can't handle the comparison request. However, it is
+       possible that the other object knows a comparison method, which
+       is why we return Py_NotImplemented to give the other object a
+       chance.
+
+    */
+    if (PyErr_ExceptionMatches(PyExc_TypeError)) {
+        PyErr_Clear();
+        Py_INCREF(Py_NotImplemented);
+        return Py_NotImplemented;
+    }
+    if (op != Py_EQ && op != Py_NE)
+        return NULL;
+
+    /* Equality comparison.
+
+       This is a special case: we silence any PyExc_UnicodeDecodeError
+       and instead turn it into a PyExc_UnicodeWarning.
+
+    */
+    if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
+        return NULL;
+    PyErr_Clear();
+    if (PyErr_Warn(PyExc_UnicodeWarning, 
+                   (op == Py_EQ) ? 
+                   "Unicode equal comparison "
+                   "failed to convert both arguments to Unicode - "
+                   "interpreting them as being unequal" :
+                   "Unicode unequal comparison "
+                   "failed to convert both arguments to Unicode - "
+                   "interpreting them as being unequal"
+                   ) < 0)
+        return NULL;
+    result = (op == Py_NE);
+    return PyBool_FromLong(result);
+}
+
  int PyUnicode_Contains(PyObject *container,
                        PyObject *element)
  {
@@ -6985,11 +7061,14 @@ static PySequenceMethods unicode_as_sequence = {
      PyUnicode_Contains,                /* sq_contains */
  };
  
+#define HASINDEX(o) PyType_HasFeature((o)->ob_type, Py_TPFLAGS_HAVE_INDEX)
+
  static PyObject*
  unicode_subscript(PyUnicodeObject* self, PyObject* item)
  {
-    if (PyIndex_Check(item)) {
-        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
+    PyNumberMethods *nb = item->ob_type->tp_as_number;
+    if (nb != NULL && HASINDEX(item) && nb->nb_index != NULL) {
+        Py_ssize_t i = nb->nb_index(item);
          if (i == -1 && PyErr_Occurred())
              return NULL;
          if (i < 0)
@@ -7859,7 +7938,7 @@ PyTypeObject PyUnicode_Type = {
      0,                                         /* tp_print */
      0,                                 /* tp_getattr */
      0,                                         /* tp_setattr */
-    (cmpfunc) unicode_compare,                 /* tp_compare */
+    0,                                         /* tp_compare */
      unicode_repr,                      /* tp_repr */
      &unicode_as_number,                /* tp_as_number */
      &unicode_as_sequence,              /* tp_as_sequence */
@@ -7875,7 +7954,7 @@ PyTypeObject PyUnicode_Type = {
      unicode_doc,                       /* tp_doc */
      0,                                 /* tp_traverse */
      0,                                 /* tp_clear */
-    0,                                 /* tp_richcompare */
+    PyUnicode_RichCompare,             /* tp_richcompare */
      0,                                 /* tp_weaklistoffset */
      0,                                 /* tp_iter */
      0,                                 /* tp_iternext */
author	Marc-André Lemburg <mal@egenix.com>
	Mon, 14 Aug 2006 10:55:19 +0000 (10:55 +0000)
committer	Marc-André Lemburg <mal@egenix.com>
	Mon, 14 Aug 2006 10:55:19 +0000 (10:55 +0000)
Doc/api/concrete.tex		patch \| blob \| history
Doc/api/exceptions.tex		patch \| blob \| history
Doc/lib/libexcs.tex		patch \| blob \| history
Doc/lib/libwarnings.tex		patch \| blob \| history
Include/pyerrors.h		patch \| blob \| history
Include/unicodeobject.h		patch \| blob \| history
Lib/test/exception_hierarchy.txt		patch \| blob \| history
Misc/NEWS		patch \| blob \| history
Objects/exceptions.c		patch \| blob \| history
Objects/object.c		patch \| blob \| history
Objects/unicodeobject.c		patch \| blob \| history