needforspeed: partition implementation, part two.

author Fredrik Lundh <fredrik@pythonware.com>

Fri, 26 May 2006 08:54:28 +0000 (08:54 +0000)

committer Fredrik Lundh <fredrik@pythonware.com>

Fri, 26 May 2006 08:54:28 +0000 (08:54 +0000)
author Fredrik Lundh <fredrik@pythonware.com>
Fri, 26 May 2006 08:54:28 +0000 (08:54 +0000)
committer Fredrik Lundh <fredrik@pythonware.com>
Fri, 26 May 2006 08:54:28 +0000 (08:54 +0000)
diff --git a/Doc/lib/libstdtypes.tex b/Doc/lib/libstdtypes.tex

index 6760e478f03ba70b188a535f32c9bd83fbd02f6c..80d27173a5b72c305024d2feea8e07339373a24e 100644 (file)
--- a/Doc/lib/libstdtypes.tex
+++ b/Doc/lib/libstdtypes.tex
@@ -727,6 +727,14 @@ a prefix; rather, all combinations of its values are stripped:
  \versionchanged[Support for the \var{chars} argument]{2.2.2}
  \end{methoddesc}
  
+\begin{methoddesc}[string]{partition}{sep}
+Split the string at the first occurrence of \var{sep}, and return a
+3-tuple containing the part before the separator, the separator itself,
+and the part after the separator.  If the separator is not found, return
+a 3-tuple containing the string itself, followed by two empty strings.
+\versionadded{2.5}
+\end{methoddesc}
+
  \begin{methoddesc}[string]{replace}{old, new\optional{, count}}
  Return a copy of the string with all occurrences of substring
  \var{old} replaced by \var{new}.  If the optional argument
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h

index 82a023275902afb56e71582b59789f66bc305708..664578223c9853fd582d565e3b05f061c2732c43 100644 (file)
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -184,6 +184,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
  # define PyUnicode_GetMax PyUnicodeUCS2_GetMax
  # define PyUnicode_GetSize PyUnicodeUCS2_GetSize
  # define PyUnicode_Join PyUnicodeUCS2_Join
+# define PyUnicode_Partition PyUnicodeUCS2_Partition
  # define PyUnicode_Replace PyUnicodeUCS2_Replace
  # define PyUnicode_Resize PyUnicodeUCS2_Resize
  # define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding
@@ -259,6 +260,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
  # define PyUnicode_GetMax PyUnicodeUCS4_GetMax
  # define PyUnicode_GetSize PyUnicodeUCS4_GetSize
  # define PyUnicode_Join PyUnicodeUCS4_Join
+# define PyUnicode_Partition PyUnicodeUCS4_Partition
  # define PyUnicode_Replace PyUnicodeUCS4_Replace
  # define PyUnicode_Resize PyUnicodeUCS4_Resize
  # define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding
@@ -1018,6 +1020,13 @@ PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
      int keepends               /* If true, line end markers are included */
      );         
  
+/* Partition a string using a given separator. */
+
+PyAPI_FUNC(PyObject*) PyUnicode_Partition(
+    PyObject *s,               /* String to partition */
+    PyObject *sep              /* String separator */
+    );         
+
  /* Split a string giving a list of Unicode strings.
  
     If sep is NULL, splitting will be done at all whitespace
diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py

index d4fdd8fc0cc31407424a75265190d4bcc3b0f357..260b2d83d9053ecbf227fc450e3fbf30ae2b092f 100644 (file)
--- a/Lib/test/string_tests.py
+++ b/Lib/test/string_tests.py
@@ -900,6 +900,21 @@ class MixinStrUnicodeUserStringTest:
          self.checkequal('A', 'a', 'title')
          self.checkequal(True, 'a', 'islower')
  
+    def test_partition(self):
+
+        self.checkequal(('this', ' is ', 'the partition method'),
+            'this is the partition method', 'partition', ' is ')
+
+        # from raymond's original specification
+        S = 'http://www.python.org'
+        self.checkequal(('http', '://', 'www.python.org'), S, 'partition', '://')
+        self.checkequal(('http://www.python.org', '', ''), S, 'partition', '?')
+        self.checkequal(('', 'http://', 'www.python.org'), S, 'partition', 'http://')
+        self.checkequal(('http://www.python.', 'org', ''), S, 'partition', 'org')
+
+        self.checkraises(ValueError, S, 'partition', '')
+        self.checkraises(TypeError, S, 'partition', None)
+
  
  class MixinStrStringUserStringTest:
      # Additional tests for 8bit strings, i.e. str, UserString and
diff --git a/Objects/stringobject.c b/Objects/stringobject.c

index 2dfac03a906bff4da5765417c3deee6068560fdb..0e0af89e5fad6647ec8f99553d1ce063f081a9f2 100644 (file)
--- a/Objects/stringobject.c
+++ b/Objects/stringobject.c
@@ -1610,20 +1610,20 @@ string_partition(PyStringObject *self, PyObject *args)
  {
         Py_ssize_t len = PyString_GET_SIZE(self), sep_len, pos;
         const char *str = PyString_AS_STRING(self), *sep;
-       PyObject *sepobj;
+       PyObject *sep_obj;
         PyObject * out;
  
-       if (!PyArg_ParseTuple(args, "O:partition", &sepobj))
+       if (!PyArg_ParseTuple(args, "O:partition", &sep_obj))
                 return NULL;
-       if (PyString_Check(sepobj)) {
-               sep = PyString_AS_STRING(sepobj);
-               sep_len = PyString_GET_SIZE(sepobj);
+       if (PyString_Check(sep_obj)) {
+               sep = PyString_AS_STRING(sep_obj);
+               sep_len = PyString_GET_SIZE(sep_obj);
         }
-#ifdef Py_USING_UNICODE_NOTYET
-       else if (PyUnicode_Check(sepobj))
-               return PyUnicode_Partition((PyObject *)self, sepobj);
+#ifdef Py_USING_UNICODE
+       else if (PyUnicode_Check(sep_obj))
+               return PyUnicode_Partition((PyObject *)self, sep_obj);
  #endif
-       else if (PyObject_AsCharBuffer(sepobj, &sep, &sep_len))
+       else if (PyObject_AsCharBuffer(sep_obj, &sep, &sep_len))
                 return NULL;
  
         if (sep_len == 0) {
@@ -1644,13 +1644,13 @@ string_partition(PyStringObject *self, PyObject *args)
                 Py_INCREF(nullstring);
                 PyTuple_SET_ITEM(out, 2, (PyObject*) nullstring);
         } else {
-               Py_INCREF(sepobj);
+               PyObject* obj;
                 PyTuple_SET_ITEM(out, 0, PyString_FromStringAndSize(str, pos));
-               PyTuple_SET_ITEM(out, 1, sepobj);
-               PyTuple_SET_ITEM(out, 2,
-                       PyString_FromStringAndSize(str + sep_len + pos,
-                                                  len - sep_len - pos)
-                       );
+               Py_INCREF(sep_obj);
+               PyTuple_SET_ITEM(out, 1, sep_obj);
+               pos += sep_len;
+               obj = PyString_FromStringAndSize(str + pos, len - pos);
+               PyTuple_SET_ITEM(out, 2, obj);
                 if (PyErr_Occurred()) {
                         Py_DECREF(out);
                         return NULL;
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index aff14f593aa12f033a2ab7c7c34c844393ede6f7..770224884cc6923934d7e851772a3985d6d13a9c 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4,6 +4,9 @@ Unicode implementation based on original code by Fredrik Lundh,
  modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
  Unicode Integration Proposal (see file Misc/unicode.txt).
  
+Major speed upgrades to the method implementations at the Reykjavik
+NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
+
  Copyright (c) Corporation for National Research Initiatives.
  
  --------------------------------------------------------------------
@@ -193,6 +196,7 @@ int unicode_resize(register PyUnicodeObject *unicode,
      /* Resizing shared object (unicode_empty or single character
         objects) in-place is not allowed. Use PyUnicode_Resize()
         instead ! */
+
      if (unicode == unicode_empty || 
         (unicode->length == 1 && 
          unicode->str[0] < 256U &&
@@ -202,8 +206,11 @@ int unicode_resize(register PyUnicodeObject *unicode,
          return -1;
      }
  
-    /* We allocate one more byte to make sure the string is
-       Ux0000 terminated -- XXX is this needed ? */
+    /* We allocate one extra Py_UNICODE to make sure the string is Ux0000
+       terminated.  The overallocation is also used by fastsearch, which
+       assumes that it's safe to look at str[length] (without making any
+       assumptions about what it contains). */
+
      oldstr = unicode->str;
      PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
      if (!unicode->str) {
@@ -3859,8 +3866,6 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
  
  /* --- Helpers ------------------------------------------------------------ */
  
-#define USE_FAST /* experimental fast search implementation */
-
  /* fast search/count implementation, based on a mix between boyer-
     moore and horspool, with a few more bells and whistles on the top.
     for some more background, see: http://effbot.org/stringlib */
@@ -3936,10 +3941,8 @@ fastsearch(Py_UNICODE* s, Py_ssize_t n, Py_UNICODE* p, Py_ssize_t m, int mode)
              /* miss: check if next character is part of pattern */
              if (!(mask & (1 << (s[i+m] & 0x1F))))
                  i = i + m;
-            else {
+            else
                  i = i + skip;
-                continue;
-            }
          } else {
              /* skip: check if next character is part of pattern */
              if (!(mask & (1 << (s[i+m] & 0x1F))))
@@ -3973,23 +3976,13 @@ LOCAL(Py_ssize_t) count(PyUnicodeObject *self,
      if (substring->length == 0)
         return (end - start + 1);
  
-#ifdef USE_FAST
      count = fastsearch(
          PyUnicode_AS_UNICODE(self) + start, end - start,
          substring->str, substring->length, FAST_COUNT
          );
+
      if (count < 0)
          count = 0; /* no match */
-#else    
-    end -= substring->length;
-
-    while (start <= end)
-        if (Py_UNICODE_MATCH(self, start, substring)) {
-            count++;
-            start += substring->length;
-        } else
-            start++;
-#endif
  
      return count;
  }
@@ -4040,30 +4033,19 @@ static Py_ssize_t findstring(PyUnicodeObject *self,
      if (substring->length == 0)
         return (direction > 0) ? start : end;
  
-#ifdef USE_FAST
      if (direction > 0) {
          Py_ssize_t pos = fastsearch(
              PyUnicode_AS_UNICODE(self) + start, end - start,
              substring->str, substring->length, FAST_SEARCH
              );
-        if (pos < 0)
-            return pos;
-        return pos + start;
-    }
-#endif
-
-    end -= substring->length;
-
-    if (direction < 0) {
+        if (pos >= 0)
+            return pos + start;
+    } else {
+        end -= substring->length;
          for (; end >= start; end--)
              if (Py_UNICODE_MATCH(self, end, substring))
                  return end;
-    } else {
-        for (; start <= end; start++)
-            if (Py_UNICODE_MATCH(self, start, substring))
-                return start;
      }
-
      return -1;
  }
  
@@ -5167,11 +5149,8 @@ int PyUnicode_Contains(PyObject *container,
                        PyObject *element)
  {
      PyUnicodeObject *u, *v;
-    int result;
      Py_ssize_t size;
-#ifdef USE_FAST
      Py_ssize_t pos;
-#endif
  
      /* Coerce the two arguments */
      v = (PyUnicodeObject *) PyUnicode_FromObject(element);
@@ -5189,44 +5168,19 @@ int PyUnicode_Contains(PyObject *container,
  
      size = PyUnicode_GET_SIZE(v);
      if (!size) {
-        result = 1;
+        pos = 0;
          goto done;
      }
  
-#ifdef USE_FAST
      pos = fastsearch(
          PyUnicode_AS_UNICODE(u), PyUnicode_GET_SIZE(u),
          PyUnicode_AS_UNICODE(v), size, FAST_SEARCH
          );
-    result = (pos != -1);
-#else    
-    result = 0;
-
-    if (size == 1) {
-        Py_UNICODE chr = PyUnicode_AS_UNICODE(v)[0];
-        Py_UNICODE* ptr = PyUnicode_AS_UNICODE(u);
-       Py_UNICODE* end = ptr + PyUnicode_GET_SIZE(u);
-       for (; ptr < end; ptr++) {
-           if (*ptr == chr) {
-               result = 1;
-               break;
-           }
-       }
-    } else {
-        Py_ssize_t start = 0;
-        Py_ssize_t end = PyUnicode_GET_SIZE(u) - size;
-        for (; start <= end; start++)
-            if (Py_UNICODE_MATCH(u, start, v)) {
-                result = 1;
-                break;
-            }
-    }
-#endif
  
  done:
      Py_DECREF(u);
      Py_DECREF(v);
-    return result;
+    return (pos != -1);
  }
  
  /* Concat to string or Unicode object giving a new Unicode object. */
@@ -6335,6 +6289,84 @@ unicode_split(PyUnicodeObject *self, PyObject *args)
         return PyUnicode_Split((PyObject *)self, substring, maxcount);
  }
  
+PyObject *
+PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
+{
+    PyObject* str_obj;
+    PyObject* sep_obj;
+    Py_UNICODE *str, *sep;
+    Py_ssize_t len, sep_len, pos;
+    PyObject* out;
+    
+    str_obj = PyUnicode_FromObject(str_in);
+    if (!str_obj)
+       return NULL;
+    sep_obj = PyUnicode_FromObject(sep_in);
+    if (!sep_obj)
+        goto error;
+
+    str = PyUnicode_AS_UNICODE(str_obj);
+    len = PyUnicode_GET_SIZE(str_obj);
+
+    sep = PyUnicode_AS_UNICODE(sep_obj);
+    sep_len = PyUnicode_GET_SIZE(sep_obj);
+
+    if (sep_len == 0) {
+        PyErr_SetString(PyExc_ValueError, "empty separator");
+        goto error;
+    }
+
+    out = PyTuple_New(3);
+    if (!out)
+        goto error;
+
+    pos = fastsearch(str, len, sep, sep_len, FAST_SEARCH);
+    if (pos < 0) {
+        Py_INCREF(str_obj);
+        PyTuple_SET_ITEM(out, 0, (PyObject*) str_obj);
+        Py_INCREF(unicode_empty);
+        PyTuple_SET_ITEM(out, 1, (PyObject*) unicode_empty);
+        Py_INCREF(unicode_empty);
+        PyTuple_SET_ITEM(out, 2, (PyObject*) unicode_empty);
+    } else {
+        PyObject* obj;
+        PyTuple_SET_ITEM(out, 0, PyUnicode_FromUnicode(str, pos));
+        Py_INCREF(sep_obj);
+        PyTuple_SET_ITEM(out, 1, sep_obj);
+        obj = PyUnicode_FromUnicode(str + sep_len + pos, len - sep_len - pos);
+        PyTuple_SET_ITEM(out, 2, obj);
+        if (PyErr_Occurred()) {
+            Py_DECREF(out);
+            goto error;
+        }
+    }
+
+    return out;
+
+error:
+    Py_XDECREF(sep_obj);
+    Py_DECREF(str_obj);
+    return NULL;
+}
+
+PyDoc_STRVAR(partition__doc__,
+"S.partition(sep) -> (head, sep, tail)\n\
+\n\
+Searches for the separator sep in S, and returns the part before it,\n\
+the separator itself, and the part after it.  If the separator is not\n\
+found, returns S and two empty strings.");
+
+static PyObject*
+unicode_partition(PyUnicodeObject *self, PyObject *args)
+{
+    PyObject *separator;
+
+    if (!PyArg_ParseTuple(args, "O:partition", &separator))
+        return NULL;
+
+    return PyUnicode_Partition((PyObject *)self, separator);
+}
+
  PyObject *PyUnicode_RSplit(PyObject *s,
                            PyObject *sep,
                            Py_ssize_t maxsplit)
@@ -6588,6 +6620,7 @@ static PyMethodDef unicode_methods[] = {
      {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
      {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
      {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
+    {"partition", (PyCFunction) unicode_partition, METH_VARARGS, partition__doc__},
      {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
      {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
      {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
author	Fredrik Lundh <fredrik@pythonware.com>
	Fri, 26 May 2006 08:54:28 +0000 (08:54 +0000)
committer	Fredrik Lundh <fredrik@pythonware.com>
	Fri, 26 May 2006 08:54:28 +0000 (08:54 +0000)
Doc/lib/libstdtypes.tex		patch \| blob \| history
Include/unicodeobject.h		patch \| blob \| history
Lib/test/string_tests.py		patch \| blob \| history
Objects/stringobject.c		patch \| blob \| history
Objects/unicodeobject.c		patch \| blob \| history