Optimize bytes.fromhex() and bytearray.fromhex()

author Victor Stinner <victor.stinner@gmail.com>

Wed, 14 Oct 2015 09:25:33 +0000 (11:25 +0200)

committer Victor Stinner <victor.stinner@gmail.com>

Wed, 14 Oct 2015 09:25:33 +0000 (11:25 +0200)
author Victor Stinner <victor.stinner@gmail.com>
Wed, 14 Oct 2015 09:25:33 +0000 (11:25 +0200)
committer Victor Stinner <victor.stinner@gmail.com>
Wed, 14 Oct 2015 09:25:33 +0000 (11:25 +0200)
diff --git a/Doc/whatsnew/3.6.rst b/Doc/whatsnew/3.6.rst

index d1e5045dd9052b45301b31812718cdc95213f424..edacea12e6a124de99662ff9110ac9b6e77a6ead 100644 (file)
--- a/Doc/whatsnew/3.6.rst
+++ b/Doc/whatsnew/3.6.rst
@@ -161,6 +161,9 @@ Optimizations
  * ``bytearray % args`` is now between 2.5 and 5 times faster. (Contributed by
    Victor Stinner in :issue:`25399`).
  
+* Optimize :meth:`bytes.fromhex` and :meth:`bytearray.fromhex`: they are now
+  between 2x and 3.5x faster. (Contributed by Victor Stinner in :issue:`25401`).
+
  
  Build and C API Changes
  =======================
diff --git a/Include/bytesobject.h b/Include/bytesobject.h

index b5b37efd251a888ae6bb5530082b2eb46e8eb0a5..4046c1cf85b282ea9a48e34d9899830f5dbf58f5 100644 (file)
--- a/Include/bytesobject.h
+++ b/Include/bytesobject.h
@@ -67,6 +67,9 @@ PyAPI_FUNC(PyObject*) _PyBytes_FormatEx(
      Py_ssize_t format_len,
      PyObject *args,
      int use_bytearray);
+PyAPI_FUNC(PyObject*) _PyBytes_FromHex(
+    PyObject *string,
+    int use_bytearray);
  #endif
  PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t,
                                                    const char *, Py_ssize_t,
diff --git a/Include/longobject.h b/Include/longobject.h

index ab92495a23f6838c1379e9b9cdc906b47b9e83f3..9574f05a2d3b7180b43b48b4d2e32ee29baae467 100644 (file)
--- a/Include/longobject.h
+++ b/Include/longobject.h
@@ -65,7 +65,7 @@ PyAPI_FUNC(PyObject *) PyLong_GetInfo(void);
  #  error "void* different in size from int, long and long long"
  #endif /* SIZEOF_VOID_P */
  
-/* Used by Python/mystrtoul.c. */
+/* Used by Python/mystrtoul.c and _PyBytes_FromHex(). */
  #ifndef Py_LIMITED_API
  PyAPI_DATA(unsigned char) _PyLong_DigitValue[256];
  #endif
diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py

index 947d959e84e61eda0959f8d5da8b5e914c79db20..0fe33b5cb3568189346b0bd200064a01ed614bc3 100644 (file)
--- a/Lib/test/test_bytes.py
+++ b/Lib/test/test_bytes.py
@@ -301,6 +301,20 @@ class BaseBytesTest:
          self.assertRaises(ValueError, self.type2test.fromhex, '\x00')
          self.assertRaises(ValueError, self.type2test.fromhex, '12   \x00   34')
  
+        for data, pos in (
+            # invalid first hexadecimal character
+            ('12 x4 56', 3),
+            # invalid second hexadecimal character
+            ('12 3x 56', 4),
+            # two invalid hexadecimal characters
+            ('12 xy 56', 3),
+            # test non-ASCII string
+            ('12 3\xff 56', 4),
+        ):
+            with self.assertRaises(ValueError) as cm:
+                self.type2test.fromhex(data)
+            self.assertIn('at position %s' % pos, str(cm.exception))
+
      def test_hex(self):
          self.assertRaises(TypeError, self.type2test.hex)
          self.assertRaises(TypeError, self.type2test.hex, 1)
diff --git a/Misc/NEWS b/Misc/NEWS

index 97e5e7740b869cadaf3dfca28016de4d8ac924fe..312cde02c5f4f84abe728dd0bd4016c8b6781aa7 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,9 @@ Release date: XXXX-XX-XX
  Core and Builtins
  -----------------
  
+- Issue #25401: Optimize bytes.fromhex() and bytearray.fromhex(): they are now
+  between 2x and 3.5x faster.
+
  - Issue #25399: Optimize bytearray % args using the new private _PyBytesWriter
    API. Formatting is now between 2.5 and 5 times faster.
  
diff --git a/Objects/bytearrayobject.c b/Objects/bytearrayobject.c

index e535bce8d73136119bf1dd0c8c5f392174ef3269..b270fcccc679e4e4252cc64c0eefa503f8e1d522 100644 (file)
--- a/Objects/bytearrayobject.c
+++ b/Objects/bytearrayobject.c
@@ -2823,48 +2823,7 @@ static PyObject *
  bytearray_fromhex_impl(PyObject*cls, PyObject *string)
  /*[clinic end generated code: output=df3da60129b3700c input=907bbd2d34d9367a]*/
  {
-    PyObject *newbytes;
-    char *buf;
-    Py_ssize_t hexlen, byteslen, i, j;
-    int top, bot;
-    void *data;
-    unsigned int kind;
-
-    assert(PyUnicode_Check(string));
-    if (PyUnicode_READY(string))
-        return NULL;
-    kind = PyUnicode_KIND(string);
-    data = PyUnicode_DATA(string);
-    hexlen = PyUnicode_GET_LENGTH(string);
-
-    byteslen = hexlen/2; /* This overestimates if there are spaces */
-    newbytes = PyByteArray_FromStringAndSize(NULL, byteslen);
-    if (!newbytes)
-        return NULL;
-    buf = PyByteArray_AS_STRING(newbytes);
-    for (i = j = 0; i < hexlen; i += 2) {
-        /* skip over spaces in the input */
-        while (PyUnicode_READ(kind, data, i) == ' ')
-            i++;
-        if (i >= hexlen)
-            break;
-        top = hex_digit_to_int(PyUnicode_READ(kind, data, i));
-        bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1));
-        if (top == -1 || bot == -1) {
-            PyErr_Format(PyExc_ValueError,
-                         "non-hexadecimal number found in "
-                         "fromhex() arg at position %zd", i);
-            goto error;
-        }
-        buf[j++] = (top << 4) + bot;
-    }
-    if (PyByteArray_Resize(newbytes, j) < 0)
-        goto error;
-    return newbytes;
-
-  error:
-    Py_DECREF(newbytes);
-    return NULL;
+    return _PyBytes_FromHex(string, 1);
  }
  
  PyDoc_STRVAR(hex__doc__,
diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c

index 20b11fb375f075d8fc085d3107a7aceed5537c6b..2d4cf4b9ec09bd950666ae1c45c84a14a6cb5333 100644 (file)
--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c
@@ -30,6 +30,10 @@ static PyBytesObject *nullstring;
  */
  #define PyBytesObject_SIZE (offsetof(PyBytesObject, ob_sval) + 1)
  
+/* Forward declaration */
+Py_LOCAL_INLINE(Py_ssize_t) _PyBytesWriter_GetSize(_PyBytesWriter *writer,
+                                                   char *str);
+
  /*
     For PyBytes_FromString(), the parameter `str' points to a null-terminated
     string containing exactly `size' bytes.
@@ -3078,22 +3082,6 @@ bytes_splitlines_impl(PyBytesObject*self, int keepends)
          );
  }
  
-static int
-hex_digit_to_int(Py_UCS4 c)
-{
-    if (c >= 128)
-        return -1;
-    if (Py_ISDIGIT(c))
-        return c - '0';
-    else {
-        if (Py_ISUPPER(c))
-            c = Py_TOLOWER(c);
-        if (c >= 'a' && c <= 'f')
-            return c - 'a' + 10;
-    }
-    return -1;
-}
-
  /*[clinic input]
  @classmethod
  bytes.fromhex
@@ -3111,47 +3099,83 @@ static PyObject *
  bytes_fromhex_impl(PyTypeObject *type, PyObject *string)
  /*[clinic end generated code: output=0973acc63661bb2e input=bf4d1c361670acd3]*/
  {
-    PyObject *newstring;
+    return _PyBytes_FromHex(string, 0);
+}
+
+PyObject*
+_PyBytes_FromHex(PyObject *string, int use_bytearray)
+{
      char *buf;
-    Py_ssize_t hexlen, byteslen, i, j;
-    int top, bot;
-    void *data;
-    unsigned int kind;
+    Py_ssize_t hexlen, invalid_char;
+    unsigned int top, bot;
+    Py_UCS1 *str, *end;
+    _PyBytesWriter writer;
+
+    _PyBytesWriter_Init(&writer);
+    writer.use_bytearray = use_bytearray;
  
      assert(PyUnicode_Check(string));
      if (PyUnicode_READY(string))
          return NULL;
-    kind = PyUnicode_KIND(string);
-    data = PyUnicode_DATA(string);
      hexlen = PyUnicode_GET_LENGTH(string);
  
-    byteslen = hexlen/2; /* This overestimates if there are spaces */
-    newstring = PyBytes_FromStringAndSize(NULL, byteslen);
-    if (!newstring)
+    if (!PyUnicode_IS_ASCII(string)) {
+        void *data = PyUnicode_DATA(string);
+        unsigned int kind = PyUnicode_KIND(string);
+        Py_ssize_t i;
+
+        /* search for the first non-ASCII character */
+        for (i = 0; i < hexlen; i++) {
+            if (PyUnicode_READ(kind, data, i) >= 128)
+                break;
+        }
+        invalid_char = i;
+        goto error;
+    }
+
+    assert(PyUnicode_KIND(string) == PyUnicode_1BYTE_KIND);
+    str = PyUnicode_1BYTE_DATA(string);
+
+    /* This overestimates if there are spaces */
+    buf = _PyBytesWriter_Alloc(&writer, hexlen / 2);
+    if (buf == NULL)
          return NULL;
-    buf = PyBytes_AS_STRING(newstring);
-    for (i = j = 0; i < hexlen; i += 2) {
+
+    end = str + hexlen;
+    while (str < end) {
          /* skip over spaces in the input */
-        while (PyUnicode_READ(kind, data, i) == ' ')
-            i++;
-        if (i >= hexlen)
-            break;
-        top = hex_digit_to_int(PyUnicode_READ(kind, data, i));
-        bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1));
-        if (top == -1 || bot == -1) {
-            PyErr_Format(PyExc_ValueError,
-                         "non-hexadecimal number found in "
-                         "fromhex() arg at position %zd", i);
+        if (*str == ' ') {
+            do {
+                str++;
+            } while (*str == ' ');
+            if (str >= end)
+                break;
+        }
+
+        top = _PyLong_DigitValue[*str];
+        if (top >= 16) {
+            invalid_char = str - PyUnicode_1BYTE_DATA(string);
              goto error;
          }
-        buf[j++] = (top << 4) + bot;
+        str++;
+
+        bot = _PyLong_DigitValue[*str];
+        if (bot >= 16) {
+            invalid_char = str - PyUnicode_1BYTE_DATA(string);
+            goto error;
+        }
+        str++;
+
+        *buf++ = (unsigned char)((top << 4) + bot);
      }
-    if (j != byteslen && _PyBytes_Resize(&newstring, j) < 0)
-        goto error;
-    return newstring;
+
+    return _PyBytesWriter_Finish(&writer, buf);
  
    error:
-    Py_XDECREF(newstring);
+    PyErr_Format(PyExc_ValueError,
+                 "non-hexadecimal number found in "
+                 "fromhex() arg at position %zd", invalid_char);
+    _PyBytesWriter_Dealloc(&writer);
      return NULL;
  }
  
@@ -3888,7 +3912,7 @@ _PyBytesWriter_AsString(_PyBytesWriter *writer)
  }
  
  Py_LOCAL_INLINE(Py_ssize_t)
-_PyBytesWriter_GetPos(_PyBytesWriter *writer, char *str)
+_PyBytesWriter_GetSize(_PyBytesWriter *writer, char *str)
  {
      char *start = _PyBytesWriter_AsString(writer);
      assert(str != NULL);
@@ -3963,7 +3987,7 @@ _PyBytesWriter_Prepare(_PyBytesWriter *writer, void *str, Py_ssize_t size)
          allocated += allocated / OVERALLOCATE_FACTOR;
      }
  
-    pos = _PyBytesWriter_GetPos(writer, str);
+    pos = _PyBytesWriter_GetSize(writer, str);
      if (!writer->use_small_buffer) {
          if (writer->use_bytearray) {
              if (PyByteArray_Resize(writer->buffer, allocated))
@@ -4041,33 +4065,33 @@ _PyBytesWriter_Alloc(_PyBytesWriter *writer, Py_ssize_t size)
  PyObject *
  _PyBytesWriter_Finish(_PyBytesWriter *writer, void *str)
  {
-    Py_ssize_t pos;
+    Py_ssize_t size;
      PyObject *result;
  
      _PyBytesWriter_CheckConsistency(writer, str);
  
-    pos = _PyBytesWriter_GetPos(writer, str);
-    if (pos == 0 && !writer->use_bytearray) {
+    size = _PyBytesWriter_GetSize(writer, str);
+    if (size == 0 && !writer->use_bytearray) {
          Py_CLEAR(writer->buffer);
          /* Get the empty byte string singleton */
          result = PyBytes_FromStringAndSize(NULL, 0);
      }
      else if (writer->use_small_buffer) {
-        result = PyBytes_FromStringAndSize(writer->small_buffer, pos);
+        result = PyBytes_FromStringAndSize(writer->small_buffer, size);
      }
      else {
          result = writer->buffer;
          writer->buffer = NULL;
  
-        if (pos != writer->allocated) {
+        if (size != writer->allocated) {
              if (writer->use_bytearray) {
-                if (PyByteArray_Resize(result, pos)) {
+                if (PyByteArray_Resize(result, size)) {
                      Py_DECREF(result);
                      return NULL;
                  }
              }
              else {
-                if (_PyBytes_Resize(&result, pos)) {
+                if (_PyBytes_Resize(&result, size)) {
                      assert(result == NULL);
                      return NULL;
                  }
author	Victor Stinner <victor.stinner@gmail.com>
	Wed, 14 Oct 2015 09:25:33 +0000 (11:25 +0200)
committer	Victor Stinner <victor.stinner@gmail.com>
	Wed, 14 Oct 2015 09:25:33 +0000 (11:25 +0200)
Doc/whatsnew/3.6.rst		patch \| blob \| history
Include/bytesobject.h		patch \| blob \| history
Include/longobject.h		patch \| blob \| history
Lib/test/test_bytes.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history
Objects/bytearrayobject.c		patch \| blob \| history
Objects/bytesobject.c		patch \| blob \| history