]> granicus.if.org Git - python/commitdiff
Issue #4751: For hashlib algorithms provided by OpenSSL, the Python
author Gregory P. Smith <greg@mad-scientist.com>
Mon, 4 May 2009 00:16:49 +0000 (00:16 +0000)
committer Gregory P. Smith <greg@mad-scientist.com>
Mon, 4 May 2009 00:16:49 +0000 (00:16 +0000)
GIL is now released during computation on data lengths >= 2048 bytes.

Doc/library/hashlib.rst
Lib/test/test_hashlib.py
Misc/NEWS
Modules/_hashopenssl.c

index 73e6e4ea1005536071e3eb4ec8b43b09b058cf82..b7b13710ab742f2656f8768589342e99419057a7 100644 (file)
@@ -95,6 +95,12 @@ A hash object has the following methods:
    a single call with the concatenation of all the arguments: ``m.update(a);
    m.update(b)`` is equivalent to ``m.update(a+b)``.
 
+   .. versionchanged:: 2.7
+
+      The Python GIL is released to allow other threads to run while
+      hash updates on data of 2048 bytes or more are taking place when
+      using hash algorithms supplied by OpenSSL.
+
 
 .. method:: hash.digest()
 
index e7ce198486910b46d6d99e83b17c59e4aabf0e0f..4ba07b1cea40ce016cc4cf5fa5bcf0c010caf57d 100644 (file)
@@ -2,11 +2,16 @@
 #
 # $Id$
 #
-#  Copyright (C) 2005   Gregory P. Smith (greg@krypto.org)
+#  Copyright (C) 2005-2009   Gregory P. Smith (greg@krypto.org)
 #  Licensed to PSF under a Contributor Agreement.
 #
 
 import hashlib
+import StringIO
+try:
+    import threading
+except ImportError:
+    threading = None
 import unittest
 from test import test_support
 from test.test_support import _4G, precisionbigmemtest
@@ -61,10 +66,10 @@ class HashLibTestCase(unittest.TestCase):
     def check(self, name, data, digest):
         # test the direct constructors
         computed = getattr(hashlib, name)(data).hexdigest()
-        self.assert_(computed == digest)
+        self.assertEqual(computed, digest)
         # test the general new() interface
         computed = hashlib.new(name, data).hexdigest()
-        self.assert_(computed == digest)
+        self.assertEqual(computed, digest)
 
     def check_no_unicode(self, algorithm_name):
         # Unicode objects are not allowed as input.
@@ -211,6 +216,44 @@ class HashLibTestCase(unittest.TestCase):
           "e718483d0ce769644e2e42c7bc15b4638e1f98b13b2044285632a803afa973eb"+
           "de0ff244877ea60a4cb0432ce577c31beb009c5c2c49aa2e4eadb217ad8cc09b")
 
+    def test_threaded_hashing(self):
+        if not threading:
+            raise unittest.SkipTest('No threading module.')
+
+        # Updating the same hash object from several threads at once
+        # using data chunk sizes containing the same byte sequences.
+        #
+        # If the internal locks are working to prevent multiple
+        # updates on the same object from running at once, the resulting
+        # hash will be the same as doing it single threaded upfront.
+        hasher = hashlib.sha1()
+        num_threads = 5
+        smallest_data = 'swineflu'
+        data = smallest_data*200000
+        expected_hash = hashlib.sha1(data*num_threads).hexdigest()
+
+        def hash_in_chunks(chunk_size, event):
+            index = 0
+            while index < len(data):
+                hasher.update(data[index:index+chunk_size])
+                index += chunk_size
+            event.set()
+
+        events = []
+        for threadnum in xrange(num_threads):
+            chunk_size = len(data) // (10**threadnum)
+            assert chunk_size > 0
+            assert chunk_size % len(smallest_data) == 0
+            event = threading.Event()
+            events.append(event)
+            threading.Thread(target=hash_in_chunks,
+                             args=(chunk_size, event)).start()
+
+        for event in events:
+            event.wait()
+
+        self.assertEqual(expected_hash, hasher.hexdigest())
+
 
 def test_main():
     test_support.run_unittest(HashLibTestCase)
index 64e473ac58989425ac5e0ae7794ca14af3dfaf01..e372a623609a8fd6818c49e4aae9c28d598a4171 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -905,6 +905,9 @@ C-API
 Extension Modules
 -----------------
 
+- Issue #4751: For hashlib algorithms provided by OpenSSL, the Python
+  GIL is now released during computation on data lengths >= 2048 bytes.
+
 - Issue #3745: Fix hashlib to always reject unicode and non buffer-api
   supporting objects as input no matter how it was compiled (built in
   implementations or external openssl library).
index 7b5a2e52bc899bb035804024b8eef4e9dfd045b3..8dbaa208e494c9685510995b300ee0997afa4d56 100644 (file)
@@ -1,7 +1,7 @@
 /* Module that wraps all OpenSSL hash algorithms */
 
 /*
- * Copyright (C) 2005-2007   Gregory P. Smith (greg@krypto.org)
+ * Copyright (C) 2005-2009   Gregory P. Smith (greg@krypto.org)
  * Licensed to PSF under a Contributor Agreement.
  *
  * Derived from a skeleton of shamodule.c containing work performed by:
 #include "structmember.h"
 #include "hashlib.h"
 
+#ifdef WITH_THREAD
+#include "pythread.h"
+    #define ENTER_HASHLIB(obj) \
+        if ((obj)->lock) \
+        { \
+            if (!PyThread_acquire_lock((obj)->lock, 0)) \
+            { \
+                Py_BEGIN_ALLOW_THREADS \
+                PyThread_acquire_lock((obj)->lock, 1); \
+                Py_END_ALLOW_THREADS \
+            } \
+        }
+    #define LEAVE_HASHLIB(obj) \
+        if ((obj)->lock) \
+        { \
+            PyThread_release_lock((obj)->lock); \
+        }
+#else
+    #define ENTER_HASHLIB(obj)
+    #define LEAVE_HASHLIB(obj)
+#endif
+
 /* EVP is the preferred interface to hashing in OpenSSL */
 #include <openssl/evp.h>
 
 #define MUNCH_SIZE INT_MAX
 
+/* TODO(gps): We should probably make this a module or EVPobject attribute
+ * to allow the user to optimize based on the platform they're using. */
+#define HASHLIB_GIL_MINSIZE 2048
 
 #ifndef HASH_OBJ_CONSTRUCTOR
 #define HASH_OBJ_CONSTRUCTOR 0
 #endif
 
+
 typedef struct {
     PyObject_HEAD
     PyObject            *name;  /* name of this hash algorithm */
     EVP_MD_CTX          ctx;    /* OpenSSL message digest context */
-    /*
-     * TODO investigate performance impact of including a lock for this object
-     * here and releasing the Python GIL while hash updates are in progress.
-     * (perhaps only release GIL if input length will take long to process?)
-     */
+#ifdef WITH_THREAD
+    PyThread_type_lock  lock;   /* OpenSSL context lock */
+#endif
 } EVPobject;
 
 
@@ -64,26 +88,57 @@ newEVPobject(PyObject *name)
     if (retval != NULL) {
         Py_INCREF(name);
         retval->name = name;
+#ifdef WITH_THREAD
+        retval->lock = NULL;
+#endif
     }
 
     return retval;
 }
 
+static void
+EVP_hash(EVPobject *self, const void *vp, Py_ssize_t len)
+{
+    unsigned int process;
+    const unsigned char *cp = (const unsigned char *)vp;
+    while (0 < len)
+    {
+        if (len > (Py_ssize_t)MUNCH_SIZE)
+            process = MUNCH_SIZE;
+        else
+            process = Py_SAFE_DOWNCAST(len, Py_ssize_t, unsigned int);
+        EVP_DigestUpdate(&self->ctx, (const void*)cp, process);
+        len -= process;
+        cp += process;
+    }
+}
+
 /* Internal methods for a hash object */
 
 static void
-EVP_dealloc(PyObject *ptr)
+EVP_dealloc(EVPobject *self)
 {
-    EVP_MD_CTX_cleanup(&((EVPobject *)ptr)->ctx);
-    Py_XDECREF(((EVPobject *)ptr)->name);
-    PyObject_Del(ptr);
+#ifdef WITH_THREAD
+    if (self->lock != NULL)
+        PyThread_free_lock(self->lock);
+#endif
+    EVP_MD_CTX_cleanup(&self->ctx);
+    Py_XDECREF(self->name);
+    PyObject_Del(self);
 }
 
+static void locked_EVP_MD_CTX_copy(EVP_MD_CTX *new_ctx_p, EVPobject *self)
+{
+    ENTER_HASHLIB(self);
+    EVP_MD_CTX_copy(new_ctx_p, &self->ctx);
+    LEAVE_HASHLIB(self);
+}
 
 /* External methods for a hash object */
 
 PyDoc_STRVAR(EVP_copy__doc__, "Return a copy of the hash object.");
 
+
 static PyObject *
 EVP_copy(EVPobject *self, PyObject *unused)
 {
@@ -92,7 +147,7 @@ EVP_copy(EVPobject *self, PyObject *unused)
     if ( (newobj = newEVPobject(self->name))==NULL)
         return NULL;
 
-    EVP_MD_CTX_copy(&newobj->ctx, &self->ctx);
+    locked_EVP_MD_CTX_copy(&newobj->ctx, self);
     return (PyObject *)newobj;
 }
 
@@ -107,7 +162,7 @@ EVP_digest(EVPobject *self, PyObject *unused)
     PyObject *retval;
     unsigned int digest_size;
 
-    EVP_MD_CTX_copy(&temp_ctx, &self->ctx);
+    locked_EVP_MD_CTX_copy(&temp_ctx, self);
     digest_size = EVP_MD_CTX_size(&temp_ctx);
     EVP_DigestFinal(&temp_ctx, digest, NULL);
 
@@ -129,7 +184,7 @@ EVP_hexdigest(EVPobject *self, PyObject *unused)
     unsigned int i, j, digest_size;
 
     /* Get the raw (binary) digest value */
-    EVP_MD_CTX_copy(&temp_ctx, &self->ctx);
+    locked_EVP_MD_CTX_copy(&temp_ctx, self);
     digest_size = EVP_MD_CTX_size(&temp_ctx);
     EVP_DigestFinal(&temp_ctx, digest, NULL);
 
@@ -174,19 +229,26 @@ EVP_update(EVPobject *self, PyObject *args)
 
     GET_BUFFER_VIEW_OR_ERROUT(obj, &view, NULL);
 
-    if (view.len > 0 && view.len <= MUNCH_SIZE) {
-        EVP_DigestUpdate(&self->ctx, (unsigned char*)view.buf,
-                         Py_SAFE_DOWNCAST(view.len, Py_ssize_t, unsigned int));
+#ifdef WITH_THREAD
+    if (self->lock == NULL && view.len >= HASHLIB_GIL_MINSIZE)
+    {
+        self->lock = PyThread_allocate_lock();
+        /* fail? lock = NULL and we fail over to non-threaded code. */
+    }
+
+    if (self->lock != NULL)
+    {
+        Py_BEGIN_ALLOW_THREADS
+        PyThread_acquire_lock(self->lock, 1);
+        EVP_hash(self, view.buf, view.len);
+        PyThread_release_lock(self->lock);
+        Py_END_ALLOW_THREADS
     } else {
-        Py_ssize_t len = view.len;
-        unsigned char *cp = (unsigned char *)view.buf;
-        while (len > 0) {
-            unsigned int process = len > MUNCH_SIZE ? MUNCH_SIZE : len;
-            EVP_DigestUpdate(&self->ctx, cp, process);
-            len -= process;
-            cp += process;
-        }
+        EVP_hash(self, view.buf, view.len);
     }
+#else
+    EVP_hash(self, view.buf, view.len);
+#endif
 
     PyBuffer_Release(&view);
 
@@ -205,13 +267,17 @@ static PyMethodDef EVP_methods[] = {
 static PyObject *
 EVP_get_block_size(EVPobject *self, void *closure)
 {
-    return PyInt_FromLong(EVP_MD_CTX_block_size(&((EVPobject *)self)->ctx));
+    long block_size;
+    block_size = EVP_MD_CTX_block_size(&self->ctx);
+    return PyLong_FromLong(block_size);
 }
 
 static PyObject *
 EVP_get_digest_size(EVPobject *self, void *closure)
 {
-    return PyInt_FromLong(EVP_MD_CTX_size(&((EVPobject *)self)->ctx));
+    long size;
+    size = EVP_MD_CTX_size(&self->ctx);
+    return PyLong_FromLong(size);
 }
 
 static PyMemberDef EVP_members[] = {
@@ -286,19 +352,14 @@ EVP_tp_init(EVPobject *self, PyObject *args, PyObject *kwds)
     Py_INCREF(self->name);
 
     if (data_obj) {
-        if (view.len > 0 && view.len <= MUNCH_SIZE) {
-            EVP_DigestUpdate(&self->ctx, (unsigned char*)view.buf,
-                    Py_SAFE_DOWNCAST(view.len, Py_ssize_t, unsigned int));
+        if (view.len >= HASHLIB_GIL_MINSIZE)
+        {
+            Py_BEGIN_ALLOW_THREADS
+            EVP_hash(self, view.buf, view.len);
+            Py_END_ALLOW_THREADS
         } else {
-            Py_ssize_t len = view.len;
-            unsigned char *cp = (unsigned char*)view.buf;
-            while (len > 0) {
-                unsigned int process = len > MUNCH_SIZE ? MUNCH_SIZE : len;
-                EVP_DigestUpdate(&self->ctx, cp, process);
-                len -= process;
-                cp += process;
-            }
-        }
+            EVP_hash(self, view.buf, view.len);
+        }        
         PyBuffer_Release(&view);
     }
 
@@ -329,7 +390,7 @@ static PyTypeObject EVPtype = {
     sizeof(EVPobject), /*tp_basicsize*/
     0,                 /*tp_itemsize*/
     /* methods */
-    EVP_dealloc,       /*tp_dealloc*/
+    (destructor)EVP_dealloc,   /*tp_dealloc*/
     0,                 /*tp_print*/
     0,                  /*tp_getattr*/
     0,                  /*tp_setattr*/
@@ -389,17 +450,13 @@ EVPnew(PyObject *name_obj,
     }
 
     if (cp && len) {
-        if (len > 0 && len <= MUNCH_SIZE) {
-            EVP_DigestUpdate(&self->ctx, cp, Py_SAFE_DOWNCAST(len, Py_ssize_t,
-                                                              unsigned int));
+        if (len >= HASHLIB_GIL_MINSIZE)
+        {
+            Py_BEGIN_ALLOW_THREADS
+            EVP_hash(self, cp, len);
+            Py_END_ALLOW_THREADS
         } else {
-            Py_ssize_t offset = 0;
-            while (len > 0) {
-                unsigned int process = len > MUNCH_SIZE ? MUNCH_SIZE : len;
-                EVP_DigestUpdate(&self->ctx, cp + offset, process);
-                len -= process;
-                offset += process;
-            }
+            EVP_hash(self, cp, len);
         }
     }