methods up to 20--50%. (Contributed by Serhiy Storchaka in :issue:`23867`,
:issue:`35582` and :issue:`36127`.)
+* The ``LOAD_GLOBAL`` instruction now uses a new "per opcode cache"
+  mechanism and is about 40% faster.  (Contributed by Yury Selivanov and
+  Inada Naoki in :issue:`26219`.)
+
Build and C API Changes
=======================
# define _Py_OPARG(word) ((word) >> 8)
#endif
+typedef struct _PyOpcache _PyOpcache;
+
/* Bytecode object */
typedef struct {
PyObject_HEAD
Type is a void* to keep the format private in codeobject.c to force
people to go through the proper APIs. */
void *co_extra;
+
+ /* Per-opcode just-in-time cache
+ *
+ * To reduce the cache size, we use an indirect mapping from an opcode
+ * index to its cache object:
+ *   cache = co_opcache[co_opcache_map[next_instr - first_instr] - 1]
+ */
+
+ // co_opcache_map is indexed by (next_instr - first_instr).
+ // * 0 means there is no cache for this opcode.
+ // * n > 0 means there is cache in co_opcache[n-1].
+ unsigned char *co_opcache_map;
+ _PyOpcache *co_opcache;
+ int co_opcache_flag; // used to determine when to create a cache.
+ unsigned char co_opcache_size; // length of co_opcache.
} PyCodeObject;
/* Masks for co_flags above */
PyAPI_FUNC(void) _PyEval_ReInitThreads(
_PyRuntimeState *runtime);
+/* Private function */
+void _PyEval_Fini(void);
+
#ifdef __cplusplus
}
#endif
--- /dev/null
+#ifndef Py_INTERNAL_CODE_H
+#define Py_INTERNAL_CODE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
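+/* Cache entry for LOAD_GLOBAL: the cached pointer is valid only while
+ both the globals and the builtins dict keep the version tags recorded
+ below; any mutation of either dict invalidates the entry. */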
+typedef struct {
+ PyObject *ptr; /* Cached pointer (borrowed reference) */
+ uint64_t globals_ver; /* ma_version of global dict */
+ uint64_t builtins_ver; /* ma_version of builtin dict */
+} _PyOpcache_LoadGlobal;
+
+struct _PyOpcache {
+ union {
+ _PyOpcache_LoadGlobal lg;
+ } u;
+ char optimized; /* 0: not yet optimized, > 0: optimized, < 0: deoptimized (skip) */
+};
+
+/* Private API */
+int _PyCode_InitOpcache(PyCodeObject *co);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_CODE_H */
# setting a key to the same value with dict.__setitem__
- # must change the version
- self.check_version_changed(d, d.__setitem__, 'key', value)
+ # must not change the version
+ self.check_version_dont_change(d, d.__setitem__, 'key', value)
# setting a key to the same value with dict.update
- # must change the version
- self.check_version_changed(d, d.update, key=value)
+ # must not change the version
+ self.check_version_dont_change(d, d.update, key=value)
d2 = self.new_dict(key=value)
- self.check_version_changed(d, d.update, d2)
+ self.check_version_dont_change(d, d.update, d2)
def test_setitem_equal(self):
class AlwaysEqual:
$(srcdir)/Include/internal/pycore_accu.h \
$(srcdir)/Include/internal/pycore_atomic.h \
$(srcdir)/Include/internal/pycore_ceval.h \
+ $(srcdir)/Include/internal/pycore_code.h \
$(srcdir)/Include/internal/pycore_condvar.h \
$(srcdir)/Include/internal/pycore_context.h \
$(srcdir)/Include/internal/pycore_fileutils.h \
--- /dev/null
+Implemented a per-opcode cache mechanism; the ``LOAD_GLOBAL`` instruction
+now uses it and is about 40% faster. Contributed by Yury Selivanov and
+Inada Naoki.
#include "Python.h"
#include "code.h"
+#include "opcode.h"
#include "structmember.h"
+#include "pycore_code.h"
#include "pycore_pystate.h"
#include "pycore_tupleobject.h"
#include "clinic/codeobject.c.h"
co->co_zombieframe = NULL;
co->co_weakreflist = NULL;
co->co_extra = NULL;
+
+ co->co_opcache_map = NULL;
+ co->co_opcache = NULL;
+ co->co_opcache_flag = 0;
+ co->co_opcache_size = 0;
return co;
}
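+/* Scan the bytecode and build the opcode-index -> cache mapping.
+ Only LOAD_GLOBAL gets a cache slot for now; at most 255 slots are
+ created because co_opcache_map stores slot+1 in an unsigned char
+ (0 means "no cache for this opcode"). */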
+int
+_PyCode_InitOpcache(PyCodeObject *co)
+{
+ Py_ssize_t co_size = PyBytes_Size(co->co_code) / sizeof(_Py_CODEUNIT);
+ co->co_opcache_map = (unsigned char *)PyMem_Calloc(co_size, 1);
+ if (co->co_opcache_map == NULL) {
+ return -1;
+ }
+
+ _Py_CODEUNIT *opcodes = (_Py_CODEUNIT*)PyBytes_AS_STRING(co->co_code);
+ Py_ssize_t opts = 0;
+
+ for (Py_ssize_t i = 0; i < co_size;) {
+ unsigned char opcode = _Py_OPCODE(opcodes[i]);
+ i++; // 'i' is now aligned to (next_instr - first_instr)
+
+ // TODO: LOAD_METHOD, LOAD_ATTR
+ if (opcode == LOAD_GLOBAL) {
+ co->co_opcache_map[i] = ++opts;
+ if (opts > 254) {
+ break;
+ }
+ }
+ }
+
+ if (opts) {
+ co->co_opcache = (_PyOpcache *)PyMem_Calloc(opts, sizeof(_PyOpcache));
+ if (co->co_opcache == NULL) {
+ PyMem_FREE(co->co_opcache_map);
+ return -1;
+ }
+ }
+ else {
+ PyMem_FREE(co->co_opcache_map);
+ co->co_opcache_map = NULL;
+ co->co_opcache = NULL;
+ }
+
+ co->co_opcache_size = opts;
+ return 0;
+}
+
PyCodeObject *
PyCode_NewEmpty(const char *filename, const char *funcname, int firstlineno)
{
static void
code_dealloc(PyCodeObject *co)
{
+ if (co->co_opcache != NULL) {
+ PyMem_FREE(co->co_opcache);
+ }
+ if (co->co_opcache_map != NULL) {
+ PyMem_FREE(co->co_opcache_map);
+ }
+ co->co_opcache_flag = 0;
+ co->co_opcache_size = 0;
+
if (co->co_extra != NULL) {
PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
_PyCodeObjectExtra *co_extra = co->co_extra;
res += sizeof(_PyCodeObjectExtra) +
(co_extra->ce_size-1) * sizeof(co_extra->ce_extras[0]);
}
+ if (co->co_opcache != NULL) {
+ assert(co->co_opcache_map != NULL);
+ // co_opcache_map
+ res += PyBytes_GET_SIZE(co->co_code) / sizeof(_Py_CODEUNIT);
+ // co_opcache
+ res += co->co_opcache_size * sizeof(_PyOpcache);
+ }
return PyLong_FromSsize_t(res);
}
return 0;
}
- if (_PyDict_HasSplitTable(mp)) {
- mp->ma_values[ix] = value;
- if (old_value == NULL) {
- /* pending state */
- assert(ix == mp->ma_used);
- mp->ma_used++;
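+ /* Skip the version-tag bump when the new value is the very same
+ object as the old one, so that caches keyed on ma_version_tag
+ (such as the LOAD_GLOBAL opcode cache) stay valid. */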
+ if (old_value != value) {
+ if (_PyDict_HasSplitTable(mp)) {
+ mp->ma_values[ix] = value;
+ if (old_value == NULL) {
+ /* pending state */
+ assert(ix == mp->ma_used);
+ mp->ma_used++;
+ }
}
+ else {
+ assert(old_value != NULL);
+ DK_ENTRIES(mp->ma_keys)[ix].me_value = value;
+ }
+ mp->ma_version_tag = DICT_NEXT_VERSION();
}
- else {
- assert(old_value != NULL);
- DK_ENTRIES(mp->ma_keys)[ix].me_value = value;
- }
-
- mp->ma_version_tag = DICT_NEXT_VERSION();
Py_XDECREF(old_value); /* which **CAN** re-enter (see issue #22653) */
ASSERT_CONSISTENT(mp);
Py_DECREF(key);
<ClInclude Include="..\Include\import.h" />
<ClInclude Include="..\Include\internal\pycore_accu.h" />
<ClInclude Include="..\Include\internal\pycore_atomic.h" />
<ClInclude Include="..\Include\internal\pycore_ceval.h" />
+ <ClInclude Include="..\Include\internal\pycore_code.h" />
<ClInclude Include="..\Include\internal\pycore_condvar.h" />
<ClInclude Include="..\Include\internal\pycore_context.h" />
<ClInclude Include="..\Include\internal\pycore_atomic.h">
<Filter>Include</Filter>
</ClInclude>
<ClInclude Include="..\Include\internal\pycore_ceval.h">
<Filter>Include</Filter>
</ClInclude>
+ <ClInclude Include="..\Include\internal\pycore_code.h">
+ <Filter>Include</Filter>
+ </ClInclude>
#include "Python.h"
#include "pycore_ceval.h"
+#include "pycore_code.h"
#include "pycore_object.h"
#include "pycore_pyerrors.h"
#include "pycore_pylifecycle.h"
#endif
#endif
+/* per opcode cache */
+#define OPCACHE_MIN_RUNS 1024  /* create the opcache after the code object has run this many times */
+#define OPCACHE_STATS 0 /* Enable stats */
+
+#if OPCACHE_STATS
+static size_t opcache_code_objects = 0;
+static size_t opcache_code_objects_extra_mem = 0;
+
+static size_t opcache_global_opts = 0;
+static size_t opcache_global_hits = 0;
+static size_t opcache_global_misses = 0;
+#endif
+
+
/* This can set eval_breaker to 0 even though gil_drop_request became
1. We believe this is all right because the eval loop will release
the GIL eventually anyway. */
}
}
+void
+_PyEval_Fini(void)
+{
+#if OPCACHE_STATS
+ size_t total = opcache_global_hits + opcache_global_misses;
+
+ fprintf(stderr, "-- Opcode cache number of objects  = %zu\n",
+ opcache_code_objects);
+
+ fprintf(stderr, "-- Opcode cache total extra mem    = %zu\n",
+ opcache_code_objects_extra_mem);
+
+ fprintf(stderr, "\n");
+
+ /* guard the percentage computation against division by zero */
+ fprintf(stderr, "-- Opcode cache LOAD_GLOBAL hits   = %zu (%d%%)\n",
+ opcache_global_hits,
+ total ? (int) (100.0 * opcache_global_hits / total) : 0);
+
+ fprintf(stderr, "-- Opcode cache LOAD_GLOBAL misses = %zu (%d%%)\n",
+ opcache_global_misses,
+ total ? (int) (100.0 * opcache_global_misses / total) : 0);
+
+ fprintf(stderr, "-- Opcode cache LOAD_GLOBAL opts   = %zu\n",
+ opcache_global_opts);
+
+ fprintf(stderr, "\n");
+#endif
+}
+
void
PyEval_AcquireLock(void)
{
const _Py_CODEUNIT *first_instr;
PyObject *names;
PyObject *consts;
+ _PyOpcache *co_opcache;
#ifdef LLTRACE
_Py_IDENTIFIER(__ltrace__);
Py_XDECREF(traceback); \
} while(0)
+ /* macros for opcode cache */
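+/* OPCACHE_CHECK() sets 'co_opcache' to the cache entry for the current
+ instruction, or to NULL when the opcode has no cache slot or the
+ entry has been deoptimized (optimized < 0). */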
+#define OPCACHE_CHECK() \
+ do { \
+ co_opcache = NULL; \
+ if (co->co_opcache != NULL) { \
+ unsigned char co_opt_offset = \
+ co->co_opcache_map[next_instr - first_instr]; \
+ if (co_opt_offset > 0) { \
+ assert(co_opt_offset <= co->co_opcache_size); \
+ co_opcache = &co->co_opcache[co_opt_offset - 1]; \
+ assert(co_opcache != NULL); \
+ if (co_opcache->optimized < 0) { \
+ co_opcache = NULL; \
+ } \
+ } \
+ } \
+ } while (0)
+
+#if OPCACHE_STATS
+
+#define OPCACHE_STAT_GLOBAL_HIT() \
+ do { \
+ if (co->co_opcache != NULL) opcache_global_hits++; \
+ } while (0)
+
+#define OPCACHE_STAT_GLOBAL_MISS() \
+ do { \
+ if (co->co_opcache != NULL) opcache_global_misses++; \
+ } while (0)
+
+#define OPCACHE_STAT_GLOBAL_OPT() \
+ do { \
+ if (co->co_opcache != NULL) opcache_global_opts++; \
+ } while (0)
+
+#else /* OPCACHE_STATS */
+
+#define OPCACHE_STAT_GLOBAL_HIT()
+#define OPCACHE_STAT_GLOBAL_MISS()
+#define OPCACHE_STAT_GLOBAL_OPT()
+
+#endif
+
/* Start of code */
/* push frame */
f->f_stacktop = NULL; /* remains NULL unless yield suspends frame */
f->f_executing = 1;
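+ /* Opcode-cache warm-up: count executions of this code object and
+ allocate the caches only once it has run OPCACHE_MIN_RUNS times,
+ so rarely-executed code pays no extra memory. */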
+ if (co->co_opcache_flag < OPCACHE_MIN_RUNS) {
+ co->co_opcache_flag++;
+ if (co->co_opcache_flag == OPCACHE_MIN_RUNS) {
+ if (_PyCode_InitOpcache(co) < 0) {
+ /* Unwind through the function's common exit path so the
+ frame pushed above is popped and the recursion counter
+ is restored; a bare 'return NULL' would leak both. */
+ goto exit_eval_frame;
+ }
+#if OPCACHE_STATS
+ opcache_code_objects_extra_mem +=
+ PyBytes_Size(co->co_code) / sizeof(_Py_CODEUNIT) +
+ sizeof(_PyOpcache) * co->co_opcache_size;
+ opcache_code_objects++;
+#endif
+ }
+ }
#ifdef LLTRACE
lltrace = _PyDict_GetItemId(f->f_globals, &PyId___ltrace__) != NULL;
}
case TARGET(LOAD_GLOBAL): {
- PyObject *name = GETITEM(names, oparg);
+ PyObject *name;
PyObject *v;
if (PyDict_CheckExact(f->f_globals)
&& PyDict_CheckExact(f->f_builtins))
{
+ OPCACHE_CHECK();
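+ /* Fast path: when neither the globals nor the builtins dict
+ has changed since the lookup was cached, push the cached
+ pointer without touching either dict. */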
+ if (co_opcache != NULL && co_opcache->optimized > 0) {
+ _PyOpcache_LoadGlobal *lg = &co_opcache->u.lg;
+
+ if (lg->globals_ver ==
+ ((PyDictObject *)f->f_globals)->ma_version_tag
+ && lg->builtins_ver ==
+ ((PyDictObject *)f->f_builtins)->ma_version_tag)
+ {
+ PyObject *ptr = lg->ptr;
+ OPCACHE_STAT_GLOBAL_HIT();
+ assert(ptr != NULL);
+ Py_INCREF(ptr);
+ PUSH(ptr);
+ DISPATCH();
+ }
+ }
+
+ name = GETITEM(names, oparg);
v = _PyDict_LoadGlobal((PyDictObject *)f->f_globals,
(PyDictObject *)f->f_builtins,
name);
}
goto error;
}
+
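+ /* Store (or refresh) the resolved pointer together with the
+ current version tags of both dicts. Keeping a borrowed
+ reference is safe: any mutation that could free the value
+ also bumps ma_version_tag, so the stale pointer is never
+ dereferenced. */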
+ if (co_opcache != NULL) {
+ _PyOpcache_LoadGlobal *lg = &co_opcache->u.lg;
+
+ if (co_opcache->optimized == 0) {
+ /* Wasn't optimized before. */
+ OPCACHE_STAT_GLOBAL_OPT();
+ } else {
+ OPCACHE_STAT_GLOBAL_MISS();
+ }
+
+ co_opcache->optimized = 1;
+ lg->globals_ver =
+ ((PyDictObject *)f->f_globals)->ma_version_tag;
+ lg->builtins_ver =
+ ((PyDictObject *)f->f_builtins)->ma_version_tag;
+ lg->ptr = v; /* borrowed */
+ }
+
Py_INCREF(v);
}
else {
/* Slow-path if globals or builtins is not a dict */
/* namespace 1: globals */
+ name = GETITEM(names, oparg);
v = PyObject_GetItem(f->f_globals, name);
if (v == NULL) {
if (!_PyErr_ExceptionMatches(tstate, PyExc_KeyError)) {
/* Destroy all modules */
PyImport_Cleanup();
+ /* Print debug stats if any */
+ _PyEval_Fini();
+
/* Flush sys.stdout and sys.stderr (again, in case more was printed) */
if (flush_std_files() < 0) {
status = -1;