methods up to 20--50%. (Contributed by Serhiy Storchaka in :issue:`23867`,
:issue:`35582` and :issue:`36127`.)
+* The ``LOAD_GLOBAL`` instruction now uses a new "per opcode cache"
+  mechanism and is about 40% faster.  (Contributed by Yury Selivanov and
+  Inada Naoki in :issue:`26219`.)
+
Build and C API Changes
=======================
# define _Py_OPARG(word) ((word) >> 8)
#endif
+typedef struct _PyOpcache _PyOpcache;
+
/* Bytecode object */
typedef struct {
PyObject_HEAD
Type is a void* to keep the format private in codeobject.c to force
people to go through the proper APIs. */
void *co_extra;
+
+ /* Per-opcode just-in-time cache
+ *
+ * To reduce the cache size, we use an indirect mapping from an opcode
+ * index to its cache object:
+ *   cache = co_opcache[co_opcache_map[next_instr - first_instr] - 1]
+ */
+
+ // co_opcache_map is indexed by (next_instr - first_instr).
+ // * 0 means there is no cache for this opcode.
+ // * n > 0 means there is cache in co_opcache[n-1].
+ unsigned char *co_opcache_map;
+ _PyOpcache *co_opcache;
+ int co_opcache_flag; // used to determine when to create a cache.
+ unsigned char co_opcache_size; // length of co_opcache.
} PyCodeObject;
/* Masks for co_flags above */
PyAPI_FUNC(void) _PyEval_ReInitThreads(
_PyRuntimeState *runtime);
+/* Private function */
+void _PyEval_Fini(void);
+
#ifdef __cplusplus
}
#endif
--- /dev/null
+#ifndef Py_INTERNAL_CODE_H
+#define Py_INTERNAL_CODE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
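+/* Cache entry for LOAD_GLOBAL: the cached pointer is valid only while
+ both the globals and the builtins dict keep the version tags recorded
+ below; any mutation of either dict invalidates the entry. */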
+typedef struct {
+ PyObject *ptr; /* Cached pointer (borrowed reference) */
+ uint64_t globals_ver; /* ma_version of global dict */
+ uint64_t builtins_ver; /* ma_version of builtin dict */
+} _PyOpcache_LoadGlobal;
+
+struct _PyOpcache {
+ union {
+ _PyOpcache_LoadGlobal lg;
+ } u;
+ char optimized; /* 0: not yet optimized, > 0: optimized, < 0: deoptimized (skip) */
+};
+
+/* Private API */
+int _PyCode_InitOpcache(PyCodeObject *co);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_CODE_H */
# setting a key to the same value with dict.__setitem__
- # must change the version
- self.check_version_changed(d, d.__setitem__, 'key', value)
+ # must not change the version
+ self.check_version_dont_change(d, d.__setitem__, 'key', value)
# setting a key to the same value with dict.update
- # must change the version
- self.check_version_changed(d, d.update, key=value)
+ # must not change the version
+ self.check_version_dont_change(d, d.update, key=value)
d2 = self.new_dict(key=value)
- self.check_version_changed(d, d.update, d2)
+ self.check_version_dont_change(d, d.update, d2)
def test_setitem_equal(self):
class AlwaysEqual:
$(srcdir)/Include/internal/pycore_accu.h \
$(srcdir)/Include/internal/pycore_atomic.h \
$(srcdir)/Include/internal/pycore_ceval.h \
+ $(srcdir)/Include/internal/pycore_code.h \
$(srcdir)/Include/internal/pycore_condvar.h \
$(srcdir)/Include/internal/pycore_context.h \
$(srcdir)/Include/internal/pycore_fileutils.h \
--- /dev/null
+Implemented a per-opcode cache mechanism; the ``LOAD_GLOBAL`` instruction
+now uses it and is about 40% faster. Contributed by Yury Selivanov and
+Inada Naoki.
#include "Python.h"
#include "code.h"
+#include "opcode.h"
#include "structmember.h"
+#include "pycore_code.h"
#include "pycore_pystate.h"
#include "pycore_tupleobject.h"
#include "clinic/codeobject.c.h"
co->co_zombieframe = NULL;
co->co_weakreflist = NULL;
co->co_extra = NULL;
+
+ co->co_opcache_map = NULL;
+ co->co_opcache = NULL;
+ co->co_opcache_flag = 0;
+ co->co_opcache_size = 0;
return co;
}
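+/* Scan the bytecode and build the opcode-index -> cache mapping.
+ Only LOAD_GLOBAL gets a cache slot for now; at most 255 slots are
+ created because co_opcache_map stores slot+1 in an unsigned char
+ (0 means "no cache for this opcode"). */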
+int
+_PyCode_InitOpcache(PyCodeObject *co)
+{
+ Py_ssize_t co_size = PyBytes_Size(co->co_code) / sizeof(_Py_CODEUNIT);
+ co->co_opcache_map = (unsigned char *)PyMem_Calloc(co_size, 1);
+ if (co->co_opcache_map == NULL) {
+ return -1;
+ }
+
+ _Py_CODEUNIT *opcodes = (_Py_CODEUNIT*)PyBytes_AS_STRING(co->co_code);
+ Py_ssize_t opts = 0;
+
+ for (Py_ssize_t i = 0; i < co_size;) {
+ unsigned char opcode = _Py_OPCODE(opcodes[i]);
+ i++; // 'i' is now aligned to (next_instr - first_instr)
+
+ // TODO: LOAD_METHOD, LOAD_ATTR
+ if (opcode == LOAD_GLOBAL) {
+ co->co_opcache_map[i] = ++opts;
+ if (opts > 254) {
+ break;
+ }
+ }
+ }
+
+ if (opts) {
+ co->co_opcache = (_PyOpcache *)PyMem_Calloc(opts, sizeof(_PyOpcache));
+ if (co->co_opcache == NULL) {
+ PyMem_FREE(co->co_opcache_map);
+ return -1;
+ }
+ }
+ else {
+ PyMem_FREE(co->co_opcache_map);
+ co->co_opcache_map = NULL;
+ co->co_opcache = NULL;
+ }
+
+ co->co_opcache_size = opts;
+ return 0;
+}
+
PyCodeObject *
PyCode_NewEmpty(const char *filename, const char *funcname, int firstlineno)
{
static void
code_dealloc(PyCodeObject *co)
{
+ if (co->co_opcache != NULL) {
+ PyMem_FREE(co->co_opcache);
+ }
+ if (co->co_opcache_map != NULL) {
+ PyMem_FREE(co->co_opcache_map);
+ }
+ co->co_opcache_flag = 0;
+ co->co_opcache_size = 0;
+
if (co->co_extra != NULL) {
PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
_PyCodeObjectExtra *co_extra = co->co_extra;
res += sizeof(_PyCodeObjectExtra) +
(co_extra->ce_size-1) * sizeof(co_extra->ce_extras[0]);
}
+ if (co->co_opcache != NULL) {
+ assert(co->co_opcache_map != NULL);
+ // co_opcache_map
+ res += PyBytes_GET_SIZE(co->co_code) / sizeof(_Py_CODEUNIT);
+ // co_opcache
+ res += co->co_opcache_size * sizeof(_PyOpcache);
+ }
return PyLong_FromSsize_t(res);
}
return 0;
}
- if (_PyDict_HasSplitTable(mp)) {
- mp->ma_values[ix] = value;
- if (old_value == NULL) {
- /* pending state */
- assert(ix == mp->ma_used);
- mp->ma_used++;
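+ /* Skip the version-tag bump when the new value is the very same
+ object as the old one, so that caches keyed on ma_version_tag
+ (such as the LOAD_GLOBAL opcode cache) stay valid. */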
+ if (old_value != value) {
+ if (_PyDict_HasSplitTable(mp)) {
+ mp->ma_values[ix] = value;
+ if (old_value == NULL) {
+ /* pending state */
+ assert(ix == mp->ma_used);
+ mp->ma_used++;
+ }
}
+ else {
+ assert(old_value != NULL);
+ DK_ENTRIES(mp->ma_keys)[ix].me_value = value;
+ }
+ mp->ma_version_tag = DICT_NEXT_VERSION();
}
- else {
- assert(old_value != NULL);
- DK_ENTRIES(mp->ma_keys)[ix].me_value = value;
- }
-
- mp->ma_version_tag = DICT_NEXT_VERSION();
Py_XDECREF(old_value); /* which **CAN** re-enter (see issue #22653) */
ASSERT_CONSISTENT(mp);
Py_DECREF(key);
<ClInclude Include="..\Include\import.h" />
<ClInclude Include="..\Include\internal\pycore_accu.h" />
<ClInclude Include="..\Include\internal\pycore_atomic.h" />
<ClInclude Include="..\Include\internal\pycore_ceval.h" />
+ <ClInclude Include="..\Include\internal\pycore_code.h" />
<ClInclude Include="..\Include\internal\pycore_condvar.h" />
<ClInclude Include="..\Include\internal\pycore_context.h" />
<ClInclude Include="..\Include\internal\pycore_atomic.h">
<Filter>Include</Filter>
</ClInclude>
<ClInclude Include="..\Include\internal\pycore_ceval.h">
<Filter>Include</Filter>
</ClInclude>
+ <ClInclude Include="..\Include\internal\pycore_code.h">
+ <Filter>Include</Filter>
+ </ClInclude>
#include "Python.h"
#include "pycore_ceval.h"
+#include "pycore_code.h"
#include "pycore_object.h"
#include "pycore_pyerrors.h"
#include "pycore_pylifecycle.h"
#endif
#endif
+/* per opcode cache */
+#define OPCACHE_MIN_RUNS 1024  /* create the opcache after the code object has run this many times */
+#define OPCACHE_STATS 0 /* Enable stats */
+
+#if OPCACHE_STATS
+static size_t opcache_code_objects = 0;
+static size_t opcache_code_objects_extra_mem = 0;
+
+static size_t opcache_global_opts = 0;
+static size_t opcache_global_hits = 0;
+static size_t opcache_global_misses = 0;
+#endif
+
+
/* This can set eval_breaker to 0 even though gil_drop_request became
1. We believe this is all right because the eval loop will release
the GIL eventually anyway. */
}
}
+void
+_PyEval_Fini(void)
+{
+#if OPCACHE_STATS
+ size_t total = opcache_global_hits + opcache_global_misses;
+
+ fprintf(stderr, "-- Opcode cache number of objects  = %zu\n",
+ opcache_code_objects);
+
+ fprintf(stderr, "-- Opcode cache total extra mem    = %zu\n",
+ opcache_code_objects_extra_mem);
+
+ fprintf(stderr, "\n");
+
+ /* guard the percentage computation against division by zero */
+ fprintf(stderr, "-- Opcode cache LOAD_GLOBAL hits   = %zu (%d%%)\n",
+ opcache_global_hits,
+ total ? (int) (100.0 * opcache_global_hits / total) : 0);
+
+ fprintf(stderr, "-- Opcode cache LOAD_GLOBAL misses = %zu (%d%%)\n",
+ opcache_global_misses,
+ total ? (int) (100.0 * opcache_global_misses / total) : 0);
+
+ fprintf(stderr, "-- Opcode cache LOAD_GLOBAL opts   = %zu\n",
+ opcache_global_opts);
+
+ fprintf(stderr, "\n");
+#endif
+}
+
void
PyEval_AcquireLock(void)
{
const _Py_CODEUNIT *first_instr;
PyObject *names;
PyObject *consts;
+ _PyOpcache *co_opcache;
#ifdef LLTRACE
_Py_IDENTIFIER(__ltrace__);
Py_XDECREF(traceback); \
} while(0)
+ /* macros for opcode cache */
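+/* OPCACHE_CHECK() sets 'co_opcache' to the cache entry for the current
+ instruction, or to NULL when the opcode has no cache slot or the
+ entry has been deoptimized (optimized < 0). */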
+#define OPCACHE_CHECK() \
+ do { \
+ co_opcache = NULL; \
+ if (co->co_opcache != NULL) { \
+ unsigned char co_opt_offset = \
+ co->co_opcache_map[next_instr - first_instr]; \
+ if (co_opt_offset > 0) { \
+ assert(co_opt_offset <= co->co_opcache_size); \
+ co_opcache = &co->co_opcache[co_opt_offset - 1]; \
+ assert(co_opcache != NULL); \
+ if (co_opcache->optimized < 0) { \
+ co_opcache = NULL; \
+ } \
+ } \
+ } \
+ } while (0)
+
+#if OPCACHE_STATS
+
+#define OPCACHE_STAT_GLOBAL_HIT() \
+ do { \
+ if (co->co_opcache != NULL) opcache_global_hits++; \
+ } while (0)
+
+#define OPCACHE_STAT_GLOBAL_MISS() \
+ do { \
+ if (co->co_opcache != NULL) opcache_global_misses++; \
+ } while (0)
+
+#define OPCACHE_STAT_GLOBAL_OPT() \
+ do { \
+ if (co->co_opcache != NULL) opcache_global_opts++; \
+ } while (0)
+
+#else /* OPCACHE_STATS */
+
+#define OPCACHE_STAT_GLOBAL_HIT()
+#define OPCACHE_STAT_GLOBAL_MISS()
+#define OPCACHE_STAT_GLOBAL_OPT()
+
+#endif
+
/* Start of code */
/* push frame */
f->f_stacktop = NULL; /* remains NULL unless yield suspends frame */
f->f_executing = 1;
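+ /* Opcode-cache warm-up: count executions of this code object and
+ allocate the caches only once it has run OPCACHE_MIN_RUNS times,
+ so rarely-executed code pays no extra memory. */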
+ if (co->co_opcache_flag < OPCACHE_MIN_RUNS) {
+ co->co_opcache_flag++;
+ if (co->co_opcache_flag == OPCACHE_MIN_RUNS) {
+ if (_PyCode_InitOpcache(co) < 0) {
+ /* Unwind through the function's common exit path so the
+ frame pushed above is popped and the recursion counter
+ is restored; a bare 'return NULL' would leak both. */
+ goto exit_eval_frame;
+ }
+#if OPCACHE_STATS
+ opcache_code_objects_extra_mem +=
+ PyBytes_Size(co->co_code) / sizeof(_Py_CODEUNIT) +
+ sizeof(_PyOpcache) * co->co_opcache_size;
+ opcache_code_objects++;
+#endif
+ }
+ }
#ifdef LLTRACE
lltrace = _PyDict_GetItemId(f->f_globals, &PyId___ltrace__) != NULL;
}
case TARGET(LOAD_GLOBAL): {
- PyObject *name = GETITEM(names, oparg);
+ PyObject *name;
PyObject *v;
if (PyDict_CheckExact(f->f_globals)
&& PyDict_CheckExact(f->f_builtins))
{
+ OPCACHE_CHECK();
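+ /* Fast path: when neither the globals nor the builtins dict
+ has changed since the lookup was cached, push the cached
+ pointer without touching either dict. */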
+ if (co_opcache != NULL && co_opcache->optimized > 0) {
+ _PyOpcache_LoadGlobal *lg = &co_opcache->u.lg;
+
+ if (lg->globals_ver ==
+ ((PyDictObject *)f->f_globals)->ma_version_tag
+ && lg->builtins_ver ==
+ ((PyDictObject *)f->f_builtins)->ma_version_tag)
+ {
+ PyObject *ptr = lg->ptr;
+ OPCACHE_STAT_GLOBAL_HIT();
+ assert(ptr != NULL);
+ Py_INCREF(ptr);
+ PUSH(ptr);
+ DISPATCH();
+ }
+ }
+
+ name = GETITEM(names, oparg);
v = _PyDict_LoadGlobal((PyDictObject *)f->f_globals,
(PyDictObject *)f->f_builtins,
name);
}
goto error;
}
+
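+ /* Store (or refresh) the resolved pointer together with the
+ current version tags of both dicts. Keeping a borrowed
+ reference is safe: any mutation that could free the value
+ also bumps ma_version_tag, so the stale pointer is never
+ dereferenced. */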
+ if (co_opcache != NULL) {
+ _PyOpcache_LoadGlobal *lg = &co_opcache->u.lg;
+
+ if (co_opcache->optimized == 0) {
+ /* Wasn't optimized before. */
+ OPCACHE_STAT_GLOBAL_OPT();
+ } else {
+ OPCACHE_STAT_GLOBAL_MISS();
+ }
+
+ co_opcache->optimized = 1;
+ lg->globals_ver =
+ ((PyDictObject *)f->f_globals)->ma_version_tag;
+ lg->builtins_ver =
+ ((PyDictObject *)f->f_builtins)->ma_version_tag;
+ lg->ptr = v; /* borrowed */
+ }
+
Py_INCREF(v);
}
else {
/* Slow-path if globals or builtins is not a dict */
/* namespace 1: globals */
+ name = GETITEM(names, oparg);
v = PyObject_GetItem(f->f_globals, name);
if (v == NULL) {
if (!_PyErr_ExceptionMatches(tstate, PyExc_KeyError)) {
/* Destroy all modules */
PyImport_Cleanup();
+ /* Print debug stats if any */
+ _PyEval_Fini();
+
/* Flush sys.stdout and sys.stderr (again, in case more was printed) */
if (flush_std_files() < 0) {
status = -1;