granicus.if.org Git - python/commitdiff
Issue #4688: Add a heuristic so that tuples and dicts containing only
authorAntoine Pitrou <solipsis@pitrou.net>
Mon, 23 Mar 2009 18:41:45 +0000 (18:41 +0000)
committerAntoine Pitrou <solipsis@pitrou.net>
Mon, 23 Mar 2009 18:41:45 +0000 (18:41 +0000)
untrackable objects are not tracked by the garbage collector. This can
reduce the size of collections and therefore the garbage collection overhead
on long-running programs, depending on their particular use of datatypes.

(trivia: this makes the "binary_trees" benchmark from the Computer Language
Shootout 40% faster)

Doc/library/gc.rst
Include/dictobject.h
Include/objimpl.h
Include/tupleobject.h
Lib/test/test_dict.py
Lib/test/test_gc.py
Lib/test/test_tuple.py
Misc/NEWS
Modules/gcmodule.c
Objects/dictobject.c
Objects/tupleobject.c

index 9ebbf06c7b5b93c3c93e640c08251c748a2bfa1e..65f0f39a1567bbac5d665371ddc5062c72b9a828 100644 (file)
@@ -140,6 +140,31 @@ The :mod:`gc` module provides the following functions:
 
    .. versionadded:: 2.3
 
+.. function:: is_tracked(obj)
+
+   Returns True if the object is currently tracked by the garbage collector,
+   False otherwise.  As a general rule, instances of atomic types aren't
+   tracked and instances of non-atomic types (containers, user-defined
+   objects...) are.  However, some type-specific optimizations can be present
+   in order to suppress the garbage collector footprint of simple instances
+   (e.g. dicts containing only atomic keys and values)::
+
+      >>> gc.is_tracked(0)
+      False
+      >>> gc.is_tracked("a")
+      False
+      >>> gc.is_tracked([])
+      True
+      >>> gc.is_tracked({})
+      False
+      >>> gc.is_tracked({"a": 1})
+      False
+      >>> gc.is_tracked({"a": []})
+      True
+
+   .. versionadded:: 2.7
+
+
 The following variable is provided for read-only access (you can mutate its
 value but should not rebind it):
 
index b83cd0e8933f40dfaa31989b18eb6cd4eca50726..06e0a7ebe74679bdcc421fa523bde40b84c28753 100644 (file)
@@ -111,6 +111,7 @@ PyAPI_FUNC(PyObject *) PyDict_Copy(PyObject *mp);
 PyAPI_FUNC(int) PyDict_Contains(PyObject *mp, PyObject *key);
 PyAPI_FUNC(int) _PyDict_Contains(PyObject *mp, PyObject *key, long hash);
 PyAPI_FUNC(PyObject *) _PyDict_NewPresized(Py_ssize_t minused);
+PyAPI_FUNC(void) _PyDict_MaybeUntrack(PyObject *mp);
 
 /* PyDict_Update(mp, other) is equivalent to PyDict_Merge(mp, other, 1). */
 PyAPI_FUNC(int) PyDict_Update(PyObject *mp, PyObject *other);
index ef47218fced0c685b4574563757cd58a2c3d0848..55186b176be682814f626e47ab7ac3ee0e8fe32d 100644 (file)
@@ -285,6 +285,17 @@ extern PyGC_Head *_PyGC_generation0;
        g->gc.gc_next = NULL; \
     } while (0);
 
+/* True if the object is currently tracked by the GC. */
+#define _PyObject_GC_IS_TRACKED(o) \
+       ((_Py_AS_GC(o))->gc.gc_refs != _PyGC_REFS_UNTRACKED)
+/* True if the object may be tracked by the GC in the future, or already is.
+   This can be useful to implement some optimizations. */
+#define _PyObject_GC_MAY_BE_TRACKED(obj) \
+       (PyObject_IS_GC(obj) && \
+               (!PyTuple_CheckExact(obj) || _PyObject_GC_IS_TRACKED(obj)))
+
+
 PyAPI_FUNC(PyObject *) _PyObject_GC_Malloc(size_t);
 PyAPI_FUNC(PyObject *) _PyObject_GC_New(PyTypeObject *);
 PyAPI_FUNC(PyVarObject *) _PyObject_GC_NewVar(PyTypeObject *, Py_ssize_t);
index 58479ee0ffeae3d73c6b76b3925b36137c1c95c3..a5ab733208b9299dfc4ac6fdc0b9a55ca267fd1c 100644 (file)
@@ -44,6 +44,7 @@ PyAPI_FUNC(int) PyTuple_SetItem(PyObject *, Py_ssize_t, PyObject *);
 PyAPI_FUNC(PyObject *) PyTuple_GetSlice(PyObject *, Py_ssize_t, Py_ssize_t);
 PyAPI_FUNC(int) _PyTuple_Resize(PyObject **, Py_ssize_t);
 PyAPI_FUNC(PyObject *) PyTuple_Pack(Py_ssize_t, ...);
+PyAPI_FUNC(void) _PyTuple_MaybeUntrack(PyObject *);
 
 /* Macro, trading safety for speed */
 #define PyTuple_GET_ITEM(op, i) (((PyTupleObject *)(op))->ob_item[i])
index 0907744ab5f024b0c16acbd3c9f8e5da3b2f9469..b73a53f580a96a19185ac4a2084d7ee2b00d0131 100644 (file)
@@ -569,6 +569,104 @@ class DictTest(unittest.TestCase):
             gc.collect()
             self.assert_(ref() is None, "Cycle was not collected")
 
+    def _not_tracked(self, t):
+        # Nested containers can take several collections to untrack
+        gc.collect()
+        gc.collect()
+        self.assertFalse(gc.is_tracked(t), t)
+
+    def _tracked(self, t):
+        self.assertTrue(gc.is_tracked(t), t)
+        gc.collect()
+        gc.collect()
+        self.assertTrue(gc.is_tracked(t), t)
+
+    def test_track_literals(self):
+        # Test GC-optimization of dict literals
+        x, y, z, w = 1.5, "a", (1, None), []
+
+        self._not_tracked({})
+        self._not_tracked({x:(), y:x, z:1})
+        self._not_tracked({1: "a", "b": 2})
+        self._not_tracked({1: 2, (None, True, False, ()): int})
+        self._not_tracked({1: object()})
+
+        # Dicts with mutable elements are always tracked, even if those
+        # elements are not tracked right now.
+        self._tracked({1: []})
+        self._tracked({1: ([],)})
+        self._tracked({1: {}})
+        self._tracked({1: set()})
+
+    def test_track_dynamic(self):
+        # Test GC-optimization of dynamically-created dicts
+        class MyObject(object):
+            pass
+        x, y, z, w, o = 1.5, "a", (1, object()), [], MyObject()
+
+        d = dict()
+        self._not_tracked(d)
+        d[1] = "a"
+        self._not_tracked(d)
+        d[y] = 2
+        self._not_tracked(d)
+        d[z] = 3
+        self._not_tracked(d)
+        self._not_tracked(d.copy())
+        d[4] = w
+        self._tracked(d)
+        self._tracked(d.copy())
+        d[4] = None
+        self._not_tracked(d)
+        self._not_tracked(d.copy())
+
+        # dd isn't tracked right now, but it may mutate and therefore d
+        # which contains it must be tracked.
+        d = dict()
+        dd = dict()
+        d[1] = dd
+        self._not_tracked(dd)
+        self._tracked(d)
+        dd[1] = d
+        self._tracked(dd)
+
+        d = dict.fromkeys([x, y, z])
+        self._not_tracked(d)
+        dd = dict()
+        dd.update(d)
+        self._not_tracked(dd)
+        d = dict.fromkeys([x, y, z, o])
+        self._tracked(d)
+        dd = dict()
+        dd.update(d)
+        self._tracked(dd)
+
+        d = dict(x=x, y=y, z=z)
+        self._not_tracked(d)
+        d = dict(x=x, y=y, z=z, w=w)
+        self._tracked(d)
+        d = dict()
+        d.update(x=x, y=y, z=z)
+        self._not_tracked(d)
+        d.update(w=w)
+        self._tracked(d)
+
+        d = dict([(x, y), (z, 1)])
+        self._not_tracked(d)
+        d = dict([(x, y), (z, w)])
+        self._tracked(d)
+        d = dict()
+        d.update([(x, y), (z, 1)])
+        self._not_tracked(d)
+        d.update([(x, y), (z, w)])
+        self._tracked(d)
+
+    def test_track_subtypes(self):
+        # Dict subtypes are always tracked
+        class MyDict(dict):
+            pass
+        self._tracked(MyDict())
+
 
 from test import mapping_tests
 
index 6e2ea4198f0ffa6056d3710ef70975c4840f50ac..e9af550bf8f4f5d878c86ae41eb228d90831bd04 100644 (file)
@@ -415,6 +415,37 @@ class GCTests(unittest.TestCase):
 
         self.assertEqual(gc.get_referents(1, 'a', 4j), [])
 
+    def test_is_tracked(self):
+        # Atomic built-in types are not tracked, user-defined objects and
+        # mutable containers are.
+        # NOTE: types with special optimizations (e.g. tuple) have tests
+        # in their own test files instead.
+        self.assertFalse(gc.is_tracked(None))
+        self.assertFalse(gc.is_tracked(1))
+        self.assertFalse(gc.is_tracked(1.0))
+        self.assertFalse(gc.is_tracked(1.0 + 5.0j))
+        self.assertFalse(gc.is_tracked(True))
+        self.assertFalse(gc.is_tracked(False))
+        self.assertFalse(gc.is_tracked("a"))
+        self.assertFalse(gc.is_tracked(u"a"))
+        self.assertFalse(gc.is_tracked(bytearray("a")))
+        self.assertFalse(gc.is_tracked(type))
+        self.assertFalse(gc.is_tracked(int))
+        self.assertFalse(gc.is_tracked(object))
+        self.assertFalse(gc.is_tracked(object()))
+
+        class OldStyle:
+            pass
+        class NewStyle(object):
+            pass
+        self.assertTrue(gc.is_tracked(gc))
+        self.assertTrue(gc.is_tracked(OldStyle))
+        self.assertTrue(gc.is_tracked(OldStyle()))
+        self.assertTrue(gc.is_tracked(NewStyle))
+        self.assertTrue(gc.is_tracked(NewStyle()))
+        self.assertTrue(gc.is_tracked([]))
+        self.assertTrue(gc.is_tracked(set()))
+
     def test_bug1055820b(self):
         # Corresponds to temp2b.py in the bug report.
 
index 15bc2956cf0c9da6c7db419fa0c9ab753b87b301..89cbe1d4c5581286e3524bf1b7aa2af4ecd88605 100644 (file)
@@ -1,5 +1,7 @@
 from test import test_support, seq_tests
 
+import gc
+
 class TupleTest(seq_tests.CommonTest):
     type2test = tuple
 
@@ -82,6 +84,69 @@ class TupleTest(seq_tests.CommonTest):
         self.assertEqual(repr(a0), "()")
         self.assertEqual(repr(a2), "(0, 1, 2)")
 
+    def _not_tracked(self, t):
+        # Nested tuples can take several collections to untrack
+        gc.collect()
+        gc.collect()
+        self.assertFalse(gc.is_tracked(t), t)
+
+    def _tracked(self, t):
+        self.assertTrue(gc.is_tracked(t), t)
+        gc.collect()
+        gc.collect()
+        self.assertTrue(gc.is_tracked(t), t)
+
+    def test_track_literals(self):
+        # Test GC-optimization of tuple literals
+        x, y, z = 1.5, "a", []
+
+        self._not_tracked(())
+        self._not_tracked((1,))
+        self._not_tracked((1, 2))
+        self._not_tracked((1, 2, "a"))
+        self._not_tracked((1, 2, (None, True, False, ()), int))
+        self._not_tracked((object(),))
+        self._not_tracked(((1, x), y, (2, 3)))
+
+        # Tuples with mutable elements are always tracked, even if those
+        # elements are not tracked right now.
+        self._tracked(([],))
+        self._tracked(([1],))
+        self._tracked(({},))
+        self._tracked((set(),))
+        self._tracked((x, y, z))
+
+    def check_track_dynamic(self, tp, always_track):
+        x, y, z = 1.5, "a", []
+
+        check = self._tracked if always_track else self._not_tracked
+        check(tp())
+        check(tp([]))
+        check(tp(set()))
+        check(tp([1, x, y]))
+        check(tp(obj for obj in [1, x, y]))
+        check(tp(set([1, x, y])))
+        check(tp(tuple([obj]) for obj in [1, x, y]))
+        check(tuple(tp([obj]) for obj in [1, x, y]))
+
+        self._tracked(tp([z]))
+        self._tracked(tp([[x, y]]))
+        self._tracked(tp([{x: y}]))
+        self._tracked(tp(obj for obj in [x, y, z]))
+        self._tracked(tp(tuple([obj]) for obj in [x, y, z]))
+        self._tracked(tuple(tp([obj]) for obj in [x, y, z]))
+
+    def test_track_dynamic(self):
+        # Test GC-optimization of dynamically constructed tuples.
+        self.check_track_dynamic(tuple, False)
+
+    def test_track_subtypes(self):
+        # Tuple subtypes must always be tracked
+        class MyTuple(tuple):
+            pass
+        self.check_track_dynamic(MyTuple, True)
+
+
 def test_main():
     test_support.run_unittest(TupleTest)
 
index e96ec1726ad89968c841a9160c99953b983f78e5..d73453698d2b56370612f01b9d6d223b5f5ea713 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,11 @@ What's New in Python 2.7 alpha 1
 Core and Builtins
 -----------------
 
+- Issue #4688: Add a heuristic so that tuples and dicts containing only
+  untrackable objects are not tracked by the garbage collector. This can
+  reduce the size of collections and therefore the garbage collection overhead
+  on long-running programs, depending on their particular use of datatypes.
+
 - Issue #5512: Rewrite PyLong long division algorithm (x_divrem) to
   improve its performance.  Long divisions and remainder operations
   are now between 50% and 150% faster.
index 4d71591466bb8aaf49589028268c9a4a68106022..d9bea73c33b0837d3b15c863e21f1c918405acb9 100644 (file)
@@ -432,7 +432,13 @@ move_unreachable(PyGC_Head *young, PyGC_Head *unreachable)
                         (void) traverse(op,
                                         (visitproc)visit_reachable,
                                         (void *)young);
-                        next = gc->gc.gc_next;
+                       next = gc->gc.gc_next;
+                       if (PyTuple_CheckExact(op)) {
+                               _PyTuple_MaybeUntrack(op);
+                       }
+                       else if (PyDict_CheckExact(op)) {
+                               _PyDict_MaybeUntrack(op);
+                       }
                }
                else {
                        /* This *may* be unreachable.  To make progress,
@@ -1264,6 +1270,26 @@ gc_get_objects(PyObject *self, PyObject *noargs)
        return result;
 }
 
+PyDoc_STRVAR(gc_is_tracked__doc__,
+"is_tracked(obj) -> bool\n"
+"\n"
+"Returns true if the object is tracked by the garbage collector.\n"
+"Simple atomic objects will return false.\n"
+);
+
+static PyObject *
+gc_is_tracked(PyObject *self, PyObject *obj)
+{
+       PyObject *result;
+       
+       if (PyObject_IS_GC(obj) && IS_TRACKED(obj))
+               result = Py_True;
+       else
+               result = Py_False;
+       Py_INCREF(result);
+       return result;
+}
+
 
 PyDoc_STRVAR(gc__doc__,
 "This module provides access to the garbage collector for reference cycles.\n"
@@ -1278,6 +1304,7 @@ PyDoc_STRVAR(gc__doc__,
 "set_threshold() -- Set the collection thresholds.\n"
 "get_threshold() -- Return the current the collection thresholds.\n"
 "get_objects() -- Return a list of all objects tracked by the collector.\n"
+"is_tracked() -- Returns true if a given object is tracked.\n"
 "get_referrers() -- Return the list of objects that refer to an object.\n"
 "get_referents() -- Return the list of objects that an object refers to.\n");
 
@@ -1293,6 +1320,7 @@ static PyMethodDef GcMethods[] = {
        {"collect",        (PyCFunction)gc_collect,
                METH_VARARGS | METH_KEYWORDS,           gc_collect__doc__},
        {"get_objects",    gc_get_objects,METH_NOARGS,  gc_get_objects__doc__},
+       {"is_tracked",     gc_is_tracked, METH_O,       gc_is_tracked__doc__},
        {"get_referrers",  gc_get_referrers, METH_VARARGS,
                gc_get_referrers__doc__},
        {"get_referents",  gc_get_referents, METH_VARARGS,
index f4d86835e9558f0ec3b9e973189d3109bd4d5e0e..5069c76398c57a1f1d73741d80136b067db648a7 100644 (file)
@@ -180,6 +180,24 @@ show_alloc(void)
 }
 #endif
 
+/* Debug statistic to count GC tracking of dicts */
+#ifdef SHOW_TRACK_COUNT
+static Py_ssize_t count_untracked = 0;
+static Py_ssize_t count_tracked = 0;
+
+static void
+show_track(void)
+{
+       fprintf(stderr, "Dicts created: %" PY_FORMAT_SIZE_T "d\n",
+               count_tracked + count_untracked);
+       fprintf(stderr, "Dicts tracked by the GC: %" PY_FORMAT_SIZE_T
+               "d\n", count_tracked);
+       fprintf(stderr, "%.2f%% dict tracking rate\n\n",
+               (100.0*count_tracked/(count_untracked+count_tracked)));
+}
+#endif
+
+
 /* Initialization macros.
    There are two ways to create a dict:  PyDict_New() is the main C API
    function, and the tp_new slot maps to dict_new().  In the latter case we
@@ -232,6 +250,9 @@ PyDict_New(void)
 #endif
 #ifdef SHOW_ALLOC_COUNT
                Py_AtExit(show_alloc);
+#endif
+#ifdef SHOW_TRACK_COUNT
+               Py_AtExit(show_track);
 #endif
        }
        if (numfree) {
@@ -262,10 +283,12 @@ PyDict_New(void)
 #endif
        }
        mp->ma_lookup = lookdict_string;
+#ifdef SHOW_TRACK_COUNT
+       count_untracked++;
+#endif
 #ifdef SHOW_CONVERSION_COUNTS
        ++created;
 #endif
-       _PyObject_GC_TRACK(mp);
        return (PyObject *)mp;
 }
 
@@ -433,6 +456,52 @@ lookdict_string(PyDictObject *mp, PyObject *key, register long hash)
        return 0;
 }
 
+#ifdef SHOW_TRACK_COUNT
+#define INCREASE_TRACK_COUNT \
+       (count_tracked++, count_untracked--);
+#define DECREASE_TRACK_COUNT \
+       (count_tracked--, count_untracked++);
+#else
+#define INCREASE_TRACK_COUNT
+#define DECREASE_TRACK_COUNT
+#endif
+
+#define MAINTAIN_TRACKING(mp, key, value) \
+       do { \
+               if (!_PyObject_GC_IS_TRACKED(mp)) { \
+                       if (_PyObject_GC_MAY_BE_TRACKED(key) || \
+                               _PyObject_GC_MAY_BE_TRACKED(value)) { \
+                               _PyObject_GC_TRACK(mp); \
+                               INCREASE_TRACK_COUNT \
+                       } \
+               } \
+       } while(0)
+
+void
+_PyDict_MaybeUntrack(PyObject *op)
+{
+       PyDictObject *mp;
+       PyObject *value;
+       Py_ssize_t mask, i;
+       PyDictEntry *ep;
+
+       if (!PyDict_CheckExact(op) || !_PyObject_GC_IS_TRACKED(op))
+               return;
+       
+       mp = (PyDictObject *) op;
+       ep = mp->ma_table;
+       mask = mp->ma_mask;
+       for (i = 0; i <= mask; i++) {
+               if ((value = ep[i].me_value) == NULL)
+                       continue;
+               if (_PyObject_GC_MAY_BE_TRACKED(value) ||
+                       _PyObject_GC_MAY_BE_TRACKED(ep[i].me_key))
+                       return;
+       }
+       _PyObject_GC_UNTRACK(op);
+}
+
+
 /*
 Internal routine to insert a new item into the table.
 Used both by the internal resize routine and by the public insert routine.
@@ -453,6 +522,7 @@ insertdict(register PyDictObject *mp, PyObject *key, long hash, PyObject *value)
                Py_DECREF(value);
                return -1;
        }
+       MAINTAIN_TRACKING(mp, key, value);
        if (ep->me_value != NULL) {
                old_value = ep->me_value;
                ep->me_value = value;
@@ -492,6 +562,7 @@ insertdict_clean(register PyDictObject *mp, PyObject *key, long hash,
        PyDictEntry *ep0 = mp->ma_table;
        register PyDictEntry *ep;
 
+       MAINTAIN_TRACKING(mp, key, value);
        i = hash & mask;
        ep = &ep0[i];
        for (perturb = hash; ep->me_key != NULL; perturb >>= PERTURB_SHIFT) {
@@ -2202,8 +2273,17 @@ dict_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
                assert(d->ma_table == NULL && d->ma_fill == 0 && d->ma_used == 0);
                INIT_NONZERO_DICT_SLOTS(d);
                d->ma_lookup = lookdict_string;
+               /* The object has been implicitly tracked by tp_alloc */
+               if (type == &PyDict_Type)
+                       _PyObject_GC_UNTRACK(d);
 #ifdef SHOW_CONVERSION_COUNTS
                ++created;
+#endif
+#ifdef SHOW_TRACK_COUNT
+               if (_PyObject_GC_IS_TRACKED(d))
+                       count_tracked++;
+               else
+                       count_untracked++;
 #endif
        }
        return self;
index 74d392a4e52aeeb534cafe304043c75ce02bbb0e..644d8a92f69661abb179489fb03157f7642dca2d 100644 (file)
@@ -23,11 +23,36 @@ Py_ssize_t fast_tuple_allocs;
 Py_ssize_t tuple_zero_allocs;
 #endif
 
+/* Debug statistic to count GC tracking of tuples.
+   Please note that tuples are only untracked when considered by the GC, and
+   many of them will be dead before. Therefore, a tracking rate close to 100%
+   does not necessarily prove that the heuristic is inefficient.
+*/
+#ifdef SHOW_TRACK_COUNT
+static Py_ssize_t count_untracked = 0;
+static Py_ssize_t count_tracked = 0;
+
+static void
+show_track(void)
+{
+       fprintf(stderr, "Tuples created: %" PY_FORMAT_SIZE_T "d\n",
+               count_tracked + count_untracked);
+       fprintf(stderr, "Tuples tracked by the GC: %" PY_FORMAT_SIZE_T
+               "d\n", count_tracked);
+       fprintf(stderr, "%.2f%% tuple tracking rate\n\n",
+               (100.0*count_tracked/(count_untracked+count_tracked)));
+}
+#endif
+
+
 PyObject *
 PyTuple_New(register Py_ssize_t size)
 {
        register PyTupleObject *op;
        Py_ssize_t i;
+#ifdef SHOW_TRACK_COUNT
+       count_tracked++;
+#endif
        if (size < 0) {
                PyErr_BadInternalCall();
                return NULL;
@@ -131,6 +156,32 @@ PyTuple_SetItem(register PyObject *op, register Py_ssize_t i, PyObject *newitem)
        return 0;
 }
 
+void
+_PyTuple_MaybeUntrack(PyObject *op)
+{
+       PyTupleObject *t;
+       Py_ssize_t i, n;
+       
+       if (!PyTuple_CheckExact(op) || !_PyObject_GC_IS_TRACKED(op))
+               return;
+       t = (PyTupleObject *) op;
+       n = Py_SIZE(t);
+       for (i = 0; i < n; i++) {
+               PyObject *elt = PyTuple_GET_ITEM(t, i);
+               /* Tuples with NULL elements aren't
+                  fully constructed, don't untrack
+                  them yet. */
+               if (!elt ||
+                       _PyObject_GC_MAY_BE_TRACKED(elt))
+                       return;
+       }
+#ifdef SHOW_TRACK_COUNT
+       count_tracked--;
+       count_untracked++;
+#endif
+       _PyObject_GC_UNTRACK(op);
+}
+
 PyObject *
 PyTuple_Pack(Py_ssize_t n, ...)
 {
@@ -880,6 +931,9 @@ PyTuple_Fini(void)
 
        (void)PyTuple_ClearFreeList();
 #endif
+#ifdef SHOW_TRACK_COUNT
+       show_track();
+#endif
 }
 
 /*********************** Tuple Iterator **************************/