Bring comments up to date (e.g., they still said the table had to be

author Tim Peters <tim.peters@gmail.com>

Wed, 13 Dec 2000 01:02:46 +0000 (01:02 +0000)

committer Tim Peters <tim.peters@gmail.com>

Wed, 13 Dec 2000 01:02:46 +0000 (01:02 +0000)
author Tim Peters <tim.peters@gmail.com>
Wed, 13 Dec 2000 01:02:46 +0000 (01:02 +0000)
committer Tim Peters <tim.peters@gmail.com>
Wed, 13 Dec 2000 01:02:46 +0000 (01:02 +0000)
diff --git a/Objects/dictobject.c b/Objects/dictobject.c

index 924928fab0c0e547a34d7b9d7ad880939cbb9326..32bf647bffd62f5b2290524af958c93d3b8163c1 100644 (file)
--- a/Objects/dictobject.c
+++ b/Objects/dictobject.c
@@ -15,7 +15,7 @@
  
  /*
  Table of irreducible polynomials to efficiently cycle through
-GF(2^n)-{0}, 2<=n<=30.
+GF(2^n)-{0}, 2<=n<=30.  A table size is always a power of 2.
  */
  static long polys[] = {
         4 + 3,
@@ -54,13 +54,26 @@ static long polys[] = {
  static PyObject *dummy; /* Initialized by first call to newdictobject() */
  
  /*
-Invariant for entries: when in use, me_value is not NULL and me_key is
-not NULL and not dummy; when not in use, me_value is NULL and me_key
-is either NULL or dummy.  A dummy key value cannot be replaced by
-NULL, since otherwise other keys may be lost.
+There are three kinds of slots in the table:
+
+1. Unused.  me_key == me_value == NULL
+   Does not hold an active (key, value) pair now and never did.  Unused can
+   transition to Active upon key insertion.  This is the only case in which
+   me_key is NULL, and is each slot's initial state.
+
+2. Active.  me_key != NULL and me_key != dummy and me_value != NULL
+   Holds an active (key, value) pair.  Active can transition to Dummy upon
+   key deletion.  This is the only case in which me_value != NULL.
+
+3. Dummy.  me_key == dummy && me_value == NULL
+   Previously held an active (key, value) pair, but that was deleted and an
+   active pair has not yet overwritten the slot.  Dummy can transition to
+   Active upon key insertion.  Dummy slots cannot be made Unused again
+   (cannot have me_key set to NULL), else the probe sequence in case of
+   collision would have no way to know they were once active.
  */
  typedef struct {
-       long me_hash;
+       long me_hash;      /* cached hash code of me_key */
         PyObject *me_key;
         PyObject *me_value;
  #ifdef USE_CACHE_ALIGNED
@@ -69,20 +82,21 @@ typedef struct {
  } dictentry;
  
  /*
-To ensure the lookup algorithm terminates, the table size must be a
-prime number and there must be at least one NULL key in the table.
-The value ma_fill is the number of non-NULL keys; ma_used is the number
-of non-NULL, non-dummy keys.
-To avoid slowing down lookups on a near-full table, we resize the table
-when it is more than half filled.
+To ensure the lookup algorithm terminates, there must be at least one Unsused
+slot (NULL key) in the table.
+The value ma_fill is the number of non-NULL keys (sum of Active and Dummy);
+ma_used is the number of non-NULL, non-dummy keys (== the number of non-NULL
+values == the number of Active items).
+To avoid slowing down lookups on a near-full table, we resize the table when
+it is more than half filled.
  */
  typedef struct dictobject dictobject;
  struct dictobject {
         PyObject_HEAD
-       int ma_fill;
-       int ma_used;
-       int ma_size;
-       int ma_poly;
+       int ma_fill;  /* # Active + # Dummy */
+       int ma_used;  /* # Active */
+       int ma_size;  /* total # slots in ma_table */
+       int ma_poly;  /* appopriate entry from polys vector */
         dictentry *ma_table;
         dictentry *(*ma_lookup)(dictobject *mp, PyObject *key, long hash);
  };
@@ -138,12 +152,12 @@ This is based on Algorithm D from Knuth Vol. 3, Sec. 6.4.
  Open addressing is preferred over chaining since the link overhead for
  chaining would be substantial (100% with typical malloc overhead).
  However, instead of going through the table at constant steps, we cycle
-through the values of GF(2^n)-{0}. This avoids modulo computations, being
+through the values of GF(2^n).  This avoids modulo computations, being
  much cheaper on RISC machines, without leading to clustering.
  
  The initial probe index is computed as hash mod the table size.
-Subsequent probe indices use the values of x^i in GF(2^n) as an offset,
-where x is a root. The initial value is derived from hash, too.
+Subsequent probe indices use the values of x^i in GF(2^n)-{0} as an offset,
+where x is a root. The initial offset is derived from hash, too.
  
  All arithmetic on hash should ignore overflow.
  
@@ -168,11 +182,14 @@ lookdict(dictobject *mp, PyObject *key, register long hash)
         register int cmp;
         PyObject *err_type, *err_value, *err_tb;
         /* We must come up with (i, incr) such that 0 <= i < ma_size
-          and 0 < incr < ma_size and both are a function of hash */
+          and 0 < incr < ma_size and both are a function of hash.
+          i is the initial table index and incr the initial probe offset. */
         i = (~hash) & mask;
         /* We use ~hash instead of hash, as degenerate hash functions, such
            as for ints <sigh>, can have lots of leading zeros. It's not
-          really a performance risk, but better safe than sorry. */
+          really a performance risk, but better safe than sorry.
+          12-Dec-00 tim:  so ~hash produces lots of leading ones instead --
+          what's the gain? */
         ep = &ep0[i];
         if (ep->me_key == NULL || ep->me_key == key)
                 return ep;
@@ -514,8 +531,8 @@ PyDict_DelItem(PyObject *op, PyObject *key)
         old_value = ep->me_value;
         ep->me_value = NULL;
         mp->ma_used--;
-       Py_DECREF(old_value); 
-       Py_DECREF(old_key); 
+       Py_DECREF(old_value);
+       Py_DECREF(old_key);
         return 0;
  }
author	Tim Peters <tim.peters@gmail.com>
	Wed, 13 Dec 2000 01:02:46 +0000 (01:02 +0000)
committer	Tim Peters <tim.peters@gmail.com>
	Wed, 13 Dec 2000 01:02:46 +0000 (01:02 +0000)