--- /dev/null
+#!/usr/bin/env/python
+
+# perfect_hash.py
+#
+# Outputs C code for a minimal perfect hash.
+# The hash is produced using the algorithm described in
+# "Optimal algorithms for minimal perfect hashing",
+# G. Havas, B.S. Majewski. Available as a technical report
+# from the CS department, University of Queensland
+# (ftp://ftp.cs.uq.oz.au/).
+#
+# This is a modified version of Andrew Kuchling's code
+# (http://starship.python.net/crew/amk/python/code/perfect-hash.html)
+# and generates C fragments suitable for compilation as a Python
+# extension module.
+#
+
+# Difference between this algorithm and gperf:
+# Gperf will complete in finite time with a successful function,
+# or by giving up.
+# This algorithm may never complete, although it is extremely likely
+# when c >= 2.
+
+# The algorithm works like this:
+# 0) You have K keys, that you want to perfectly hash to a bunch
+# of hash values.
+#
+# 1) Choose a number N larger than K. This is the number of
+# vertices in a graph G, and also the size of the resulting table.
+#
+# 2) Pick two random hash functions f1, f2, that output values from
+# 0...N-1.
+#
+# 3) for key in keys:
+# h1 = f1(key) ; h2 = f2(key)
+# Draw an edge between vertices h1 and h2 of the graph.
+# Associate the desired hash value with that edge.
+#
+# 4) Check if G is acyclic; if not, go back to step 1 and pick a bigger N.
+#
+# 5) Assign values to each vertex such that, for each edge, you can
+# add the values for the two vertices and get the desired value
+# for that edge -- which is the desired hash key. This task is
+# dead easy, because the graph is acyclic. This is done by
+# picking a vertex V, and assigning it a value of 0. You then do a
+# depth-first search, assigning values to new vertices so that
+# they sum up properly.
+#
+# 6) f1, f2, and G now make up your perfect hash function.
+
+
+import sys, whrandom, string
+import pprint
+import perfhash
+import time
+
+class Hash:
+ """Random hash function
+ For simplicity and speed, this doesn't implement any byte-level hashing
+ scheme. Instead, a random string is generated and prefixing to
+ str(key), and then Python's hashing function is used."""
+
+ def __init__(self, N, caseInsensitive=0):
+ self.N = N
+ junk = ""
+ for i in range(10):
+ junk = junk + whrandom.choice(string.letters + string.digits)
+ self.junk = junk
+ self.caseInsensitive = caseInsensitive
+ self.seed = perfhash.calcSeed(junk)
+
+ def __call__(self, key):
+ key = str(key)
+ if self.caseInsensitive:
+ key = string.upper(key)
+ x = perfhash.hash(self.seed, len(self.junk), key) % self.N
+ #h = hash(self.junk + key) % self.N
+ #assert x == h
+ return x
+
+ def generate_code(self):
+ s = """{
+ register int len;
+ register unsigned char *p;
+ register long x;
+
+ len = cch;
+ p = (unsigned char *) key;
+ x = %(junkSeed)d;
+ while (--len >= 0)
+ x = (1000003*x) ^ """ % \
+ {
+ "lenJunk" : len(self.junk),
+ "junkSeed" : self.seed,
+ }
+
+ if self.caseInsensitive:
+ s = s + "toupper(*(p++));"
+ else:
+ s = s + "*(p++);"
+ s = s + """
+ x ^= cch + %(lenJunk)d;
+ if (x == -1)
+ x = -2;
+ x %%= k_cHashElements;
+ /* ensure the returned value is positive so we mimic Python's %% operator */
+ if (x < 0)
+ x += k_cHashElements;
+ return x;
+}
+""" % { "lenJunk" : len(self.junk),
+ "junkSeed" : self.seed, }
+ return s
+
+
+WHITE, GREY, BLACK = 0,1,2
+class Graph:
+ """Graph class. This class isn't particularly efficient or general,
+ and only has the features I needed to implement this algorithm.
+
+ num_vertices -- number of vertices
+ edges -- maps 2-tuples of vertex numbers to the value for this
+ edge. If there's an edge between v1 and v2 (v1<v2),
+ (v1,v2) is a key and the value is the edge's value.
+ reachable_list -- maps a vertex V to the list of vertices
+ to which V is connected by edges. Used
+ for traversing the graph.
+ values -- numeric value for each vertex
+ """
+
+ def __init__(self, num_vertices):
+ self.num_vertices = num_vertices
+ self.edges = {}
+ self.reachable_list = {}
+ self.values = [-1] * num_vertices
+
+ def connect(self, vertex1, vertex2, value):
+ """Connect 'vertex1' and 'vertex2' with an edge, with associated
+ value 'value'"""
+
+ if vertex1 > vertex2: vertex1, vertex2 = vertex2, vertex1
+# if self.edges.has_key( (vertex1, vertex2) ):
+# raise ValueError, 'Collision: vertices already connected'
+ self.edges[ (vertex1, vertex2) ] = value
+
+ # Add vertices to each other's reachable list
+ if not self.reachable_list.has_key( vertex1 ):
+ self.reachable_list[ vertex1 ] = [vertex2]
+ else:
+ self.reachable_list[vertex1].append(vertex2)
+
+ if not self.reachable_list.has_key( vertex2 ):
+ self.reachable_list[ vertex2 ] = [vertex1]
+ else:
+ self.reachable_list[vertex2].append(vertex1)
+
+ def get_edge_value(self, vertex1, vertex2):
+ """Retrieve the value corresponding to the edge between
+ 'vertex1' and 'vertex2'. Raises KeyError if no such edge"""
+ if vertex1 > vertex2:
+ vertex1, vertex2 = vertex2, vertex1
+ return self.edges[ (vertex1, vertex2) ]
+
+ def is_acyclic(self):
+ "Returns true if the graph is acyclic, otherwise false"
+
+ # This is done by doing a depth-first search of the graph;
+ # painting each vertex grey and then black. If the DFS
+ # ever finds a vertex that isn't white, there's a cycle.
+ colour = {}
+ for i in range(self.num_vertices): colour[i] = WHITE
+
+ # Loop over all vertices, taking white ones as starting
+ # points for a traversal.
+ for i in range(self.num_vertices):
+ if colour[i] == WHITE:
+
+ # List of vertices to visit
+ visit_list = [ (None,i) ]
+
+ # Do a DFS
+ while visit_list:
+ # Colour this vertex grey.
+ parent, vertex = visit_list[0] ; del visit_list[0]
+ colour[vertex] = GREY
+
+ # Make copy of list of neighbours, removing the vertex
+ # we arrived here from.
+ neighbours = self.reachable_list.get(vertex, []) [:]
+ if parent in neighbours: neighbours.remove( parent )
+
+ for neighbour in neighbours:
+ if colour[neighbour] == WHITE:
+ visit_list.insert(0, (vertex, neighbour) )
+ elif colour[neighbour] != WHITE:
+ # Aha! Already visited this node,
+ # so the graph isn't acyclic.
+ return 0
+
+ colour[vertex] = BLACK
+
+ # We got through, so the graph is acyclic.
+ return 1
+
+ def assign_values(self):
+ """Compute values for each vertex, so that they sum up
+ properly to the associated value for each edge."""
+
+ # Also done with a DFS; I simply copied the DFS code
+ # from is_acyclic(). (Should generalize the logic so
+ # one function could be used from both methods,
+ # but I couldn't be bothered.)
+
+ colour = {}
+ for i in range(self.num_vertices): colour[i] = WHITE
+
+ # Loop over all vertices, taking white ones as starting
+ # points for a traversal.
+ for i in range(self.num_vertices):
+ if colour[i] == WHITE:
+ # Set this vertex's value, arbitrarily, to zero.
+ self.set_vertex_value( i, 0 )
+
+ # List of vertices to visit
+ visit_list = [ (None,i) ]
+
+ # Do a DFS
+ while visit_list:
+ # Colour this vertex grey.
+ parent, vertex = visit_list[0] ; del visit_list[0]
+ colour[vertex] = GREY
+
+ # Make copy of list of neighbours, removing the vertex
+ # we arrived here from.
+ neighbours = self.reachable_list.get(vertex, []) [:]
+ if parent in neighbours: neighbours.remove( parent )
+
+ for neighbour in self.reachable_list.get(vertex, []):
+ edge_value = self.get_edge_value( vertex, neighbour )
+ if colour[neighbour] == WHITE:
+ visit_list.insert(0, (vertex, neighbour) )
+
+ # Set new vertex's value to the desired
+ # edge value, minus the value of the
+ # vertex we came here from.
+ new_val = (edge_value -
+ self.get_vertex_value( vertex ) )
+ self.set_vertex_value( neighbour,
+ new_val % self.num_vertices)
+
+ colour[vertex] = BLACK
+
+ # Returns nothing
+ return
+
+ def __getitem__(self, index):
+ if index < self.num_vertices: return index
+ raise IndexError
+
+ def get_vertex_value(self, vertex):
+ "Get value for a vertex"
+ return self.values[ vertex ]
+
+ def set_vertex_value(self, vertex, value):
+ "Set value for a vertex"
+ self.values[ vertex ] = value
+
+ def generate_code(self, out, width = 70):
+ "Return nicely formatted table"
+ out.write("{ ")
+ pos = 0
+ for v in self.values:
+ v=str(v)+', '
+ out.write(v)
+ pos = pos + len(v) + 1
+ if pos > width: out.write('\n '); pos = 0
+ out.write('};\n')
+
+
+class PerfectHash:
+ def __init__(self, cchMax, f1, f2, G, cHashElements, cKeys, maxHashValue):
+ self.cchMax = cchMax
+ self.f1 = f1
+ self.f2 = f2
+ self.G = G
+ self.cHashElements = cHashElements
+ self.cKeys = cKeys
+ # determine the necessary type for storing our hash function
+ # helper table:
+ self.type = self.determineType(maxHashValue)
+
+ def generate_header(self, structName):
+ header = """
+#include <Python.h>
+#include <stdlib.h>
+
+/* --- C API ----------------------------------------------------*/
+/* C API for usage by other Python modules */
+typedef struct %(structName)s
+{
+ unsigned long cKeys;
+ unsigned long cchMax;
+ unsigned long (*hash)(const char *key, unsigned int cch);
+ const void *(*getValue)(unsigned long iKey);
+} %(structName)s;
+""" % { "structName" : structName }
+ return header
+
+ def determineType(self, maxHashValue):
+ if maxHashValue <= 255:
+ return "unsigned char"
+ elif maxHashValue <= 65535:
+ return "unsigned short"
+ else:
+ # Take the cheesy way out...
+ return "unsigned long"
+
+ def generate_code(self, moduleName, dataArrayName, dataArrayType, structName):
+ # Output C code for the hash functions and tables
+ code = """
+/*
+ * The hash is produced using the algorithm described in
+ * "Optimal algorithms for minimal perfect hashing",
+ * G. Havas, B.S. Majewski. Available as a technical report
+ * from the CS department, University of Queensland
+ * (ftp://ftp.cs.uq.oz.au/).
+ *
+ * Generated using a heavily tweaked version of Andrew Kuchling's
+ * perfect_hash.py:
+ * http://starship.python.net/crew/amk/python/code/perfect-hash.html
+ *
+ * Generated on: %s
+ */
+""" % time.ctime(time.time())
+ # MSVC SP3 was complaining when I actually used a global constant
+ code = code + """
+#define k_cHashElements %i
+#define k_cchMaxKey %d
+#define k_cKeys %i
+
+""" % (self.cHashElements, self.cchMax, self.cKeys)
+
+ code = code + """
+static const %s G[k_cHashElements];
+static const %s %s[k_cKeys];
+""" % (self.type, dataArrayType, dataArrayName)
+
+ code = code + """
+static long f1(const char *key, unsigned int cch)
+"""
+ code = code + self.f1.generate_code()
+ code = code + """
+
+static long f2(const char *key, unsigned int cch)
+"""
+ code = code + self.f2.generate_code()
+ code = code + """
+
+static unsigned long hash(const char *key, unsigned int cch)
+{
+ return ((unsigned long)(G[ f1(key, cch) ]) + (unsigned long)(G[ f2(key, cch) ]) ) %% k_cHashElements;
+}
+
+const void *getValue(unsigned long iKey)
+{
+ return &%(dataArrayName)s[iKey];
+}
+
+/* Helper for adding objects to dictionaries. Check for errors with
+ PyErr_Occurred() */
+static
+void insobj(PyObject *dict,
+ char *name,
+ PyObject *v)
+{
+ PyDict_SetItemString(dict, name, v);
+ Py_XDECREF(v);
+}
+
+static const %(structName)s hashAPI =
+{
+ k_cKeys,
+ k_cchMaxKey,
+ &hash,
+ &getValue,
+};
+
+static
+PyMethodDef Module_methods[] =
+{
+ {NULL, NULL},
+};
+
+static char *Module_docstring = "%(moduleName)s hash function module";
+
+/* Error reporting for module init functions */
+
+#define Py_ReportModuleInitError(modname) { \\
+ PyObject *exc_type, *exc_value, *exc_tb; \\
+ PyObject *str_type, *str_value; \\
+ \\
+ /* Fetch error objects and convert them to strings */ \\
+ PyErr_Fetch(&exc_type, &exc_value, &exc_tb); \\
+ if (exc_type && exc_value) { \\
+ str_type = PyObject_Str(exc_type); \\
+ str_value = PyObject_Str(exc_value); \\
+ } \\
+ else { \\
+ str_type = NULL; \\
+ str_value = NULL; \\
+ } \\
+ /* Try to format a more informative error message using the \\
+ original error */ \\
+ if (str_type && str_value && \\
+ PyString_Check(str_type) && PyString_Check(str_value)) \\
+ PyErr_Format( \\
+ PyExc_ImportError, \\
+ "initialization of module "modname" failed " \\
+ "(%%s:%%s)", \\
+ PyString_AS_STRING(str_type), \\
+ PyString_AS_STRING(str_value)); \\
+ else \\
+ PyErr_SetString( \\
+ PyExc_ImportError, \\
+ "initialization of module "modname" failed"); \\
+ Py_XDECREF(str_type); \\
+ Py_XDECREF(str_value); \\
+ Py_XDECREF(exc_type); \\
+ Py_XDECREF(exc_value); \\
+ Py_XDECREF(exc_tb); \\
+}
+
+
+/* Create PyMethodObjects and register them in the module\'s dict */
+DL_EXPORT(void)
+init%(moduleName)s(void)
+{
+ PyObject *module, *moddict;
+ /* Create module */
+ module = Py_InitModule4("%(moduleName)s", /* Module name */
+ Module_methods, /* Method list */
+ Module_docstring, /* Module doc-string */
+ (PyObject *)NULL, /* always pass this as *self */
+ PYTHON_API_VERSION); /* API Version */
+ if (module == NULL)
+ goto onError;
+ /* Add some constants to the module\'s dict */
+ moddict = PyModule_GetDict(module);
+ if (moddict == NULL)
+ goto onError;
+
+ /* Export C API */
+ insobj(
+ moddict,
+ "%(moduleName)sAPI",
+ PyCObject_FromVoidPtr((void *)&hashAPI, NULL));
+
+onError:
+ /* Check for errors and report them */
+ if (PyErr_Occurred())
+ Py_ReportModuleInitError("%(moduleName)s");
+ return;
+}
+""" % { "moduleName" : moduleName,
+ "dataArrayName" : dataArrayName,
+ "structName" : structName, }
+
+ return code
+
+ def generate_graph(self, out):
+ out.write("""
+static const unsigned short G[] =
+""")
+ self.G.generate_code(out)
+
+
+def generate_hash(keys, caseInsensitive=0,
+ minC=None, initC=None,
+ f1Seed=None, f2Seed=None,
+ cIncrement=None, cTries=None):
+ """Print out code for a perfect minimal hash. Input is a list of
+ (key, desired hash value) tuples. """
+
+ # K is the number of keys.
+ K = len(keys)
+
+ # We will be generating graphs of size N, where N = c * K.
+ # The larger C is, the fewer trial graphs will need to be made, but
+ # the resulting table is also larger. Increase this starting value
+ # if you're impatient. After 50 failures, c will be increased by 0.025.
+ if initC is None:
+ initC = 1.5
+
+ c = initC
+ if cIncrement is None:
+ cIncrement = 0.0025
+
+ if cTries is None:
+ cTries = 50
+
+ # Number of trial graphs so far
+ num_graphs = 0
+ sys.stderr.write('Generating graphs... ')
+
+ while 1:
+ # N is the number of vertices in the graph G
+ N = int(c*K)
+ num_graphs = num_graphs + 1
+ if (num_graphs % cTries) == 0:
+ # Enough failures at this multiplier,
+ # increase the multiplier and keep trying....
+ c = c + cIncrement
+
+ # Whats good with searching for a better
+ # hash function if we exceed the size
+ # of a function we've generated in the past....
+ if minC is not None and \
+ c > minC:
+ c = initC
+ sys.stderr.write(' -- c > minC, resetting c to %0.4f\n' % c)
+ else:
+ sys.stderr.write(' -- increasing c to %0.4f\n' % c)
+ sys.stderr.write('Generating graphs... ')
+
+ # Output a progress message
+ sys.stderr.write( str(num_graphs) + ' ')
+ sys.stderr.flush()
+
+ # Create graph w/ N vertices
+ G = Graph(N)
+ # Save the seeds used to generate
+ # the following two hash functions.
+ _seeds = whrandom._inst._seed
+
+ # Create 2 random hash functions
+ f1 = Hash(N, caseInsensitive)
+ f2 = Hash(N, caseInsensitive)
+
+ # Set the initial hash function seed values if passed in.
+ # Doing this protects our hash functions from
+ # changes to whrandom's behavior.
+ if f1Seed is not None:
+ f1.seed = f1Seed
+ f1Seed = None
+ fSpecifiedSeeds = 1
+ if f2Seed is not None:
+ f2.seed = f2Seed
+ f2Seed = None
+ fSpecifiedSeeds = 1
+
+ # Connect vertices given by the values of the two hash functions
+ # for each key. Associate the desired hash value with each
+ # edge.
+ for k, v in keys:
+ h1 = f1(k) ; h2 = f2(k)
+ G.connect( h1,h2, v)
+
+ # Check if the resulting graph is acyclic; if it is,
+ # we're done with step 1.
+ if G.is_acyclic():
+ break
+ elif fSpecifiedSeeds:
+ sys.stderr.write('\nThe initial f1/f2 seeds you specified didn\'t generate a perfect hash function: \n')
+ sys.stderr.write('f1 seed: %s\n' % f1.seed)
+ sys.stderr.write('f2 seed: %s\n' % f2.seed)
+ sys.stderr.write('multipler: %s\n' % c)
+ sys.stderr.write('Your data has likely changed, or you forgot what your initial multiplier should be.\n')
+ sys.stderr.write('continuing the search for a perfect hash function......\n')
+ fSpecifiedSeeds = 0
+
+ # Now we have an acyclic graph, so we assign values to each vertex
+ # such that, for each edge, you can add the values for the two vertices
+ # involved and get the desired value for that edge -- which is the
+ # desired hash key. This task is dead easy, because the graph is acyclic.
+ sys.stderr.write('\nAcyclic graph found; computing vertex values...\n')
+ G.assign_values()
+
+ sys.stderr.write('Checking uniqueness of hash values...\n')
+
+ # Sanity check the result by actually verifying that all the keys
+ # hash to the right value.
+ cchMaxKey = 0
+ maxHashValue = 0
+
+ for k, v in keys:
+ hash1 = G.values[ f1(k) ]
+ hash2 = G.values[ f2(k) ]
+ if hash1 > maxHashValue:
+ maxHashValue = hash1
+ if hash2 > maxHashValue:
+ maxHashValue = hash2
+ perfecthash = (hash1 + hash2) % N
+ assert perfecthash == v
+ cch = len(k)
+ if cch > cchMaxKey:
+ cchMaxKey = cch
+
+ sys.stderr.write('Found perfect hash function!\n')
+ sys.stderr.write('\nIn order to regenerate this hash function, \n')
+ sys.stderr.write('you need to pass these following values back in:\n')
+ sys.stderr.write('f1 seed: %s\n' % repr(f1.seed))
+ sys.stderr.write('f2 seed: %s\n' % repr(f2.seed))
+ sys.stderr.write('initial multipler: %s\n' % c)
+
+ return PerfectHash(cchMaxKey, f1, f2, G, N, len(keys), maxHashValue)
+
+"""
+static
+PyObject *codec_tuple(PyObject *unicode,
+ int len)
+{
+ PyObject *v,*w;
+
+ if (unicode == NULL)
+ return NULL;
+ v = PyTuple_New(2);
+ if (v == NULL) {
+ Py_DECREF(unicode);
+ return NULL;
+ }
+ PyTuple_SET_ITEM(v,0,unicode);
+ w = PyInt_FromLong(len);
+ if (w == NULL) {
+ Py_DECREF(v);
+ return NULL;
+ }
+ PyTuple_SET_ITEM(v,1,w);
+ return v;
+}
+
+static PyObject *
+ucn_decode(PyObject *self,
+ PyObject *args)
+{
+ const char *data;
+ int size;
+ const char *errors = NULL;
+ PyObject *mapping = NULL;
+
+ if (!PyArg_ParseTuple(args, "t#|z:ucn_decode",
+ &data, &size, &errors))
+ return NULL;
+ if (mapping == Py_None)
+ mapping = NULL;
+
+ return codec_tuple(PyUnicode_DecodeNamedUnicodeEscape(data, size, errors),
+ size);
+}
+
+
+static PyMethodDef _codecs_functions[] = {
+ { "ucn_decode", ucn_decode, 1 },
+};
+
+DL_EXPORT(void)
+init_ucn()
+{
+ Py_InitModule("_ucn", _codecs_functions);
+}
+
+"""
+
+
+