]> granicus.if.org Git - postgresql/commitdiff
Move the guts of our Levenshtein implementation into core.
author Robert Haas <rhaas@postgresql.org>
Thu, 13 Nov 2014 17:25:10 +0000 (12:25 -0500)
committer Robert Haas <rhaas@postgresql.org>
Thu, 13 Nov 2014 17:33:26 +0000 (12:33 -0500)
The hope is that we can use this to produce better diagnostics in
some cases.

Peter Geoghegan, reviewed by Michael Paquier, with some further
changes by me.

contrib/fuzzystrmatch/Makefile
contrib/fuzzystrmatch/fuzzystrmatch.c
src/backend/utils/adt/Makefile
src/backend/utils/adt/levenshtein.c [moved from contrib/fuzzystrmatch/levenshtein.c with 85% similarity]
src/backend/utils/adt/varlena.c
src/include/utils/builtins.h

index 024265d47900b6ac3e432daf8457ec7708d689b0..0327d9510a50f37e90d2a73880a14fbb2bab29c4 100644 (file)
@@ -17,6 +17,3 @@ top_builddir = ../..
 include $(top_builddir)/src/Makefile.global
 include $(top_srcdir)/contrib/contrib-global.mk
 endif
-
-# levenshtein.c is #included by fuzzystrmatch.c
-fuzzystrmatch.o: fuzzystrmatch.c levenshtein.c
index 7a53d8a008e1193e07ca5ad38766ad7915d25434..f0df08032b697d94ead57dcecc097ffce675e60b 100644 (file)
@@ -154,23 +154,6 @@ getcode(char c)
 /* These prevent GH from becoming F */
 #define NOGHTOF(c)     (getcode(c) & 16)       /* BDH */
 
-/* Faster than memcmp(), for this use case. */
-static inline bool
-rest_of_char_same(const char *s1, const char *s2, int len)
-{
-       while (len > 0)
-       {
-               len--;
-               if (s1[len] != s2[len])
-                       return false;
-       }
-       return true;
-}
-
-#include "levenshtein.c"
-#define LEVENSHTEIN_LESS_EQUAL
-#include "levenshtein.c"
-
 PG_FUNCTION_INFO_V1(levenshtein_with_costs);
 Datum
 levenshtein_with_costs(PG_FUNCTION_ARGS)
@@ -180,8 +163,20 @@ levenshtein_with_costs(PG_FUNCTION_ARGS)
        int                     ins_c = PG_GETARG_INT32(2);
        int                     del_c = PG_GETARG_INT32(3);
        int                     sub_c = PG_GETARG_INT32(4);
-
-       PG_RETURN_INT32(levenshtein_internal(src, dst, ins_c, del_c, sub_c));
+       const char *s_data;
+       const char *t_data;
+       int                     s_bytes,
+                               t_bytes;
+
+       /* Extract a pointer to the actual character data */
+       s_data = VARDATA_ANY(src);
+       t_data = VARDATA_ANY(dst);
+       /* Determine length of each string in bytes */
+       s_bytes = VARSIZE_ANY_EXHDR(src);
+       t_bytes = VARSIZE_ANY_EXHDR(dst);
+
+       PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes, ins_c,
+                                                                          del_c, sub_c));
 }
 
 
@@ -191,8 +186,20 @@ levenshtein(PG_FUNCTION_ARGS)
 {
        text       *src = PG_GETARG_TEXT_PP(0);
        text       *dst = PG_GETARG_TEXT_PP(1);
-
-       PG_RETURN_INT32(levenshtein_internal(src, dst, 1, 1, 1));
+       const char *s_data;
+       const char *t_data;
+       int                     s_bytes,
+                               t_bytes;
+
+       /* Extract a pointer to the actual character data */
+       s_data = VARDATA_ANY(src);
+       t_data = VARDATA_ANY(dst);
+       /* Determine length of each string in bytes */
+       s_bytes = VARSIZE_ANY_EXHDR(src);
+       t_bytes = VARSIZE_ANY_EXHDR(dst);
+
+       PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes, 1, 1,
+                                                                          1));
 }
 
 
@@ -206,8 +213,21 @@ levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS)
        int                     del_c = PG_GETARG_INT32(3);
        int                     sub_c = PG_GETARG_INT32(4);
        int                     max_d = PG_GETARG_INT32(5);
-
-       PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, ins_c, del_c, sub_c, max_d));
+       const char *s_data;
+       const char *t_data;
+       int                     s_bytes,
+                               t_bytes;
+
+       /* Extract a pointer to the actual character data */
+       s_data = VARDATA_ANY(src);
+       t_data = VARDATA_ANY(dst);
+       /* Determine length of each string in bytes */
+       s_bytes = VARSIZE_ANY_EXHDR(src);
+       t_bytes = VARSIZE_ANY_EXHDR(dst);
+
+       PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes, t_data,
+                                                                                                 t_bytes, ins_c, del_c,
+                                                                                                 sub_c, max_d));
 }
 
 
@@ -218,8 +238,20 @@ levenshtein_less_equal(PG_FUNCTION_ARGS)
        text       *src = PG_GETARG_TEXT_PP(0);
        text       *dst = PG_GETARG_TEXT_PP(1);
        int                     max_d = PG_GETARG_INT32(2);
-
-       PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, 1, 1, 1, max_d));
+       const char *s_data;
+       const char *t_data;
+       int                     s_bytes,
+                               t_bytes;
+
+       /* Extract a pointer to the actual character data */
+       s_data = VARDATA_ANY(src);
+       t_data = VARDATA_ANY(dst);
+       /* Determine length of each string in bytes */
+       s_bytes = VARSIZE_ANY_EXHDR(src);
+       t_bytes = VARSIZE_ANY_EXHDR(dst);
+
+       PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes, t_data,
+                                                                                                 t_bytes, 1, 1, 1, max_d));
 }
 
 
index 7b4391bba179082c21e9ff526b1f8276687dfce4..3ea9bf435a31c7d68ec6f142044fd36540866854 100644 (file)
@@ -38,4 +38,6 @@ OBJS = acl.o arrayfuncs.o array_selfuncs.o array_typanalyze.o \
 
 like.o: like.c like_match.c
 
+varlena.o: varlena.c levenshtein.c
+
 include $(top_srcdir)/src/backend/common.mk
similarity index 85%
rename from contrib/fuzzystrmatch/levenshtein.c
rename to src/backend/utils/adt/levenshtein.c
index 4f37a54b1e446338e79da6686b4646a2fcf32fed..a8670e9a85bd7a070ee7f614725c3f6ab11399fe 100644 (file)
@@ -1,41 +1,34 @@
-/*
+/*-------------------------------------------------------------------------
+ *
  * levenshtein.c
+ *       Levenshtein distance implementation.
  *
- * Functions for "fuzzy" comparison of strings
+ * Original author:  Joe Conway <mail@joeconway.com>
  *
- * Joe Conway <mail@joeconway.com>
+ * This file is included by varlena.c twice, to provide matching code for (1)
+ * Levenshtein distance with custom costings, and (2) Levenshtein distance with
+ * custom costings and a "max" value above which exact distances are not
+ * interesting.  Before the inclusion, we rely on the presence of the inline
+ * function rest_of_char_same().
+ *
+ * Written based on a description of the algorithm by Michael Gilleland found
+ * at http://www.merriampark.com/ld.htm.  Also looked at levenshtein.c in the
+ * PHP 4.0.6 distribution for inspiration.  Configurable penalty costs
+ * extension is introduced by Volkan YAZICI <volkan.yazici@gmail.com>.
  *
  * Copyright (c) 2001-2014, PostgreSQL Global Development Group
- * ALL RIGHTS RESERVED;
  *
- * levenshtein()
- * -------------
- * Written based on a description of the algorithm by Michael Gilleland
- * found at http://www.merriampark.com/ld.htm
- * Also looked at levenshtein.c in the PHP 4.0.6 distribution for
- * inspiration.
- * Configurable penalty costs extension is introduced by Volkan
- * YAZICI <volkan.yazici@gmail.com>.
- */
-
-/*
- * External declarations for exported functions
+ * IDENTIFICATION
+ *     src/backend/utils/adt/levenshtein.c
+ *
+ *-------------------------------------------------------------------------
  */
-#ifdef LEVENSHTEIN_LESS_EQUAL
-static int levenshtein_less_equal_internal(text *s, text *t,
-                                                               int ins_c, int del_c, int sub_c, int max_d);
-#else
-static int levenshtein_internal(text *s, text *t,
-                                        int ins_c, int del_c, int sub_c);
-#endif
-
 #define MAX_LEVENSHTEIN_STRLEN         255
 
-
 /*
- * Calculates Levenshtein distance metric between supplied strings. Generally
- * (1, 1, 1) penalty costs suffices for common cases, but your mileage may
- * vary.
+ * Calculates Levenshtein distance metric between supplied strings, which are
+ * not necessarily null-terminated.  Generally (1, 1, 1) penalty costs suffice
+ * for common cases, but your mileage may vary.
  *
  * One way to compute Levenshtein distance is to incrementally construct
  * an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number
@@ -63,30 +56,27 @@ static int levenshtein_internal(text *s, text *t,
  * identify the portion of the matrix close to the diagonal which can still
  * affect the final answer.
  */
-static int
+int
 #ifdef LEVENSHTEIN_LESS_EQUAL
-levenshtein_less_equal_internal(text *s, text *t,
-                                                               int ins_c, int del_c, int sub_c, int max_d)
+varstr_levenshtein_less_equal(const char *source, int slen, const char *target,
+                                                         int tlen, int ins_c, int del_c, int sub_c,
+                                                         int max_d)
 #else
-levenshtein_internal(text *s, text *t,
-                                        int ins_c, int del_c, int sub_c)
+varstr_levenshtein(const char *source, int slen, const char *target, int tlen,
+                                  int ins_c, int del_c, int sub_c)
 #endif
 {
        int                     m,
-                               n,
-                               s_bytes,
-                               t_bytes;
+                               n;
        int                *prev;
        int                *curr;
        int                *s_char_len = NULL;
        int                     i,
                                j;
-       const char *s_data;
-       const char *t_data;
        const char *y;
 
        /*
-        * For levenshtein_less_equal_internal, we have real variables called
+        * For varstr_levenshtein_less_equal, we have real variables called
         * start_column and stop_column; otherwise it's just short-hand for 0 and
         * m.
         */
@@ -105,15 +95,8 @@ levenshtein_internal(text *s, text *t,
 #define STOP_COLUMN m
 #endif
 
-       /* Extract a pointer to the actual character data. */
-       s_data = VARDATA_ANY(s);
-       t_data = VARDATA_ANY(t);
-
-       /* Determine length of each string in bytes and characters. */
-       s_bytes = VARSIZE_ANY_EXHDR(s);
-       t_bytes = VARSIZE_ANY_EXHDR(t);
-       m = pg_mbstrlen_with_len(s_data, s_bytes);
-       n = pg_mbstrlen_with_len(t_data, t_bytes);
+       m = pg_mbstrlen_with_len(source, slen);
+       n = pg_mbstrlen_with_len(target, tlen);
 
        /*
         * We can transform an empty s into t with n insertions, or a non-empty t
@@ -193,10 +176,10 @@ levenshtein_internal(text *s, text *t,
         * multi-byte characters, we still build the array, so that the fast-path
         * needn't deal with the case where the array hasn't been initialized.
         */
-       if (m != s_bytes || n != t_bytes)
+       if (m != slen || n != tlen)
        {
                int                     i;
-               const char *cp = s_data;
+               const char *cp = source;
 
                s_char_len = (int *) palloc((m + 1) * sizeof(int));
                for (i = 0; i < m; ++i)
@@ -223,11 +206,11 @@ levenshtein_internal(text *s, text *t,
                prev[i] = i * del_c;
 
        /* Loop through rows of the notional array */
-       for (y = t_data, j = 1; j < n; j++)
+       for (y = target, j = 1; j < n; j++)
        {
                int                *temp;
-               const char *x = s_data;
-               int                     y_char_len = n != t_bytes + 1 ? pg_mblen(y) : 1;
+               const char *x = source;
+               int                     y_char_len = n != tlen + 1 ? pg_mblen(y) : 1;
 
 #ifdef LEVENSHTEIN_LESS_EQUAL
 
@@ -384,7 +367,7 @@ levenshtein_internal(text *s, text *t,
                                prev[start_column] = max_d + 1;
                                curr[start_column] = max_d + 1;
                                if (start_column != 0)
-                                       s_data += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1;
+                                       source += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1;
                                start_column++;
                        }
 
index c3171b549a6feb9ef625388bd8f78c33a5a157b6..b3f397e9595270750181029542e2f0fc83e821a8 100644 (file)
@@ -1546,7 +1546,6 @@ varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid)
        return result;
 }
 
-
 /* text_cmp()
  * Internal comparison function for text strings.
  * Returns -1, 0 or 1
@@ -4747,3 +4746,24 @@ text_format_nv(PG_FUNCTION_ARGS)
 {
        return text_format(fcinfo);
 }
+
+/*
+ * Helper function for Levenshtein distance functions. Faster than memcmp(),
+ * for this use case.
+ */
+static inline bool
+rest_of_char_same(const char *s1, const char *s2, int len)
+{
+       while (len > 0)
+       {
+               len--;
+               if (s1[len] != s2[len])
+                       return false;
+       }
+       return true;
+}
+
+/* Expand each Levenshtein distance variant */
+#include "levenshtein.c"
+#define LEVENSHTEIN_LESS_EQUAL
+#include "levenshtein.c"
index 3ba34f88eec7be1b26c8d481990a8b91ec2d5b2b..417fd1771a8ed2efab0ae3822d0d09570b382c80 100644 (file)
@@ -786,6 +786,11 @@ extern Datum textoverlay_no_len(PG_FUNCTION_ARGS);
 extern Datum name_text(PG_FUNCTION_ARGS);
 extern Datum text_name(PG_FUNCTION_ARGS);
 extern int     varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid);
+extern int     varstr_levenshtein(const char *source, int slen, const char *target,
+                                  int tlen, int ins_c, int del_c, int sub_c);
+extern int     varstr_levenshtein_less_equal(const char *source, int slen,
+                                                         const char *target, int tlen, int ins_c,
+                                                         int del_c, int sub_c, int max_d);
 extern List *textToQualifiedNameList(text *textval);
 extern bool SplitIdentifierString(char *rawstring, char separator,
                                          List **namelist);