From: Robert Haas Date: Thu, 13 Nov 2014 17:25:10 +0000 (-0500) Subject: Move the guts of our Levenshtein implementation into core. X-Git-Tag: REL9_5_ALPHA1~1223 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=c0828b78e930a4e085ec52f19fdc850104cb0659;p=postgresql Move the guts of our Levenshtein implementation into core. The hope is that we can use this to produce better diagnostics in some cases. Peter Geoghegan, reviewed by Michael Paquier, with some further changes by me. --- diff --git a/contrib/fuzzystrmatch/Makefile b/contrib/fuzzystrmatch/Makefile index 024265d479..0327d9510a 100644 --- a/contrib/fuzzystrmatch/Makefile +++ b/contrib/fuzzystrmatch/Makefile @@ -17,6 +17,3 @@ top_builddir = ../.. include $(top_builddir)/src/Makefile.global include $(top_srcdir)/contrib/contrib-global.mk endif - -# levenshtein.c is #included by fuzzystrmatch.c -fuzzystrmatch.o: fuzzystrmatch.c levenshtein.c diff --git a/contrib/fuzzystrmatch/fuzzystrmatch.c b/contrib/fuzzystrmatch/fuzzystrmatch.c index 7a53d8a008..f0df08032b 100644 --- a/contrib/fuzzystrmatch/fuzzystrmatch.c +++ b/contrib/fuzzystrmatch/fuzzystrmatch.c @@ -154,23 +154,6 @@ getcode(char c) /* These prevent GH from becoming F */ #define NOGHTOF(c) (getcode(c) & 16) /* BDH */ -/* Faster than memcmp(), for this use case. 
*/ -static inline bool -rest_of_char_same(const char *s1, const char *s2, int len) -{ - while (len > 0) - { - len--; - if (s1[len] != s2[len]) - return false; - } - return true; -} - -#include "levenshtein.c" -#define LEVENSHTEIN_LESS_EQUAL -#include "levenshtein.c" - PG_FUNCTION_INFO_V1(levenshtein_with_costs); Datum levenshtein_with_costs(PG_FUNCTION_ARGS) @@ -180,8 +163,20 @@ levenshtein_with_costs(PG_FUNCTION_ARGS) int ins_c = PG_GETARG_INT32(2); int del_c = PG_GETARG_INT32(3); int sub_c = PG_GETARG_INT32(4); - - PG_RETURN_INT32(levenshtein_internal(src, dst, ins_c, del_c, sub_c)); + const char *s_data; + const char *t_data; + int s_bytes, + t_bytes; + + /* Extract a pointer to the actual character data */ + s_data = VARDATA_ANY(src); + t_data = VARDATA_ANY(dst); + /* Determine length of each string in bytes and characters */ + s_bytes = VARSIZE_ANY_EXHDR(src); + t_bytes = VARSIZE_ANY_EXHDR(dst); + + PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes, ins_c, + del_c, sub_c)); } @@ -191,8 +186,20 @@ levenshtein(PG_FUNCTION_ARGS) { text *src = PG_GETARG_TEXT_PP(0); text *dst = PG_GETARG_TEXT_PP(1); - - PG_RETURN_INT32(levenshtein_internal(src, dst, 1, 1, 1)); + const char *s_data; + const char *t_data; + int s_bytes, + t_bytes; + + /* Extract a pointer to the actual character data */ + s_data = VARDATA_ANY(src); + t_data = VARDATA_ANY(dst); + /* Determine length of each string in bytes and characters */ + s_bytes = VARSIZE_ANY_EXHDR(src); + t_bytes = VARSIZE_ANY_EXHDR(dst); + + PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes, 1, 1, + 1)); } @@ -206,8 +213,21 @@ levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS) int del_c = PG_GETARG_INT32(3); int sub_c = PG_GETARG_INT32(4); int max_d = PG_GETARG_INT32(5); - - PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, ins_c, del_c, sub_c, max_d)); + const char *s_data; + const char *t_data; + int s_bytes, + t_bytes; + + /* Extract a pointer to the actual character data */ + 
s_data = VARDATA_ANY(src); + t_data = VARDATA_ANY(dst); + /* Determine length of each string in bytes and characters */ + s_bytes = VARSIZE_ANY_EXHDR(src); + t_bytes = VARSIZE_ANY_EXHDR(dst); + + PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes, t_data, + t_bytes, ins_c, del_c, + sub_c, max_d)); } @@ -218,8 +238,20 @@ levenshtein_less_equal(PG_FUNCTION_ARGS) text *src = PG_GETARG_TEXT_PP(0); text *dst = PG_GETARG_TEXT_PP(1); int max_d = PG_GETARG_INT32(2); - - PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, 1, 1, 1, max_d)); + const char *s_data; + const char *t_data; + int s_bytes, + t_bytes; + + /* Extract a pointer to the actual character data */ + s_data = VARDATA_ANY(src); + t_data = VARDATA_ANY(dst); + /* Determine length of each string in bytes and characters */ + s_bytes = VARSIZE_ANY_EXHDR(src); + t_bytes = VARSIZE_ANY_EXHDR(dst); + + PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes, t_data, + t_bytes, 1, 1, 1, max_d)); } diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index 7b4391bba1..3ea9bf435a 100644 --- a/src/backend/utils/adt/Makefile +++ b/src/backend/utils/adt/Makefile @@ -38,4 +38,6 @@ OBJS = acl.o arrayfuncs.o array_selfuncs.o array_typanalyze.o \ like.o: like.c like_match.c +varlena.o: varlena.c levenshtein.c + include $(top_srcdir)/src/backend/common.mk diff --git a/contrib/fuzzystrmatch/levenshtein.c b/src/backend/utils/adt/levenshtein.c similarity index 85% rename from contrib/fuzzystrmatch/levenshtein.c rename to src/backend/utils/adt/levenshtein.c index 4f37a54b1e..a8670e9a85 100644 --- a/contrib/fuzzystrmatch/levenshtein.c +++ b/src/backend/utils/adt/levenshtein.c @@ -1,41 +1,34 @@ -/* +/*------------------------------------------------------------------------- + * * levenshtein.c + * Levenshtein distance implementation. 
* - * Functions for "fuzzy" comparison of strings + * Original author: Joe Conway * - * Joe Conway + * This file is included by varlena.c twice, to provide matching code for (1) + * Levenshtein distance with custom costings, and (2) Levenshtein distance with + * custom costings and a "max" value above which exact distances are not + * interesting. Before the inclusion, we rely on the presence of the inline + * function rest_of_char_same(). + * + * Written based on a description of the algorithm by Michael Gilleland found + * at http://www.merriampark.com/ld.htm. Also looked at levenshtein.c in the + * PHP 4.0.6 distribution for inspiration. Configurable penalty costs * extension is introduced by Volkan YAZICI . - */ - -/* - * External declarations for exported functions + * IDENTIFICATION + * src/backend/utils/adt/levenshtein.c + * + *------------------------------------------------------------------------- */ -#ifdef LEVENSHTEIN_LESS_EQUAL -static int levenshtein_less_equal_internal(text *s, text *t, - int ins_c, int del_c, int sub_c, int max_d); -#else -static int levenshtein_internal(text *s, text *t, - int ins_c, int del_c, int sub_c); -#endif - #define MAX_LEVENSHTEIN_STRLEN 255 - /* - * Calculates Levenshtein distance metric between supplied strings. Generally - * (1, 1, 1) penalty costs suffices for common cases, but your mileage may - * vary. + * Calculates Levenshtein distance metric between supplied strings, which are + * not necessarily null-terminated. Generally (1, 1, 1) penalty costs suffice + * for common cases, but your mileage may vary. * * One way to compute Levenshtein distance is to incrementally construct * an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number @@ -63,30 +56,27 @@ static int levenshtein_internal(text *s, text *t, * identify the portion of the matrix close to the diagonal which can still * affect the final answer. 
*/ -static int +int #ifdef LEVENSHTEIN_LESS_EQUAL -levenshtein_less_equal_internal(text *s, text *t, - int ins_c, int del_c, int sub_c, int max_d) +varstr_levenshtein_less_equal(const char *source, int slen, const char *target, + int tlen, int ins_c, int del_c, int sub_c, + int max_d) #else -levenshtein_internal(text *s, text *t, - int ins_c, int del_c, int sub_c) +varstr_levenshtein(const char *source, int slen, const char *target, int tlen, + int ins_c, int del_c, int sub_c) #endif { int m, - n, - s_bytes, - t_bytes; + n; int *prev; int *curr; int *s_char_len = NULL; int i, j; - const char *s_data; - const char *t_data; const char *y; /* - * For levenshtein_less_equal_internal, we have real variables called + * For varstr_levenshtein_less_equal, we have real variables called * start_column and stop_column; otherwise it's just short-hand for 0 and * m. */ @@ -105,15 +95,8 @@ levenshtein_internal(text *s, text *t, #define STOP_COLUMN m #endif - /* Extract a pointer to the actual character data. */ - s_data = VARDATA_ANY(s); - t_data = VARDATA_ANY(t); - - /* Determine length of each string in bytes and characters. */ - s_bytes = VARSIZE_ANY_EXHDR(s); - t_bytes = VARSIZE_ANY_EXHDR(t); - m = pg_mbstrlen_with_len(s_data, s_bytes); - n = pg_mbstrlen_with_len(t_data, t_bytes); + m = pg_mbstrlen_with_len(source, slen); + n = pg_mbstrlen_with_len(target, tlen); /* * We can transform an empty s into t with n insertions, or a non-empty t @@ -193,10 +176,10 @@ levenshtein_internal(text *s, text *t, * multi-byte characters, we still build the array, so that the fast-path * needn't deal with the case where the array hasn't been initialized. 
*/ - if (m != s_bytes || n != t_bytes) + if (m != slen || n != tlen) { int i; - const char *cp = s_data; + const char *cp = source; s_char_len = (int *) palloc((m + 1) * sizeof(int)); for (i = 0; i < m; ++i) @@ -223,11 +206,11 @@ levenshtein_internal(text *s, text *t, prev[i] = i * del_c; /* Loop through rows of the notional array */ - for (y = t_data, j = 1; j < n; j++) + for (y = target, j = 1; j < n; j++) { int *temp; - const char *x = s_data; - int y_char_len = n != t_bytes + 1 ? pg_mblen(y) : 1; + const char *x = source; + int y_char_len = n != tlen + 1 ? pg_mblen(y) : 1; #ifdef LEVENSHTEIN_LESS_EQUAL @@ -384,7 +367,7 @@ levenshtein_internal(text *s, text *t, prev[start_column] = max_d + 1; curr[start_column] = max_d + 1; if (start_column != 0) - s_data += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1; + source += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1; start_column++; } diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index c3171b549a..b3f397e959 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -1546,7 +1546,6 @@ varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid) return result; } - /* text_cmp() * Internal comparison function for text strings. * Returns -1, 0 or 1 @@ -4747,3 +4746,24 @@ text_format_nv(PG_FUNCTION_ARGS) { return text_format(fcinfo); } + +/* + * Helper function for Levenshtein distance functions. Faster than memcmp(), + * for this use case. 
+ */ +static inline bool +rest_of_char_same(const char *s1, const char *s2, int len) +{ + while (len > 0) + { + len--; + if (s1[len] != s2[len]) + return false; + } + return true; +} + +/* Expand each Levenshtein distance variant */ +#include "levenshtein.c" +#define LEVENSHTEIN_LESS_EQUAL +#include "levenshtein.c" diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index 3ba34f88ee..417fd1771a 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -786,6 +786,11 @@ extern Datum textoverlay_no_len(PG_FUNCTION_ARGS); extern Datum name_text(PG_FUNCTION_ARGS); extern Datum text_name(PG_FUNCTION_ARGS); extern int varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid); +extern int varstr_levenshtein(const char *source, int slen, const char *target, + int tlen, int ins_c, int del_c, int sub_c); +extern int varstr_levenshtein_less_equal(const char *source, int slen, + const char *target, int tlen, int ins_c, + int del_c, int sub_c, int max_d); extern List *textToQualifiedNameList(text *textval); extern bool SplitIdentifierString(char *rawstring, char separator, List **namelist);