granicus.if.org Git - postgresql/commitdiff
Remove new coupling between NAMEDATALEN and MAX_LEVENSHTEIN_STRLEN.
author: Tom Lane <tgl@sss.pgh.pa.us>
Fri, 22 Jan 2016 16:53:06 +0000 (11:53 -0500)
committer: Tom Lane <tgl@sss.pgh.pa.us>
Fri, 22 Jan 2016 16:53:06 +0000 (11:53 -0500)
Commit e529cd4ffa605c6f introduced a compile-time assertion
(StaticAssertStmt) requiring NAMEDATALEN to be no greater than
MAX_LEVENSHTEIN_STRLEN, which has been 255 for a long time.
Since up to that instant we had always allowed NAMEDATALEN to be
substantially more than that, this was ill-advised.

It's debatable whether we need MAX_LEVENSHTEIN_STRLEN at all (versus
putting a CHECK_FOR_INTERRUPTS into the loop), or whether it has to be
so tight; but this patch takes the narrower approach of just not applying
the MAX_LEVENSHTEIN_STRLEN limit to calls from the parser.

Trusting the parser for this seems reasonable, first because the strings
are limited to NAMEDATALEN which is unlikely to be hugely more than 256,
and second because the maximum distance is tightly constrained by
MAX_FUZZY_DISTANCE (though we'd forgotten to make use of that limit in one
place).  That means the cost is not really O(mn) but more like O(max(m,n)).

Relaxing the limit for user-supplied calls is left for future research;
given the lack of complaints to date, it doesn't seem very high priority.

In passing, fix confusion between lengths-in-bytes and lengths-in-chars
in comments and error messages.

Per gripe from Kevin Day; solution suggested by Robert Haas.  Back-patch
to 9.5 where the unwanted restriction was introduced.

contrib/fuzzystrmatch/fuzzystrmatch.c
src/backend/parser/parse_relation.c
src/backend/utils/adt/levenshtein.c
src/include/utils/builtins.h

index 92a2f1b92a67d191b0cf0624cb66a89d57ffd805..cbac1f2381fdd067e7b06d430bd3b392e4ebf832 100644 (file)
@@ -171,12 +171,12 @@ levenshtein_with_costs(PG_FUNCTION_ARGS)
        /* Extract a pointer to the actual character data */
        s_data = VARDATA_ANY(src);
        t_data = VARDATA_ANY(dst);
-       /* Determine length of each string in bytes and characters */
+       /* Determine length of each string in bytes */
        s_bytes = VARSIZE_ANY_EXHDR(src);
        t_bytes = VARSIZE_ANY_EXHDR(dst);
 
-       PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes, ins_c,
-                                                                          del_c, sub_c));
+       PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
+                                                                          ins_c, del_c, sub_c, false));
 }
 
 
@@ -194,12 +194,12 @@ levenshtein(PG_FUNCTION_ARGS)
        /* Extract a pointer to the actual character data */
        s_data = VARDATA_ANY(src);
        t_data = VARDATA_ANY(dst);
-       /* Determine length of each string in bytes and characters */
+       /* Determine length of each string in bytes */
        s_bytes = VARSIZE_ANY_EXHDR(src);
        t_bytes = VARSIZE_ANY_EXHDR(dst);
 
-       PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes, 1, 1,
-                                                                          1));
+       PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
+                                                                          1, 1, 1, false));
 }
 
 
@@ -221,13 +221,14 @@ levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS)
        /* Extract a pointer to the actual character data */
        s_data = VARDATA_ANY(src);
        t_data = VARDATA_ANY(dst);
-       /* Determine length of each string in bytes and characters */
+       /* Determine length of each string in bytes */
        s_bytes = VARSIZE_ANY_EXHDR(src);
        t_bytes = VARSIZE_ANY_EXHDR(dst);
 
-       PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes, t_data,
-                                                                                                 t_bytes, ins_c, del_c,
-                                                                                                 sub_c, max_d));
+       PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
+                                                                                                 t_data, t_bytes,
+                                                                                                 ins_c, del_c, sub_c,
+                                                                                                 max_d, false));
 }
 
 
@@ -246,12 +247,14 @@ levenshtein_less_equal(PG_FUNCTION_ARGS)
        /* Extract a pointer to the actual character data */
        s_data = VARDATA_ANY(src);
        t_data = VARDATA_ANY(dst);
-       /* Determine length of each string in bytes and characters */
+       /* Determine length of each string in bytes */
        s_bytes = VARSIZE_ANY_EXHDR(src);
        t_bytes = VARSIZE_ANY_EXHDR(dst);
 
-       PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes, t_data,
-                                                                                                 t_bytes, 1, 1, 1, max_d));
+       PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
+                                                                                                 t_data, t_bytes,
+                                                                                                 1, 1, 1,
+                                                                                                 max_d, false));
 }
 
 
index 632eb29312ebd649e69aedac145ed3efd5732fbf..81332b57d9311c2a04afe2200fe842f179850576 100644 (file)
@@ -550,7 +550,8 @@ updateFuzzyAttrMatchState(int fuzzy_rte_penalty,
                varstr_levenshtein_less_equal(actual, strlen(actual), match, matchlen,
                                                                          1, 1, 1,
                                                                          fuzzystate->distance + 1
-                                                                         - fuzzy_rte_penalty);
+                                                                         - fuzzy_rte_penalty,
+                                                                         true);
 
        /*
         * If more than half the characters are different, don't treat it as a
@@ -843,10 +844,12 @@ searchRangeTableForCol(ParseState *pstate, const char *alias, char *colname,
                         */
                        if (alias != NULL)
                                fuzzy_rte_penalty =
-                                       varstr_levenshtein(alias, strlen(alias),
-                                                                          rte->eref->aliasname,
-                                                                          strlen(rte->eref->aliasname),
-                                                                          1, 1, 1);
+                                       varstr_levenshtein_less_equal(alias, strlen(alias),
+                                                                                                 rte->eref->aliasname,
+                                                                                               strlen(rte->eref->aliasname),
+                                                                                                 1, 1, 1,
+                                                                                                 MAX_FUZZY_DISTANCE + 1,
+                                                                                                 true);
 
                        /*
                         * Scan for a matching column; if we find an exact match, we're
index a499a20df9dd99a829c4583ea50c15f5a47c7691..f40557b847e3f0bb88c1e38dfd874788dff5e3db 100644 (file)
 #define MAX_LEVENSHTEIN_STRLEN         255
 
 /*
- * Calculates Levenshtein distance metric between supplied csrings, which are
- * not necessarily null-terminated.  Generally (1, 1, 1) penalty costs suffices
- * for common cases, but your mileage may vary.
+ * Calculates Levenshtein distance metric between supplied strings, which are
+ * not necessarily null-terminated.
+ *
+ * source: source string, of length slen bytes.
+ * target: target string, of length tlen bytes.
+ * ins_c, del_c, sub_c: costs to charge for character insertion, deletion,
+ *             and substitution respectively; (1, 1, 1) costs suffice for common
+ *             cases, but your mileage may vary.
+ * max_d: if provided and >= 0, maximum distance we care about; see below.
+ * trusted: caller is trusted and need not obey MAX_LEVENSHTEIN_STRLEN.
  *
  * One way to compute Levenshtein distance is to incrementally construct
  * an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number
@@ -43,7 +50,7 @@
  * array.
  *
  * If max_d >= 0, we only need to provide an accurate answer when that answer
- * is less than or equal to the bound.  From any cell in the matrix, there is
+ * is less than or equal to max_d.  From any cell in the matrix, there is
  * theoretical "minimum residual distance" from that cell to the last column
  * of the final row.  This minimum residual distance is zero when the
  * untransformed portions of the strings are of equal length (because we might
  */
 int
 #ifdef LEVENSHTEIN_LESS_EQUAL
-varstr_levenshtein_less_equal(const char *source, int slen, const char *target,
-                                                         int tlen, int ins_c, int del_c, int sub_c,
-                                                         int max_d)
+varstr_levenshtein_less_equal(const char *source, int slen,
+                                                         const char *target, int tlen,
+                                                         int ins_c, int del_c, int sub_c,
+                                                         int max_d, bool trusted)
 #else
-varstr_levenshtein(const char *source, int slen, const char *target, int tlen,
-                                  int ins_c, int del_c, int sub_c)
+varstr_levenshtein(const char *source, int slen,
+                                  const char *target, int tlen,
+                                  int ins_c, int del_c, int sub_c,
+                                  bool trusted)
 #endif
 {
        int                     m,
@@ -95,15 +105,7 @@ varstr_levenshtein(const char *source, int slen, const char *target, int tlen,
 #define STOP_COLUMN m
 #endif
 
-       /*
-        * A common use for Levenshtein distance is to match attributes when
-        * building diagnostic, user-visible messages.  Restrict the size of
-        * MAX_LEVENSHTEIN_STRLEN at compile time so that this is guaranteed to
-        * work.
-        */
-       StaticAssertStmt(NAMEDATALEN <= MAX_LEVENSHTEIN_STRLEN,
-                                        "Levenshtein hinting mechanism restricts NAMEDATALEN");
-
+       /* Convert string lengths (in bytes) to lengths in characters */
        m = pg_mbstrlen_with_len(source, slen);
        n = pg_mbstrlen_with_len(target, tlen);
 
@@ -118,14 +120,18 @@ varstr_levenshtein(const char *source, int slen, const char *target, int tlen,
 
        /*
         * For security concerns, restrict excessive CPU+RAM usage. (This
-        * implementation uses O(m) memory and has O(mn) complexity.)
+        * implementation uses O(m) memory and has O(mn) complexity.)  If
+        * "trusted" is true, caller is responsible for not making excessive
+        * requests, typically by using a small max_d along with strings that are
+        * bounded, though not necessarily to MAX_LEVENSHTEIN_STRLEN exactly.
         */
-       if (m > MAX_LEVENSHTEIN_STRLEN ||
-               n > MAX_LEVENSHTEIN_STRLEN)
+       if (!trusted &&
+               (m > MAX_LEVENSHTEIN_STRLEN ||
+                n > MAX_LEVENSHTEIN_STRLEN))
                ereport(ERROR,
                                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-                                errmsg("argument exceeds the maximum length of %d bytes",
-                                               MAX_LEVENSHTEIN_STRLEN)));
+               errmsg("levenshtein argument exceeds maximum length of %d characters",
+                          MAX_LEVENSHTEIN_STRLEN)));
 
 #ifdef LEVENSHTEIN_LESS_EQUAL
        /* Initialize start and stop columns. */
index 477fde1f81dbbc601aedd46893fff57b2ac3968d..3c134a3aa9688cef8970d75a59ff6b2c09286ddf 100644 (file)
@@ -810,11 +810,14 @@ extern Datum textoverlay_no_len(PG_FUNCTION_ARGS);
 extern Datum name_text(PG_FUNCTION_ARGS);
 extern Datum text_name(PG_FUNCTION_ARGS);
 extern int     varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid);
-extern int varstr_levenshtein(const char *source, int slen, const char *target,
-                                  int tlen, int ins_c, int del_c, int sub_c);
+extern int varstr_levenshtein(const char *source, int slen,
+                                  const char *target, int tlen,
+                                  int ins_c, int del_c, int sub_c,
+                                  bool trusted);
 extern int varstr_levenshtein_less_equal(const char *source, int slen,
-                                                         const char *target, int tlen, int ins_c,
-                                                         int del_c, int sub_c, int max_d);
+                                                         const char *target, int tlen,
+                                                         int ins_c, int del_c, int sub_c,
+                                                         int max_d, bool trusted);
 extern List *textToQualifiedNameList(text *textval);
 extern bool SplitIdentifierString(char *rawstring, char separator,
                                          List **namelist);