Teach levenshtein() about multi-byte characters.

author Robert Haas <rhaas@postgresql.org>

Mon, 2 Aug 2010 23:20:23 +0000 (23:20 +0000)

committer Robert Haas <rhaas@postgresql.org>

Mon, 2 Aug 2010 23:20:23 +0000 (23:20 +0000)
author Robert Haas <rhaas@postgresql.org>
Mon, 2 Aug 2010 23:20:23 +0000 (23:20 +0000)
committer Robert Haas <rhaas@postgresql.org>
Mon, 2 Aug 2010 23:20:23 +0000 (23:20 +0000)
diff --git a/contrib/fuzzystrmatch/fuzzystrmatch.c b/contrib/fuzzystrmatch/fuzzystrmatch.c

index 169081591888f67c08ebf242b8e4f6d9063d9229..c752b2dfda1650b6d80a84e2b55424e80134c2fc 100644 (file)
--- a/contrib/fuzzystrmatch/fuzzystrmatch.c
+++ b/contrib/fuzzystrmatch/fuzzystrmatch.c
@@ -5,7 +5,7 @@
   *
   * Joe Conway <mail@joeconway.com>
   *
- * $PostgreSQL: pgsql/contrib/fuzzystrmatch/fuzzystrmatch.c,v 1.33 2010/07/29 20:11:48 rhaas Exp $
+ * $PostgreSQL: pgsql/contrib/fuzzystrmatch/fuzzystrmatch.c,v 1.34 2010/08/02 23:20:23 rhaas Exp $
   * Copyright (c) 2001-2010, PostgreSQL Global Development Group
   * ALL RIGHTS RESERVED;
   *
@@ -50,6 +50,7 @@
  #include <ctype.h>
  
  #include "fmgr.h"
+#include "mb/pg_wchar.h"
  #include "utils/builtins.h"
  
  PG_MODULE_MAGIC;
@@ -183,6 +184,18 @@ getcode(char c)
  /* These prevent GH from becoming F */
  #define NOGHTOF(c)     (getcode(c) & 16)       /* BDH */
  
+/* Returns true iff s1 and s2 match in their first len bytes.  Faster than memcmp(), for this use case. */
+static bool inline
+rest_of_char_same(const char *s1, const char *s2, int len)
+{
+       while (len > 0)
+       {
+               len--;
+               if (s1[len] != s2[len])
+                       return false;
+       }
+       return true;
+}
  
  /*
   * levenshtein_internal - Calculates Levenshtein distance metric
@@ -195,16 +208,27 @@ levenshtein_internal(text *s, text *t,
                                          int ins_c, int del_c, int sub_c)
  {
         int                     m,
-                               n;
+                               n,
+                               s_bytes,
+                               t_bytes;
         int                *prev;
         int                *curr;
+       int                *s_char_len = NULL;
         int                     i,
                                 j;
-       const char *x;
+       const char *s_data;
+       const char *t_data;
         const char *y;
  
-       m = VARSIZE_ANY_EXHDR(s);
-       n = VARSIZE_ANY_EXHDR(t);
+       /* Extract a pointer to the actual character data. */
+       s_data = VARDATA_ANY(s);
+       t_data = VARDATA_ANY(t);
+
+       /* Determine length of each string in bytes and characters. */
+       s_bytes = VARSIZE_ANY_EXHDR(s);
+       t_bytes = VARSIZE_ANY_EXHDR(t);
+       m = pg_mbstrlen_with_len(s_data, s_bytes);
+       n = pg_mbstrlen_with_len(t_data, t_bytes);
  
         /*
          * We can transform an empty s into t with n insertions, or a non-empty t
@@ -226,6 +250,28 @@ levenshtein_internal(text *s, text *t,
                                  errmsg("argument exceeds the maximum length of %d bytes",
                                                 MAX_LEVENSHTEIN_STRLEN)));
  
+       /*
+        * In order to avoid calling pg_mblen() repeatedly on each character in s,
+        * we cache all the lengths before starting the main loop -- but if all the
+        * characters in both strings are single byte, then we skip this and use
+        * a fast-path in the main loop.  If only one string contains multi-byte
+        * characters, we still build the array, so that the fast-path needn't
+        * deal with the case where the array hasn't been initialized.
+        */
+       if (m != s_bytes || n != t_bytes)
+       {
+               int             i;
+               const char *cp = s_data;
+
+               s_char_len = (int *) palloc((m + 1) * sizeof(int));
+               for (i = 0; i < m; ++i)
+               {
+                       s_char_len[i] = pg_mblen(cp);
+                       cp += s_char_len[i];
+               }
+               s_char_len[i] = 0;
+       }
+
         /* One more cell for initialization column and row. */
         ++m;
         ++n;
@@ -244,9 +290,11 @@ levenshtein_internal(text *s, text *t,
                 prev[i] = i * del_c;
  
         /* Loop through rows of the notional array */
-       for (y = VARDATA_ANY(t), j = 1; j < n; y++, j++)
+       for (y = t_data, j = 1; j < n; j++)
         {
                 int                *temp;
+               const char *x = s_data;
+               int                     y_char_len = n != t_bytes + 1 ? pg_mblen(y) : 1;
  
                 /*
                  * First cell must increment sequentially, as we're on the j'th row of
@@ -254,26 +302,77 @@ levenshtein_internal(text *s, text *t,
                  */
                 curr[0] = j * ins_c;
  
-               for (x = VARDATA_ANY(s), i = 1; i < m; x++, i++)
+               /*
+                * This inner loop is critical to performance, so we include a
+                * fast-path to handle the (fairly common) case where no multibyte
+                * characters are in the mix.  The fast-path is entitled to assume
+                * that if s_char_len is not initialized then BOTH strings contain
+                * only single-byte characters.
+                */
+               if (s_char_len != NULL)
                 {
-                       int                     ins;
-                       int                     del;
-                       int                     sub;
-
-                       /* Calculate costs for probable operations. */
-                       ins = prev[i] + ins_c;          /* Insertion    */
-                       del = curr[i - 1] + del_c;      /* Deletion             */
-                       sub = prev[i - 1] + ((*x == *y) ? 0 : sub_c);           /* Substitution */
-
-                       /* Take the one with minimum cost. */
-                       curr[i] = Min(ins, del);
-                       curr[i] = Min(curr[i], sub);
+                       for (i = 1; i < m; i++)
+                       {
+                               int                     ins;
+                               int                     del;
+                               int                     sub;
+                               int                     x_char_len = s_char_len[i - 1];
+
+                               /*
+                                * Calculate costs for insertion, deletion, and substitution.
+                                *
+                                * When calculating cost for substitution, we compare the last
+                                * byte of each possibly-multibyte character first,
+                                * because that's enough to rule out most mis-matches.  If we
+                                * get past that test, then we compare the lengths and the
+                                * remaining bytes.
+                                */
+                               ins = prev[i] + ins_c;
+                               del = curr[i - 1] + del_c;
+                               if (x[x_char_len-1] == y[y_char_len-1]
+                                       && x_char_len == y_char_len &&
+                                       (x_char_len == 1 || rest_of_char_same(x, y, x_char_len)))
+                                       sub = prev[i - 1];
+                               else
+                                       sub = prev[i - 1] + sub_c;
+
+                               /* Take the one with minimum cost. */
+                               curr[i] = Min(ins, del);
+                               curr[i] = Min(curr[i], sub);
+
+                               /* Point to next character. */
+                               x += x_char_len;
+                       }
+               }
+               else
+               {
+                       for (i = 1; i < m; i++)
+                       {
+                               int                     ins;
+                               int                     del;
+                               int                     sub;
+
+                               /* Calculate costs for insertion, deletion, and substitution. */
+                               ins = prev[i] + ins_c;
+                               del = curr[i - 1] + del_c;
+                               sub = prev[i - 1] + ((*x == *y) ? 0 : sub_c);
+
+                               /* Take the one with minimum cost. */
+                               curr[i] = Min(ins, del);
+                               curr[i] = Min(curr[i], sub);
+
+                               /* Point to next character. */
+                               x++;
+                       }
                 }
  
                 /* Swap current row with previous row. */
                 temp = curr;
                 curr = prev;
                 prev = temp;
+
+               /* Point to next character. */
+               y += y_char_len;
         }
  
         /*
diff --git a/doc/src/sgml/fuzzystrmatch.sgml b/doc/src/sgml/fuzzystrmatch.sgml

index e0a8cea9ad872801f2b5d690eac8f21e65ef1281..1b8893697a914d21a17503f98523e4b16dd4c0cd 100644 (file)
--- a/doc/src/sgml/fuzzystrmatch.sgml
+++ b/doc/src/sgml/fuzzystrmatch.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/fuzzystrmatch.sgml,v 1.6 2010/07/29 19:34:40 petere Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/fuzzystrmatch.sgml,v 1.7 2010/08/02 23:20:23 rhaas Exp $ -->
  
  <sect1 id="fuzzystrmatch">
   <title>fuzzystrmatch</title>
@@ -14,8 +14,9 @@
  
   <caution>
    <para>
-   At present, <filename>fuzzystrmatch</> does not work well with
-   multi-byte encodings (such as UTF-8).
+   At present, the <function>soundex</>, <function>metaphone</>,
+   <function>dmetaphone</>, and <function>dmetaphone_alt</> functions do
+   not work well with multi-byte encodings (such as UTF-8).
    </para>
   </caution>
author	Robert Haas <rhaas@postgresql.org>
	Mon, 2 Aug 2010 23:20:23 +0000 (23:20 +0000)
committer	Robert Haas <rhaas@postgresql.org>
	Mon, 2 Aug 2010 23:20:23 +0000 (23:20 +0000)
contrib/fuzzystrmatch/fuzzystrmatch.c		patch \| blob \| history
doc/src/sgml/fuzzystrmatch.sgml		patch \| blob \| history