Allow multi-character source strings in contrib/unaccent.

author Tom Lane <tgl@sss.pgh.pa.us>

Tue, 1 Jul 2014 01:46:29 +0000 (21:46 -0400)

committer Tom Lane <tgl@sss.pgh.pa.us>

Tue, 1 Jul 2014 01:46:29 +0000 (21:46 -0400)
author Tom Lane <tgl@sss.pgh.pa.us>
Tue, 1 Jul 2014 01:46:29 +0000 (21:46 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Tue, 1 Jul 2014 01:46:29 +0000 (21:46 -0400)
diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c

index 5a31f85a132a0227e2e6b7c6783a265f22e4553e..0101506b4580f328c5a549e38227eba76f412705 100644 (file)
--- a/contrib/unaccent/unaccent.c
+++ b/contrib/unaccent/unaccent.c
@@ -23,9 +23,16 @@
  PG_MODULE_MAGIC;
  
  /*
- * Unaccent dictionary uses a trie to find a character to replace. Each node of
- * the trie is an array of 256 TrieChar structs (n-th element of array
- * corresponds to byte)
+ * An unaccent dictionary uses a trie to find a string to replace.  Each node
+ * of the trie is an array of 256 TrieChar structs; the N-th element of the
+ * array corresponds to next byte value N.  That element can contain both a
+ * replacement string (to be used if the source string ends with this byte)
+ * and a link to another trie node (to be followed if there are more bytes).
+ *
+ * Note that the trie search logic pays no attention to multibyte character
+ * boundaries.  This is OK as long as both the data entered into the trie and
+ * the data we're trying to look up are validly encoded; no partial-character
+ * matches will occur.
   */
  typedef struct TrieChar
  {
@@ -36,34 +43,38 @@ typedef struct TrieChar
  
  /*
   * placeChar - put str into trie's structure, byte by byte.
+ *
+ * If node is NULL, we need to make a new node, which will be returned;
+ * otherwise the return value is the same as node.
   */
  static TrieChar *
-placeChar(TrieChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
+placeChar(TrieChar *node, const unsigned char *str, int lenstr,
+                 const char *replaceTo, int replacelen)
  {
         TrieChar   *curnode;
  
         if (!node)
-       {
-               node = palloc(sizeof(TrieChar) * 256);
-               memset(node, 0, sizeof(TrieChar) * 256);
-       }
+               node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);
+
+       Assert(lenstr > 0);                     /* else str[0] doesn't exist */
  
         curnode = node + *str;
  
-       if (lenstr == 1)
+       if (lenstr <= 1)
         {
                 if (curnode->replaceTo)
-                       elog(WARNING, "duplicate TO argument, use first one");
+                       elog(WARNING, "duplicate source strings, first one will be used");
                 else
                 {
                         curnode->replacelen = replacelen;
-                       curnode->replaceTo = palloc(replacelen);
+                       curnode->replaceTo = (char *) palloc(replacelen);
                         memcpy(curnode->replaceTo, replaceTo, replacelen);
                 }
         }
         else
         {
-               curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1, replaceTo, replacelen);
+               curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
+                                                                         replaceTo, replacelen);
         }
  
         return node;
@@ -213,23 +224,35 @@ initTrie(char *filename)
  }
  
  /*
- * findReplaceTo - find multibyte character in trie
+ * findReplaceTo - find longest possible match in trie
+ *
+ * On success, returns pointer to ending subnode, plus length of matched
+ * source string in *p_matchlen.  On failure, returns NULL.
   */
  static TrieChar *
-findReplaceTo(TrieChar *node, unsigned char *src, int srclen)
+findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
+                         int *p_matchlen)
  {
-       while (node)
+       TrieChar   *result = NULL;
+       int                     matchlen = 0;
+
+       *p_matchlen = 0;                        /* prevent uninitialized-variable warnings */
+
+       while (node && matchlen < srclen)
         {
-               node = node + *src;
-               if (srclen == 1)
-                       return node;
+               node = node + src[matchlen];
+               matchlen++;
+
+               if (node->replaceTo)
+               {
+                       result = node;
+                       *p_matchlen = matchlen;
+               }
  
-               src++;
-               srclen--;
                 node = node->nextChar;
         }
  
-       return NULL;
+       return result;
  }
  
  PG_FUNCTION_INFO_V1(unaccent_init);
@@ -280,18 +303,17 @@ unaccent_lexize(PG_FUNCTION_ARGS)
         TrieChar   *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
         char       *srcchar = (char *) PG_GETARG_POINTER(1);
         int32           len = PG_GETARG_INT32(2);
-       char       *srcstart,
+       char       *srcstart = srcchar,
                            *trgchar = NULL;
-       int                     charlen;
         TSLexeme   *res = NULL;
-       TrieChar   *node;
  
-       srcstart = srcchar;
-       while (srcchar - srcstart < len)
+       while (len > 0)
         {
-               charlen = pg_mblen(srcchar);
+               TrieChar   *node;
+               int                     matchlen;
  
-               node = findReplaceTo(rootTrie, (unsigned char *) srcchar, charlen);
+               node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
+                                                        &matchlen);
                 if (node && node->replaceTo)
                 {
                         if (!res)
@@ -309,13 +331,18 @@ unaccent_lexize(PG_FUNCTION_ARGS)
                         memcpy(trgchar, node->replaceTo, node->replacelen);
                         trgchar += node->replacelen;
                 }
-               else if (res)
+               else
                 {
-                       memcpy(trgchar, srcchar, charlen);
-                       trgchar += charlen;
+                       matchlen = pg_mblen(srcchar);
+                       if (res)
+                       {
+                               memcpy(trgchar, srcchar, matchlen);
+                               trgchar += matchlen;
+                       }
                 }
  
-               srcchar += charlen;
+               srcchar += matchlen;
+               len -= matchlen;
         }
  
         if (res)
diff --git a/doc/src/sgml/unaccent.sgml b/doc/src/sgml/unaccent.sgml

index aef0031dcbcc40073046ab3c09d47ea949818e13..1382fafc5ec3859ae14f8a3e0f26d57c46bbf0de 100644 (file)
--- a/doc/src/sgml/unaccent.sgml
+++ b/doc/src/sgml/unaccent.sgml
@@ -70,6 +70,14 @@
      </para>
     </listitem>
  
+   <listitem>
+    <para>
+     Actually, each <quote>character</> can be any string not containing
+     whitespace, so <filename>unaccent</> dictionaries could be used for
+     other sorts of substring substitutions besides diacritic removal.
+    </para>
+   </listitem>
+
     <listitem>
      <para>
       As with other <productname>PostgreSQL</> text search configuration files,
author	Tom Lane <tgl@sss.pgh.pa.us>
	Tue, 1 Jul 2014 01:46:29 +0000 (21:46 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Tue, 1 Jul 2014 01:46:29 +0000 (21:46 -0400)
contrib/unaccent/unaccent.c		patch \| blob \| history
doc/src/sgml/unaccent.sgml		patch \| blob \| history