Allow empty replacement strings in contrib/unaccent.

author Tom Lane <tgl@sss.pgh.pa.us>

Tue, 1 Jul 2014 00:51:26 +0000 (20:51 -0400)

committer Tom Lane <tgl@sss.pgh.pa.us>

Tue, 1 Jul 2014 00:51:30 +0000 (20:51 -0400)
author Tom Lane <tgl@sss.pgh.pa.us>
Tue, 1 Jul 2014 00:51:26 +0000 (20:51 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Tue, 1 Jul 2014 00:51:30 +0000 (20:51 -0400)
diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c

index a337df61af4f5dd45a9c39dc9543b07602cae75d..5a31f85a132a0227e2e6b7c6783a265f22e4553e 100644 (file)
--- a/contrib/unaccent/unaccent.c
+++ b/contrib/unaccent/unaccent.c
@@ -104,11 +104,21 @@ initTrie(char *filename)
  
                         while ((line = tsearch_readline(&trst)) != NULL)
                         {
-                               /*
-                                * The format of each line must be "src trg" where src and trg
-                                * are sequences of one or more non-whitespace characters,
-                                * separated by whitespace.  Whitespace at start or end of
-                                * line is ignored.
+                               /*----------
+                                * The format of each line must be "src" or "src trg", where
+                                * src and trg are sequences of one or more non-whitespace
+                                * characters, separated by whitespace.  Whitespace at start
+                                * or end of line is ignored.  If trg is omitted, an empty
+                                * string is used as the replacement.
+                                *
+                                * We use a simple state machine, with states
+                                *      0       initial (before src)
+                                *      1       in src
+                                *      2       in whitespace after src
+                                *      3       in trg
+                                *      4       in whitespace after trg
+                                *      -1      syntax error detected (line will be ignored)
+                                *----------
                                  */
                                 int                     state;
                                 char       *ptr;
@@ -160,7 +170,14 @@ initTrie(char *filename)
                                         }
                                 }
  
-                               if (state >= 3)
+                               if (state == 1 || state == 2)
+                               {
+                                       /* trg was omitted, so use "" */
+                                       trg = "";
+                                       trglen = 0;
+                               }
+
+                               if (state > 0)
                                         rootTrie = placeChar(rootTrie,
                                                                                  (unsigned char *) src, srclen,
                                                                                  trg, trglen);
diff --git a/doc/src/sgml/unaccent.sgml b/doc/src/sgml/unaccent.sgml

index af9cad5d8c778659ba2d36421579ad6d6e4cf64a..aef0031dcbcc40073046ab3c09d47ea949818e13 100644 (file)
--- a/doc/src/sgml/unaccent.sgml
+++ b/doc/src/sgml/unaccent.sgml
@@ -45,9 +45,9 @@
    <itemizedlist>
     <listitem>
      <para>
-     Each line represents a pair, consisting of a character with accent
-     followed by a character without accent.  The first is translated into
-     the second.  For example,
+     Each line represents one translation rule, consisting of a character with
+     accent followed by a character without accent.  The first is translated
+     into the second.  For example,
  <programlisting>
  &Agrave;        A
  &Aacute;        A
@@ -57,6 +57,27 @@
  &Aring;        A
  &AElig;        A
  </programlisting>
+     The two characters must be separated by whitespace, and any leading or
+     trailing whitespace on a line is ignored.
+    </para>
+   </listitem>
+
+   <listitem>
+    <para>
+     Alternatively, if only one character is given on a line, instances of
+     that character are deleted; this is useful in languages where accents
+     are represented by separate characters.
+    </para>
+   </listitem>
+
+   <listitem>
+    <para>
+     As with other <productname>PostgreSQL</> text search configuration files,
+     the rules file must be stored in UTF-8 encoding.  The data is
+     automatically translated into the current database's encoding when
+     loaded.  Any lines containing untranslatable characters are silently
+     ignored, so that rules files can contain rules that are not applicable in
+     the current encoding.
      </para>
     </listitem>
    </itemizedlist>
@@ -132,8 +153,8 @@ mydb=# select ts_headline('fr','H&ocirc;tel de la Mer',to_tsquery('fr','Hotels')
  
   <para>
    The <function>unaccent()</> function removes accents (diacritic signs) from
-  a given string.  Basically, it's a wrapper around the
-  <filename>unaccent</> dictionary, but it can be used outside normal
+  a given string.  Basically, it's a wrapper around
+  <filename>unaccent</>-type dictionaries, but it can be used outside normal
    text search contexts.
   </para>
  
@@ -145,6 +166,11 @@ mydb=# select ts_headline('fr','H&ocirc;tel de la Mer',to_tsquery('fr','Hotels')
  unaccent(<optional><replaceable class="PARAMETER">dictionary</replaceable>, </optional> <replaceable class="PARAMETER">string</replaceable>) returns <type>text</type>
  </synopsis>
  
+ <para>
+  If the <replaceable class="PARAMETER">dictionary</replaceable> argument is
+  omitted, <literal>unaccent</> is assumed.
+ </para>
+
   <para>
    For example:
  <programlisting>
author	Tom Lane <tgl@sss.pgh.pa.us>
	Tue, 1 Jul 2014 00:51:26 +0000 (20:51 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Tue, 1 Jul 2014 00:51:30 +0000 (20:51 -0400)
contrib/unaccent/unaccent.c		patch | blob | history
doc/src/sgml/unaccent.sgml		patch | blob | history