]> granicus.if.org Git - postgresql/commitdiff
Bring SIMILAR TO and SUBSTRING into some semblance of conformance with
authorTom Lane <tgl@sss.pgh.pa.us>
Sun, 22 Sep 2002 17:27:25 +0000 (17:27 +0000)
committerTom Lane <tgl@sss.pgh.pa.us>
Sun, 22 Sep 2002 17:27:25 +0000 (17:27 +0000)
the SQL99 standard.  (I'm not sure that the character-class features are
quite right, but that can be fixed later.)  Document SQL99 and POSIX
regexps as being different features; provide variants of SUBSTRING for
each.

doc/src/sgml/func.sgml
src/backend/parser/gram.y
src/backend/utils/adt/regexp.c
src/include/catalog/pg_proc.h
src/include/utils/builtins.h
src/test/regress/expected/strings.out
src/test/regress/sql/strings.sql

index 544d070a85cd05e07bc77cbdad42b34e0de9dc2c..7b12f80fb78ee7198a4086ea740793f02d64cc12 100644 (file)
@@ -1,5 +1,5 @@
 <!--
-$Header: /cvsroot/pgsql/doc/src/sgml/func.sgml,v 1.124 2002/09/21 18:32:53 petere Exp $
+$Header: /cvsroot/pgsql/doc/src/sgml/func.sgml,v 1.125 2002/09/22 17:27:23 tgl Exp $
 PostgreSQL documentation
 -->
 
@@ -921,18 +921,31 @@ PostgreSQL documentation
       </row>
 
       <row>
-       <entry><function>substring</function>(<parameter>string</parameter> <optional>from <replaceable>pattern</replaceable></optional> <optional>for <replaceable>escape</replaceable></optional>)</entry>
+       <entry><function>substring</function>(<parameter>string</parameter> from <replaceable>pattern</replaceable>)</entry>
        <entry><type>text</type></entry>
        <entry>
-        extract regular expression
+        extract substring matching POSIX regular expression
         <indexterm>
          <primary>substring</primary>
         </indexterm>
        </entry>
-       <entry><literal>substring('Thomas' from 'mas$' for <optional>escape '\\'</optional>)</literal></entry>
+       <entry><literal>substring('Thomas' from '...$')</literal></entry>
        <entry><literal>mas</literal></entry>
       </row>
 
+      <row>
+       <entry><function>substring</function>(<parameter>string</parameter> from <replaceable>pattern</replaceable> for <replaceable>escape</replaceable>)</entry>
+       <entry><type>text</type></entry>
+       <entry>
+        extract substring matching SQL99 regular expression
+        <indexterm>
+         <primary>substring</primary>
+        </indexterm>
+       </entry>
+       <entry><literal>substring('Thomas' from '%#"o_a#"_' for '#')</literal></entry>
+       <entry><literal>oma</literal></entry>
+      </row>
+
       <row>
        <entry>
         <function>trim</function>(<optional>leading | trailing | both</optional>
@@ -960,6 +973,328 @@ PostgreSQL documentation
     </tgroup>
    </table>
 
+   <para>
+    Additional string manipulation functions are available and are
+    listed below.  Some of them are used internally to implement the
+    <acronym>SQL</acronym>-standard string functions listed above.
+   </para>
+
+   <table id="functions-string-other">
+    <title>Other String Functions</title>
+    <tgroup cols="5">
+     <thead>
+      <row>
+       <entry>Function</entry>
+       <entry>Return Type</entry>
+       <entry>Description</entry>
+       <entry>Example</entry>
+       <entry>Result</entry>
+      </row>
+     </thead>
+
+     <tbody>
+      <row>
+       <entry><function>ascii</function>(<type>text</type>)</entry>
+       <entry>integer</entry>
+       <entry>Returns the <acronym>ASCII</acronym> code of the first character of the argument.</entry>
+       <entry><literal>ascii('x')</literal></entry>
+       <entry><literal>120</literal></entry>
+      </row>
+
+      <row>
+       <entry><function>btrim</function>(<parameter>string</parameter> <type>text</type>, <parameter>trim</parameter> <type>text</type>)</entry>
+       <entry><type>text</type></entry>
+       <entry>
+        Remove (trim) the longest string consisting only of characters
+        in <parameter>trim</parameter> from the start and end of
+        <parameter>string</parameter>.
+       </entry>
+       <entry><literal>btrim('xyxtrimyyx','xy')</literal></entry>
+       <entry><literal>trim</literal></entry>
+      </row>
+
+      <row>
+       <entry><function>chr</function>(<type>integer</type>)</entry>
+       <entry><type>text</type></entry>
+       <entry>Returns the character with the given <acronym>ASCII</acronym> code.</entry>
+       <entry><literal>chr(65)</literal></entry>
+       <entry><literal>A</literal></entry>
+      </row>
+
+      <row>
+       <entry>
+        <function>convert</function>(<parameter>string</parameter>
+        <type>text</type>,
+        <optional><parameter>src_encoding</parameter> <type>name</type>,</optional>
+        <parameter>dest_encoding</parameter> <type>name</type>)
+       </entry>
+       <entry><type>text</type></entry>
+       <entry>
+        Converts string using <parameter>dest_encoding</parameter>.
+        The original encoding is specified by
+        <parameter>src_encoding</parameter>.  If
+        <parameter>src_encoding</parameter> is omitted, database
+        encoding is assumed.
+       </entry>
+       <entry><literal>convert('text_in_unicode', 'UNICODE', 'LATIN1')</literal></entry>
+       <entry><literal>text_in_unicode</literal> represented in ISO 8859-1</entry>
+      </row>
+
+      <row>
+       <entry>
+        <function>decode</function>(<parameter>string</parameter> <type>text</type>,
+        <parameter>type</parameter> <type>text</type>)
+       </entry>
+       <entry><type>bytea</type></entry>
+       <entry>
+        Decodes binary data from <parameter>string</parameter> previously 
+        encoded with encode().  Parameter type is same as in encode().
+       </entry>
+       <entry><literal>decode('MTIzAAE=', 'base64')</literal></entry>
+       <entry><literal>123\000\001</literal></entry>
+      </row>       
+
+      <row>
+       <entry>
+        <function>encode</function>(<parameter>data</parameter> <type>bytea</type>,
+        <parameter>type</parameter> <type>text</type>)
+       </entry>
+       <entry><type>text</type></entry>
+       <entry>
+        Encodes binary data to <acronym>ASCII</acronym>-only representation.  Supported
+        types are: 'base64', 'hex', 'escape'.
+       </entry>
+       <entry><literal>encode('123\\000\\001', 'base64')</literal></entry>
+       <entry><literal>MTIzAAE=</literal></entry>
+      </row>       
+
+      <row>
+       <entry><function>initcap</function>(<type>text</type>)</entry>
+       <entry><type>text</type></entry>
+       <entry>Converts first letter of each word (whitespace separated) to upper case.</entry>
+       <entry><literal>initcap('hi thomas')</literal></entry>
+       <entry><literal>Hi Thomas</literal></entry>
+      </row>
+
+      <row>
+       <entry><function>length</function>(<parameter>string</parameter>)</entry>
+       <entry><type>integer</type></entry>
+       <entry>
+        length of string
+        <indexterm>
+         <primary>character strings</primary>
+         <secondary>length</secondary>
+        </indexterm>
+        <indexterm>
+         <primary>length</primary>
+         <secondary>character strings</secondary>
+         <see>character strings, length</see>
+        </indexterm>
+       </entry>
+       <entry><literal>length('jose')</literal></entry>
+       <entry><literal>4</literal></entry>
+      </row>
+
+      <row>
+       <entry>
+        <function>lpad</function>(<parameter>string</parameter> <type>text</type>,
+        <parameter>length</parameter> <type>integer</type>
+        <optional>, <parameter>fill</parameter> <type>text</type></optional>)
+       </entry>
+       <entry>text</entry>
+       <entry>
+        Fills up the <parameter>string</parameter> to length
+        <parameter>length</parameter> by prepending the characters
+        <parameter>fill</parameter> (a space by default).  If the
+        <parameter>string</parameter> is already longer than
+        <parameter>length</parameter> then it is truncated (on the
+        right).
+       </entry>
+       <entry><literal>lpad('hi', 5, 'xy')</literal></entry>
+       <entry><literal>xyxhi</literal></entry>
+      </row>
+
+      <row>
+       <entry><function>ltrim</function>(<parameter>string</parameter> <type>text</type>, <parameter>text</parameter> <type>text</type>)</entry>
+       <entry><type>text</type></entry>
+       <entry>
+        Removes the longest string containing only characters from
+        <parameter>trim</parameter> from the start of the string.
+       </entry>
+       <entry><literal>ltrim('zzzytrim','xyz')</literal></entry>
+       <entry><literal>trim</literal></entry>
+      </row>
+
+      <row>
+       <entry><function>pg_client_encoding</function>()</entry>
+       <entry><type>name</type></entry>
+       <entry>
+        Returns current client encoding name.
+       </entry>
+       <entry><literal>pg_client_encoding()</literal></entry>
+       <entry><literal>SQL_ASCII</literal></entry>
+      </row>
+
+      <row>
+       <entry><function>quote_ident</function>(<parameter>string</parameter> text)</entry>
+       <entry><type>text</type></entry>
+       <entry>
+        Returns the given string suitably quoted to be used as an identifier
+       in an SQL query string.
+       Quotes are added only if necessary (i.e., if the string contains
+       non-identifier characters or would be case-folded).
+       Embedded quotes are properly doubled.
+       </entry>
+       <entry><literal>quote_ident('Foo')</literal></entry>
+       <entry><literal>"Foo"</literal></entry>
+      </row>
+
+      <row>
+       <entry><function>quote_literal</function>(<parameter>string</parameter> text)</entry>
+       <entry><type>text</type></entry>
+       <entry>
+        Returns the given string suitably quoted to be used as a literal
+       in an SQL query string.
+       Embedded quotes and backslashes are properly doubled.
+       </entry>
+       <entry><literal>quote_literal('O\'Reilly')</literal></entry>
+       <entry><literal>'O''Reilly'</literal></entry>
+      </row>
+
+      <row>
+       <entry><function>repeat</function>(<type>text</type>, <type>integer</type>)</entry>
+       <entry><type>text</type></entry>
+       <entry>Repeat text a number of times.</entry>
+       <entry><literal>repeat('Pg', 4)</literal></entry>
+       <entry><literal>PgPgPgPg</literal></entry>
+      </row>
+
+      <row>
+       <entry><function>replace</function>(<parameter>string</parameter> <type>text</type>,
+       <parameter>from</parameter> <type>text</type>,
+       <parameter>to</parameter> <type>text</type>)</entry>
+       <entry><type>text</type></entry>
+       <entry>Replace all occurrences in <parameter>string</parameter> of substring
+        <parameter>from</parameter> with substring <parameter>to</parameter>
+       </entry>
+       <entry><literal>replace('abcdefabcdef', 'cd', 'XX')</literal></entry>
+       <entry><literal>abXXefabXXef</literal></entry>
+      </row>
+
+      <row>
+       <entry>
+        <function>rpad</function>(<parameter>string</parameter> <type>text</type>,
+        <parameter>length</parameter> <type>integer</type>
+        <optional>, <parameter>fill</parameter> <type>text</type></optional>)
+       </entry>
+       <entry><type>text</type></entry>
+       <entry>
+        Fills up the <parameter>string</parameter> to length
+        <parameter>length</parameter> by appending the characters
+        <parameter>fill</parameter> (a space by default).  If the
+        <parameter>string</parameter> is already longer than
+        <parameter>length</parameter> then it is truncated.
+       </entry>
+       <entry><literal>rpad('hi', 5, 'xy')</literal></entry>
+       <entry><literal>hixyx</literal></entry>
+      </row>
+
+      <row>
+       <entry><function>rtrim</function>(<parameter>string</parameter>
+        text, <parameter>trim</parameter> text)</entry>
+       <entry><type>text</type></entry>
+       <entry>
+        Removes the longest string containing only characters from
+        <parameter>trim</parameter> from the end of the string.
+       </entry>
+       <entry><literal>rtrim('trimxxxx','x')</literal></entry>
+       <entry><literal>trim</literal></entry>
+      </row>
+
+      <row>
+       <entry><function>split_part</function>(<parameter>string</parameter> <type>text</type>,
+       <parameter>delimiter</parameter> <type>text</type>,
+       <parameter>column</parameter> <type>integer</type>)</entry>
+       <entry><type>text</type></entry>
+       <entry>Split <parameter>string</parameter> on <parameter>delimiter</parameter>
+        returning the resulting (one based) <parameter>column</parameter> number.
+       </entry>
+       <entry><literal>split_part('abc~@~def~@~ghi','~@~',2)</literal></entry>
+       <entry><literal>def</literal></entry>
+      </row>
+
+      <row>
+       <entry><function>strpos</function>(<parameter>string</parameter>, <parameter>substring</parameter>)</entry>
+       <entry><type>text</type></entry>
+       <entry>
+        Locates specified substring. (same as
+        <literal>position(<parameter>substring</parameter> in
+         <parameter>string</parameter>)</literal>, but note the reversed
+        argument order)
+       </entry>
+       <entry><literal>strpos('high','ig')</literal></entry>
+       <entry><literal>2</literal></entry>
+      </row>
+
+      <row>
+       <entry><function>substr</function>(<parameter>string</parameter>, <parameter>from</parameter> <optional>, <parameter>count</parameter></optional>)</entry>
+       <entry><type>text</type></entry>
+       <entry>
+        Extracts specified substring. (same as
+        <literal>substring(<parameter>string</parameter> from <parameter>from</parameter> for <parameter>count</parameter>)</literal>)
+       </entry>
+       <entry><literal>substr('alphabet', 3, 2)</literal></entry>
+       <entry><literal>ph</literal></entry>
+      </row>
+
+      <row>
+       <entry><function>to_ascii</function>(<type>text</type>
+        <optional>, <parameter>encoding</parameter></optional>)</entry>
+       <entry><type>text</type></entry>
+       <entry>Converts text from multibyte encoding to <acronym>ASCII</acronym>.</entry>
+       <entry><literal>to_ascii('Karel')</literal></entry>
+       <entry><literal>Karel</literal></entry>
+      </row>
+
+      <row>
+       <entry><function>to_hex</function>(<parameter>number</parameter> <type>integer</type>
+       or <type>bigint</type>)</entry>
+       <entry><type>text</type></entry>
+       <entry>Convert <parameter>number</parameter> to its equivalent hexadecimal
+        representation.
+       </entry>
+       <entry><literal>to_hex(9223372036854775807::bigint)</literal></entry>
+       <entry><literal>7fffffffffffffff</literal></entry>
+      </row>
+
+      <row>
+       <entry>
+        <function>translate</function>(<parameter>string</parameter>
+        <type>text</type>,
+        <parameter>from</parameter> <type>text</type>,
+        <parameter>to</parameter> <type>text</type>)
+       </entry>
+       <entry><type>text</type></entry>
+       <entry>
+        Any character in <parameter>string</parameter> that matches a
+        character in the <parameter>from</parameter> set is replaced by
+        the corresponding character in the <parameter>to</parameter>
+        set.
+       </entry>
+       <entry><literal>translate('12345', '14', 'ax')</literal></entry>
+       <entry><literal>a23x5</literal></entry>
+      </row>       
+      
+     </tbody>
+    </tgroup>
+   </table>
+
+   <para>
+    The <function>to_ascii</function> function supports conversion from
+    LATIN1, LATIN2, WIN1250 (CP1250) only.
+   </para>
+
+
    <table id="conversion-names">
     <title>Available conversion names</title>
     <tgroup cols="3">
@@ -1579,407 +1914,87 @@ PostgreSQL documentation
       </row>
 
       <row>
-       <entry><literal>win1250_to_utf_8</literal></entry>
-       <entry><literal>WIN1250</literal></entry>
-       <entry><literal>UNICODE</literal></entry>
-      </row>
-
-      <row>
-       <entry><literal>win1251_to_iso_8859_5</literal></entry>
-       <entry><literal>WIN</literal></entry>
-       <entry><literal>ISO_8859_5</literal></entry>
-      </row>
-
-      <row>
-       <entry><literal>win1251_to_koi8r</literal></entry>
-       <entry><literal>WIN</literal></entry>
-       <entry><literal>KOI8</literal></entry>
-      </row>
-
-      <row>
-       <entry><literal>win1251_to_mic</literal></entry>
-       <entry><literal>WIN</literal></entry>
-       <entry><literal>MULE_INTERNAL</literal></entry>
-      </row>
-
-      <row>
-       <entry><literal>win1251_to_utf_8</literal></entry>
-       <entry><literal>WIN</literal></entry>
-       <entry><literal>UNICODE</literal></entry>
-      </row>
-
-      <row>
-       <entry><literal>win1251_to_win866</literal></entry>
-       <entry><literal>WIN</literal></entry>
-       <entry><literal>ALT</literal></entry>
-      </row>
-
-      <row>
-       <entry><literal>win1256_to_utf_8</literal></entry>
-       <entry><literal>WIN1256</literal></entry>
-       <entry><literal>UNICODE</literal></entry>
-      </row>
-
-      <row>
-       <entry><literal>win866_to_iso_8859_5</literal></entry>
-       <entry><literal>ALT</literal></entry>
-       <entry><literal>ISO_8859_5</literal></entry>
-      </row>
-
-      <row>
-       <entry><literal>win866_to_koi8r</literal></entry>
-       <entry><literal>ALT</literal></entry>
-       <entry><literal>KOI8</literal></entry>
-      </row>
-
-      <row>
-       <entry><literal>win866_to_mic</literal></entry>
-       <entry><literal>ALT</literal></entry>
-       <entry><literal>MULE_INTERNAL</literal></entry>
-      </row>
-
-      <row>
-       <entry><literal>win866_to_utf_8</literal></entry>
-       <entry><literal>ALT</literal></entry>
-       <entry><literal>UNICODE</literal></entry>
-      </row>
-
-      <row>
-       <entry><literal>win866_to_win1251</literal></entry>
-       <entry><literal>ALT</literal></entry>
-       <entry><literal>WIN</literal></entry>
-      </row>
-
-      <row>
-       <entry><literal>win874_to_utf_8</literal></entry>
-       <entry><literal>WIN874</literal></entry>
-       <entry><literal>UNICODE</literal></entry>
-      </row>
-
-     </tbody>
-    </tgroup>
-   </table>
-
-   <para>
-    Additional string manipulation functions are available and are
-    listed below.  Some of them are used internally to implement the
-    <acronym>SQL</acronym>-standard string functions listed above.
-   </para>
-
-   <table id="functions-string-other">
-    <title>Other String Functions</title>
-    <tgroup cols="5">
-     <thead>
-      <row>
-       <entry>Function</entry>
-       <entry>Return Type</entry>
-       <entry>Description</entry>
-       <entry>Example</entry>
-       <entry>Result</entry>
-      </row>
-     </thead>
-
-     <tbody>
-      <row>
-       <entry><function>ascii</function>(<type>text</type>)</entry>
-       <entry>integer</entry>
-       <entry>Returns the <acronym>ASCII</acronym> code of the first character of the argument.</entry>
-       <entry><literal>ascii('x')</literal></entry>
-       <entry><literal>120</literal></entry>
-      </row>
-
-      <row>
-       <entry><function>btrim</function>(<parameter>string</parameter> <type>text</type>, <parameter>trim</parameter> <type>text</type>)</entry>
-       <entry><type>text</type></entry>
-       <entry>
-        Remove (trim) the longest string consisting only of characters
-        in <parameter>trim</parameter> from the start and end of
-        <parameter>string</parameter>.
-       </entry>
-       <entry><literal>btrim('xyxtrimyyx','xy')</literal></entry>
-       <entry><literal>trim</literal></entry>
-      </row>
-
-      <row>
-       <entry><function>chr</function>(<type>integer</type>)</entry>
-       <entry><type>text</type></entry>
-       <entry>Returns the character with the given <acronym>ASCII</acronym> code.</entry>
-       <entry><literal>chr(65)</literal></entry>
-       <entry><literal>A</literal></entry>
-      </row>
-
-      <row>
-       <entry>
-        <function>convert</function>(<parameter>string</parameter>
-        <type>text</type>,
-        <optional><parameter>src_encoding</parameter> <type>name</type>,</optional>
-        <parameter>dest_encoding</parameter> <type>name</type>)
-       </entry>
-       <entry><type>text</type></entry>
-       <entry>
-        Converts string using <parameter>dest_encoding</parameter>.
-        The original encoding is specified by
-        <parameter>src_encoding</parameter>.  If
-        <parameter>src_encoding</parameter> is omitted, database
-        encoding is assumed.
-       </entry>
-       <entry><literal>convert('text_in_unicode', 'UNICODE', 'LATIN1')</literal></entry>
-       <entry><literal>text_in_unicode</literal> represented in ISO 8859-1</entry>
-      </row>
-
-      <row>
-       <entry>
-        <function>decode</function>(<parameter>string</parameter> <type>text</type>,
-        <parameter>type</parameter> <type>text</type>)
-       </entry>
-       <entry><type>bytea</type></entry>
-       <entry>
-        Decodes binary data from <parameter>string</parameter> previously 
-        encoded with encode().  Parameter type is same as in encode().
-       </entry>
-       <entry><literal>decode('MTIzAAE=', 'base64')</literal></entry>
-       <entry><literal>123\000\001</literal></entry>
-      </row>       
-
-      <row>
-       <entry>
-        <function>encode</function>(<parameter>data</parameter> <type>bytea</type>,
-        <parameter>type</parameter> <type>text</type>)
-       </entry>
-       <entry><type>text</type></entry>
-       <entry>
-        Encodes binary data to <acronym>ASCII</acronym>-only representation.  Supported
-        types are: 'base64', 'hex', 'escape'.
-       </entry>
-       <entry><literal>encode('123\\000\\001', 'base64')</literal></entry>
-       <entry><literal>MTIzAAE=</literal></entry>
-      </row>       
-
-      <row>
-       <entry><function>initcap</function>(<type>text</type>)</entry>
-       <entry><type>text</type></entry>
-       <entry>Converts first letter of each word (whitespace separated) to upper case.</entry>
-       <entry><literal>initcap('hi thomas')</literal></entry>
-       <entry><literal>Hi Thomas</literal></entry>
-      </row>
-
-      <row>
-       <entry><function>length</function>(<parameter>string</parameter>)</entry>
-       <entry><type>integer</type></entry>
-       <entry>
-        length of string
-        <indexterm>
-         <primary>character strings</primary>
-         <secondary>length</secondary>
-        </indexterm>
-        <indexterm>
-         <primary>length</primary>
-         <secondary>character strings</secondary>
-         <see>character strings, length</see>
-        </indexterm>
-       </entry>
-       <entry><literal>length('jose')</literal></entry>
-       <entry><literal>4</literal></entry>
-      </row>
-
-      <row>
-       <entry>
-        <function>lpad</function>(<parameter>string</parameter> <type>text</type>,
-        <parameter>length</parameter> <type>integer</type>
-        <optional>, <parameter>fill</parameter> <type>text</type></optional>)
-       </entry>
-       <entry>text</entry>
-       <entry>
-        Fills up the <parameter>string</parameter> to length
-        <parameter>length</parameter> by prepending the characters
-        <parameter>fill</parameter> (a space by default).  If the
-        <parameter>string</parameter> is already longer than
-        <parameter>length</parameter> then it is truncated (on the
-        right).
-       </entry>
-       <entry><literal>lpad('hi', 5, 'xy')</literal></entry>
-       <entry><literal>xyxhi</literal></entry>
-      </row>
-
-      <row>
-       <entry><function>ltrim</function>(<parameter>string</parameter> <type>text</type>, <parameter>text</parameter> <type>text</type>)</entry>
-       <entry><type>text</type></entry>
-       <entry>
-        Removes the longest string containing only characters from
-        <parameter>trim</parameter> from the start of the string.
-       </entry>
-       <entry><literal>ltrim('zzzytrim','xyz')</literal></entry>
-       <entry><literal>trim</literal></entry>
+       <entry><literal>win1250_to_utf_8</literal></entry>
+       <entry><literal>WIN1250</literal></entry>
+       <entry><literal>UNICODE</literal></entry>
       </row>
 
       <row>
-       <entry><function>pg_client_encoding</function>()</entry>
-       <entry><type>name</type></entry>
-       <entry>
-        Returns current client encoding name.
-       </entry>
-       <entry><literal>pg_client_encoding()</literal></entry>
-       <entry><literal>SQL_ASCII</literal></entry>
+       <entry><literal>win1251_to_iso_8859_5</literal></entry>
+       <entry><literal>WIN</literal></entry>
+       <entry><literal>ISO_8859_5</literal></entry>
       </row>
 
       <row>
-       <entry><function>quote_ident</function>(<parameter>string</parameter> text)</entry>
-       <entry><type>text</type></entry>
-       <entry>
-        Returns the given string suitably quoted to be used as an identifier
-       in an SQL query string.
-       Quotes are added only if necessary (i.e., if the string contains
-       non-identifier characters or would be case-folded).
-       Embedded quotes are properly doubled.
-       </entry>
-       <entry><literal>quote_ident('Foo')</literal></entry>
-       <entry><literal>"Foo"</literal></entry>
+       <entry><literal>win1251_to_koi8r</literal></entry>
+       <entry><literal>WIN</literal></entry>
+       <entry><literal>KOI8</literal></entry>
       </row>
 
       <row>
-       <entry><function>quote_literal</function>(<parameter>string</parameter> text)</entry>
-       <entry><type>text</type></entry>
-       <entry>
-        Returns the given string suitably quoted to be used as a literal
-       in an SQL query string.
-       Embedded quotes and backslashes are properly doubled.
-       </entry>
-       <entry><literal>quote_literal('O\'Reilly')</literal></entry>
-       <entry><literal>'O''Reilly'</literal></entry>
+       <entry><literal>win1251_to_mic</literal></entry>
+       <entry><literal>WIN</literal></entry>
+       <entry><literal>MULE_INTERNAL</literal></entry>
       </row>
 
       <row>
-       <entry><function>repeat</function>(<type>text</type>, <type>integer</type>)</entry>
-       <entry><type>text</type></entry>
-       <entry>Repeat text a number of times.</entry>
-       <entry><literal>repeat('Pg', 4)</literal></entry>
-       <entry><literal>PgPgPgPg</literal></entry>
+       <entry><literal>win1251_to_utf_8</literal></entry>
+       <entry><literal>WIN</literal></entry>
+       <entry><literal>UNICODE</literal></entry>
       </row>
 
       <row>
-       <entry><function>replace</function>(<parameter>string</parameter> <type>text</type>,
-       <parameter>from</parameter> <type>text</type>,
-       <parameter>to</parameter> <type>text</type>)</entry>
-       <entry><type>text</type></entry>
-       <entry>Replace all occurrences in <parameter>string</parameter> of substring
-        <parameter>from</parameter> with substring <parameter>to</parameter>
-       </entry>
-       <entry><literal>replace('abcdefabcdef', 'cd', 'XX')</literal></entry>
-       <entry><literal>abXXefabXXef</literal></entry>
+       <entry><literal>win1251_to_win866</literal></entry>
+       <entry><literal>WIN</literal></entry>
+       <entry><literal>ALT</literal></entry>
       </row>
 
       <row>
-       <entry>
-        <function>rpad</function>(<parameter>string</parameter> <type>text</type>,
-        <parameter>length</parameter> <type>integer</type>
-        <optional>, <parameter>fill</parameter> <type>text</type></optional>)
-       </entry>
-       <entry><type>text</type></entry>
-       <entry>
-        Fills up the <parameter>string</parameter> to length
-        <parameter>length</parameter> by appending the characters
-        <parameter>fill</parameter> (a space by default).  If the
-        <parameter>string</parameter> is already longer than
-        <parameter>length</parameter> then it is truncated.
-       </entry>
-       <entry><literal>rpad('hi', 5, 'xy')</literal></entry>
-       <entry><literal>hixyx</literal></entry>
+       <entry><literal>win1256_to_utf_8</literal></entry>
+       <entry><literal>WIN1256</literal></entry>
+       <entry><literal>UNICODE</literal></entry>
       </row>
 
       <row>
-       <entry><function>rtrim</function>(<parameter>string</parameter>
-        text, <parameter>trim</parameter> text)</entry>
-       <entry><type>text</type></entry>
-       <entry>
-        Removes the longest string containing only characters from
-        <parameter>trim</parameter> from the end of the string.
-       </entry>
-       <entry><literal>rtrim('trimxxxx','x')</literal></entry>
-       <entry><literal>trim</literal></entry>
+       <entry><literal>win866_to_iso_8859_5</literal></entry>
+       <entry><literal>ALT</literal></entry>
+       <entry><literal>ISO_8859_5</literal></entry>
       </row>
 
       <row>
-       <entry><function>split_part</function>(<parameter>string</parameter> <type>text</type>,
-       <parameter>delimiter</parameter> <type>text</type>,
-       <parameter>column</parameter> <type>integer</type>)</entry>
-       <entry><type>text</type></entry>
-       <entry>Split <parameter>string</parameter> on <parameter>delimiter</parameter>
-        returning the resulting (one based) <parameter>column</parameter> number.
-       </entry>
-       <entry><literal>split_part('abc~@~def~@~ghi','~@~',2)</literal></entry>
-       <entry><literal>def</literal></entry>
+       <entry><literal>win866_to_koi8r</literal></entry>
+       <entry><literal>ALT</literal></entry>
+       <entry><literal>KOI8</literal></entry>
       </row>
 
       <row>
-       <entry><function>strpos</function>(<parameter>string</parameter>, <parameter>substring</parameter>)</entry>
-       <entry><type>text</type></entry>
-       <entry>
-        Locates specified substring. (same as
-        <literal>position(<parameter>substring</parameter> in
-         <parameter>string</parameter>)</literal>, but note the reversed
-        argument order)
-       </entry>
-       <entry><literal>strpos('high','ig')</literal></entry>
-       <entry><literal>2</literal></entry>
+       <entry><literal>win866_to_mic</literal></entry>
+       <entry><literal>ALT</literal></entry>
+       <entry><literal>MULE_INTERNAL</literal></entry>
       </row>
 
       <row>
-       <entry><function>substr</function>(<parameter>string</parameter>, <parameter>from</parameter> <optional>, <parameter>count</parameter></optional>)</entry>
-       <entry><type>text</type></entry>
-       <entry>
-        Extracts specified substring. (same as
-        <literal>substring(<parameter>string</parameter> from <parameter>from</parameter> for <parameter>count</parameter>)</literal>)
-       </entry>
-       <entry><literal>substr('alphabet', 3, 2)</literal></entry>
-       <entry><literal>ph</literal></entry>
+       <entry><literal>win866_to_utf_8</literal></entry>
+       <entry><literal>ALT</literal></entry>
+       <entry><literal>UNICODE</literal></entry>
       </row>
 
       <row>
-       <entry><function>to_ascii</function>(<type>text</type>
-        <optional>, <parameter>encoding</parameter></optional>)</entry>
-       <entry><type>text</type></entry>
-       <entry>Converts text from multibyte encoding to <acronym>ASCII</acronym>.</entry>
-       <entry><literal>to_ascii('Karel')</literal></entry>
-       <entry><literal>Karel</literal></entry>
+       <entry><literal>win866_to_win1251</literal></entry>
+       <entry><literal>ALT</literal></entry>
+       <entry><literal>WIN</literal></entry>
       </row>
 
       <row>
-       <entry><function>to_hex</function>(<parameter>number</parameter> <type>integer</type>
-       or <type>bigint</type>)</entry>
-       <entry><type>text</type></entry>
-       <entry>Convert <parameter>number</parameter> to its equivalent hexadecimal
-        representation.
-       </entry>
-       <entry><literal>to_hex(9223372036854775807::bigint)</literal></entry>
-       <entry><literal>7fffffffffffffff</literal></entry>
+       <entry><literal>win874_to_utf_8</literal></entry>
+       <entry><literal>WIN874</literal></entry>
+       <entry><literal>UNICODE</literal></entry>
       </row>
 
-      <row>
-       <entry>
-        <function>translate</function>(<parameter>string</parameter>
-        <type>text</type>,
-        <parameter>from</parameter> <type>text</type>,
-        <parameter>to</parameter> <type>text</type>)
-       </entry>
-       <entry><type>text</type></entry>
-       <entry>
-        Any character in <parameter>string</parameter> that matches a
-        character in the <parameter>from</parameter> set is replaced by
-        the corresponding character in the <parameter>to</parameter>
-        set.
-       </entry>
-       <entry><literal>translate('12345', '14', 'ax')</literal></entry>
-       <entry><literal>a23x5</literal></entry>
-      </row>       
-      
      </tbody>
     </tgroup>
    </table>
 
-   <para>
-    The <function>to_ascii</function> function supports conversion from
-    LATIN1, LATIN2, WIN1250 (CP1250) only.
-   </para>
   </sect1>
 
 
@@ -2171,16 +2186,16 @@ PostgreSQL documentation
   <title>Pattern Matching</title>
 
    <para>
-    There are two separate approaches to pattern matching provided by
+    There are three separate approaches to pattern matching provided by
     <productname>PostgreSQL</productname>:  the traditional
     <acronym>SQL</acronym> 
-    <function>LIKE</function> operator and the more recent
+    <function>LIKE</function> operator, the more recent
     <acronym>SQL99</acronym> 
-    <function>SIMILAR TO</function> operator implementing
+    <function>SIMILAR TO</function> operator, and
     <acronym>POSIX</acronym>-style regular expressions.
     Additionally, a pattern matching function,
-    <function>SUBSTRING</function>, is available, as defined in
-    <acronym>SQL99</acronym>.
+    <function>SUBSTRING</function>, is available, using either
+    <acronym>SQL99</acronym>-style or POSIX-style regular expressions.
    </para>
 
    <tip>
@@ -2190,13 +2205,6 @@ PostgreSQL documentation
     </para>
    </tip>
 
-   <para>
-    Both <function>LIKE</function> and <function>SIMILAR TO</function>
-    are SQL-standard operators which are also available in alternate
-    forms as <productname>PostgreSQL</productname> operators; look at
-    <literal>~</literal> and <literal>~~</literal> for examples.
-   </para>
-
   <sect2 id="functions-like">
    <title><function>LIKE</function></title>
 
@@ -2296,11 +2304,142 @@ PostgreSQL documentation
   </sect2>
 
 
-  <sect2 id="functions-regexp">
-   <title><function>SIMILAR TO</function> and <acronym>POSIX</acronym>
+  <sect2 id="functions-sql99-regexp">
+   <title><function>SIMILAR TO</function> and <acronym>SQL99</acronym>
      Regular Expressions</title>
 
-   <indexterm zone="functions-regexp">
+   <indexterm zone="functions-sql99-regexp">
+    <primary>regular expressions</primary>
+    <seealso>pattern matching</seealso>
+   </indexterm>
+
+   <indexterm>
+    <primary>similar to</primary>
+   </indexterm>
+
+   <indexterm>
+    <primary>substring</primary>
+   </indexterm>
+
+    <synopsis>
+<replaceable>string</replaceable> SIMILAR TO <replaceable>pattern</replaceable> <optional>ESCAPE <replaceable>escape-character</replaceable></optional>
+<replaceable>string</replaceable> NOT SIMILAR TO <replaceable>pattern</replaceable> <optional>ESCAPE <replaceable>escape-character</replaceable></optional>
+    </synopsis>
+
+    <para>
+     The <function>SIMILAR TO</function> operator returns true or false
+     depending on whether its pattern matches the given string.  It is
+     much like <function>LIKE</function>, except that it interprets the
+     pattern using <acronym>SQL99</acronym>'s definition of a regular
+     expression.
+     <acronym>SQL99</acronym>'s regular expressions are a curious cross
+     between <function>LIKE</function> notation and common regular expression
+     notation.
+    </para>
+
+    <para>
+     Like <function>LIKE</function>, the  <function>SIMILAR TO</function>
+     operator succeeds only if its pattern matches the entire string;
+     this is unlike common regular expression practice, wherein the pattern
+     may match any part of the string.
+     Also like
+     <function>LIKE</function>, <function>SIMILAR TO</function> uses
+     <literal>%</> and <literal>_</> as wildcard characters denoting
+     any string and any single character, respectively (these are
+     comparable to <literal>.*</> and <literal>.</> in POSIX regular
+     expressions).
+    </para>
+
+    <para>
+     In addition to these facilities borrowed from <function>LIKE</function>,
+     <function>SIMILAR TO</function> supports these pattern-matching
+     metacharacters borrowed from POSIX regular expressions:
+
+    <itemizedlist>
+     <listitem>
+      <para>
+       <literal>|</literal> denotes alternation (either of two alternatives).
+      </para>
+     </listitem>
+     <listitem>
+      <para>
+       <literal>*</literal> denotes repetition of the previous item zero
+       or more times.
+      </para>
+     </listitem>
+     <listitem>
+      <para>
+       <literal>+</literal> denotes repetition of the previous item one
+       or more times.
+      </para>
+     </listitem>
+     <listitem>
+      <para>
+       Parentheses <literal>()</literal> may be used to group items into
+       a single logical item.
+      </para>
+     </listitem>
+     <listitem>
+      <para>
+       A bracket expression <literal>[...]</literal> specifies a character
+       class, just as in POSIX regular expressions.
+      </para>
+     </listitem>
+    </itemizedlist>
+
+     Notice that bounded repetition (<literal>?</> and <literal>{...}</>)
+     are not provided, though they exist in POSIX.  Also, dot (<literal>.</>)
+     is not a metacharacter.
+    </para>
+
+    <para>
+     As with <function>LIKE</>, a backslash disables the special meaning
+     of any of these metacharacters; or a different escape character can
+     be specified with <literal>ESCAPE</>.
+    </para>
+
+    <informalexample>
+     <para>
+      Some examples:
+      <programlisting>
+'abc' SIMILAR TO 'abc'      <lineannotation>true</lineannotation>
+'abc' SIMILAR TO 'a'        <lineannotation>false</lineannotation>
+'abc' SIMILAR TO '%(b|d)%'  <lineannotation>true</lineannotation>
+'abc' SIMILAR TO '(b|c)%'   <lineannotation>false</lineannotation>
+      </programlisting>
+     </para>
+    </informalexample>
+
+    <para>
+     The SUBSTRING function with three parameters,
+     <function>SUBSTRING</function>(<parameter>string</parameter> FROM
+     <replaceable>pattern</replaceable> FOR
+     <replaceable>escape</replaceable>), provides extraction of a substring
+     that matches a SQL99 regular expression pattern.  As with SIMILAR TO,
+     the specified pattern must match to the entire data string, else the
+     function fails and returns NULL.  To indicate the part of the pattern
+     that should be returned on success, SQL99 specifies that the pattern
+     must contain two occurrences of the escape character followed by
+     double quote (<literal>"</>).  The text matching the portion of the
+     pattern between these markers is returned.
+    </para>
+
+    <informalexample>
+     <para>
+      Some examples:
+      <programlisting>
+SUBSTRING('foobar' FROM '%#"o_b#"%' FOR '#')   <lineannotation>oob</lineannotation>
+SUBSTRING('foobar' FROM '#"o_b#"%' FOR '#')    <lineannotation>NULL</lineannotation>
+      </programlisting>
+     </para>
+    </informalexample>
+
+  </sect2>
+
+  <sect2 id="functions-posix-regexp">
+   <title><acronym>POSIX</acronym> Regular Expressions</title>
+
+   <indexterm zone="functions-posix-regexp">
     <primary>regular expressions</primary>
     <seealso>pattern matching</seealso>
    </indexterm>
@@ -2341,12 +2480,6 @@ PostgreSQL documentation
         <entry>Does not match regular expression, case insensitive</entry>
         <entry><literal>'thomas' !~* '.*vadim.*'</literal></entry>
        </row>
-
-       <row>
-        <entry> <literal>SIMILAR TO</literal> </entry>
-        <entry>Matches regular expression, case sensitive</entry>
-        <entry><literal>'thomas' SIMILAR TO '.*thomas.*'</literal></entry>
-       </row>
       </tbody>
      </tgroup>
     </table>
@@ -2354,7 +2487,8 @@ PostgreSQL documentation
     <para>
      <acronym>POSIX</acronym> regular expressions provide a more
      powerful means for 
-     pattern matching than the <function>LIKE</function> function.
+     pattern matching than the <function>LIKE</function> and
+     <function>SIMILAR TO</> operators.
      Many Unix tools such as <command>egrep</command>,
      <command>sed</command>, or <command>awk</command> use a pattern
      matching language that is similar to the one described here.
@@ -2379,10 +2513,34 @@ PostgreSQL documentation
      <para>
       Some examples:
       <programlisting>
-'abc' SIMILAR TO 'abc'    <lineannotation>true</lineannotation>
-'abc' SIMILAR TO '^a'     <lineannotation>true</lineannotation>
-'abc' SIMILAR TO '(b|d)'  <lineannotation>true</lineannotation>
-'abc' SIMILAR TO '^(b|c)' <lineannotation>false</lineannotation>
+'abc' ~ 'abc'    <lineannotation>true</lineannotation>
+'abc' ~ '^a'     <lineannotation>true</lineannotation>
+'abc' ~ '(b|d)'  <lineannotation>true</lineannotation>
+'abc' ~ '^(b|c)' <lineannotation>false</lineannotation>
+      </programlisting>
+     </para>
+    </informalexample>
+
+    <para>
+     The SUBSTRING function with two parameters,
+     <function>SUBSTRING</function>(<parameter>string</parameter> FROM
+     <replaceable>pattern</replaceable>), provides extraction of a substring
+     that matches a POSIX regular expression pattern.  It returns NULL if
+     there is no match, otherwise the portion of the text that matched the
+     pattern.  But if the pattern contains any parentheses, the portion
+     of the text that matched the first parenthesized subexpression (the
+     one whose left parenthesis comes first) is
+     returned.  You can always put parentheses around the whole expression
+     if you want to use parentheses within it without triggering this
+     exception.
+    </para>
+
+    <informalexample>
+     <para>
+      Some examples:
+      <programlisting>
+SUBSTRING('foobar' FROM 'o.b')     <lineannotation>oob</lineannotation>
+SUBSTRING('foobar' FROM 'o(.)b')   <lineannotation>o</lineannotation>
       </programlisting>
      </para>
     </informalexample>
index 43597306d4410a67c520a26a2e02af25e2f579f1..be45d7bde162433a8ca6321abf51e2075938ebaf 100644 (file)
@@ -11,7 +11,7 @@
  *
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/parser/gram.y,v 2.367 2002/09/18 21:35:21 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/parser/gram.y,v 2.368 2002/09/22 17:27:23 tgl Exp $
  *
  * HISTORY
  *       AUTHOR                        DATE                    MAJOR EVENT
@@ -5644,22 +5644,40 @@ a_expr:         c_expr                                                                  { $$ = $1; }
                                }
 
                        | a_expr SIMILAR TO a_expr                              %prec SIMILAR
-                               { $$ = (Node *) makeSimpleA_Expr(OP, "~", $1, $4); }
+                               {
+                                       A_Const *c = makeNode(A_Const);
+                                       FuncCall *n = makeNode(FuncCall);
+                                       c->val.type = T_Null;
+                                       n->funcname = SystemFuncName("similar_escape");
+                                       n->args = makeList2($4, (Node *) c);
+                                       n->agg_star = FALSE;
+                                       n->agg_distinct = FALSE;
+                                       $$ = (Node *) makeSimpleA_Expr(OP, "~", $1, (Node *) n);
+                               }
                        | a_expr SIMILAR TO a_expr ESCAPE a_expr
                                {
                                        FuncCall *n = makeNode(FuncCall);
-                                       n->funcname = SystemFuncName("like_escape");
+                                       n->funcname = SystemFuncName("similar_escape");
                                        n->args = makeList2($4, $6);
                                        n->agg_star = FALSE;
                                        n->agg_distinct = FALSE;
                                        $$ = (Node *) makeSimpleA_Expr(OP, "~", $1, (Node *) n);
                                }
                        | a_expr NOT SIMILAR TO a_expr                  %prec SIMILAR
-                               { $$ = (Node *) makeSimpleA_Expr(OP, "!~", $1, $5); }
+                               {
+                                       A_Const *c = makeNode(A_Const);
+                                       FuncCall *n = makeNode(FuncCall);
+                                       c->val.type = T_Null;
+                                       n->funcname = SystemFuncName("similar_escape");
+                                       n->args = makeList2($5, (Node *) c);
+                                       n->agg_star = FALSE;
+                                       n->agg_distinct = FALSE;
+                                       $$ = (Node *) makeSimpleA_Expr(OP, "!~", $1, (Node *) n);
+                               }
                        | a_expr NOT SIMILAR TO a_expr ESCAPE a_expr
                                {
                                        FuncCall *n = makeNode(FuncCall);
-                                       n->funcname = SystemFuncName("like_escape");
+                                       n->funcname = SystemFuncName("similar_escape");
                                        n->args = makeList2($5, $7);
                                        n->agg_star = FALSE;
                                        n->agg_distinct = FALSE;
index b64d6ede65a8a870708ff919460b1bf45decb391..ebbca8f04017578db10adf96845d7d2a99e6b5ec 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v 1.42 2002/09/04 20:31:28 momjian Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v 1.43 2002/09/22 17:27:23 tgl Exp $
  *
  *             Alistair Crooks added the code for the regex caching
  *             agc - cached the regular expressions used - there's a good chance
@@ -317,8 +317,7 @@ textregexsubstr(PG_FUNCTION_ARGS)
        char       *sterm;
        int                     len;
        bool            match;
-       int                     nmatch = 1;
-       regmatch_t      pmatch;
+       regmatch_t      pmatch[2];
 
        /* be sure sterm is null-terminated */
        len = VARSIZE(s) - VARHDRSZ;
@@ -327,21 +326,131 @@ textregexsubstr(PG_FUNCTION_ARGS)
        sterm[len] = '\0';
 
        /*
-        * We need the match info back from the pattern match to be able to
-        * actually extract the substring. It seems to be adequate to pass in
-        * a structure to return only one result.
+        * We pass two regmatch_t structs to get info about the overall match
+        * and the match for the first parenthesized subexpression (if any).
+        * If there is a parenthesized subexpression, we return what it matched;
+        * else return what the whole regexp matched.
         */
-       match = RE_compile_and_execute(p, sterm, REG_EXTENDED, nmatch, &pmatch);
+       match = RE_compile_and_execute(p, sterm, REG_EXTENDED, 2, pmatch);
+
        pfree(sterm);
 
        /* match? then return the substring matching the pattern */
        if (match)
        {
+               int             so,
+                               eo;
+
+               so = pmatch[1].rm_so;
+               eo = pmatch[1].rm_eo;
+               if (so < 0 || eo < 0)
+               {
+                       /* no parenthesized subexpression */
+                       so = pmatch[0].rm_so;
+                       eo = pmatch[0].rm_eo;
+               }
+
                return (DirectFunctionCall3(text_substr,
                                                                        PointerGetDatum(s),
-                                                                       Int32GetDatum(pmatch.rm_so + 1),
-                                                       Int32GetDatum(pmatch.rm_eo - pmatch.rm_so)));
+                                                                       Int32GetDatum(so + 1),
+                                                                       Int32GetDatum(eo - so)));
        }
 
        PG_RETURN_NULL();
 }
+
+/* similar_escape()
+ * Convert a SQL99 regexp pattern to POSIX style, so it can be used by
+ * our regexp engine.
+ */
+Datum
+similar_escape(PG_FUNCTION_ARGS)
+{
+       text       *pat_text;
+       text       *esc_text;
+       text       *result;
+       unsigned char *p,
+                          *e,
+                          *r;
+       int                     plen,
+                               elen;
+       bool            afterescape = false;
+       int                     nquotes = 0;
+
+       /* This function is not strict, so must test explicitly */
+       if (PG_ARGISNULL(0))
+               PG_RETURN_NULL();
+       pat_text = PG_GETARG_TEXT_P(0);
+       p = VARDATA(pat_text);
+       plen = (VARSIZE(pat_text) - VARHDRSZ);
+       if (PG_ARGISNULL(1))
+       {
+               /* No ESCAPE clause provided; default to backslash as escape */
+               e = "\\";
+               elen = 1;
+       }
+       else
+       {
+               esc_text = PG_GETARG_TEXT_P(1);
+               e = VARDATA(esc_text);
+               elen = (VARSIZE(esc_text) - VARHDRSZ);
+               if (elen == 0)
+                       e = NULL;                       /* no escape character */
+               else if (elen != 1)
+                       elog(ERROR, "ESCAPE string must be empty or one character");
+       }
+
+       /* We need room for ^, $, and up to 2 output bytes per input byte */
+       result = (text *) palloc(VARHDRSZ + 2 + 2 * plen);
+       r = VARDATA(result);
+
+       *r++ = '^';
+
+       while (plen > 0)
+       {
+               unsigned char pchar = *p;
+
+               if (afterescape)
+               {
+                       if (pchar == '"')       /* for SUBSTRING patterns */
+                               *r++ = ((nquotes++ % 2) == 0) ? '(' : ')';
+                       else
+                       {
+                               *r++ = '\\';
+                               *r++ = pchar;
+                       }
+                       afterescape = false;
+               }
+               else if (e && pchar == *e)
+               {
+                       /* SQL99 escape character; do not send to output */
+                       afterescape = true;
+               }
+               else if (pchar == '%')
+               {
+                       *r++ = '.';
+                       *r++ = '*';
+               }
+               else if (pchar == '_')
+               {
+                       *r++ = '.';
+               }
+               else if (pchar == '\\' || pchar == '.' || pchar == '?' ||
+                                pchar == '{')
+               {
+                       *r++ = '\\';
+                       *r++ = pchar;
+               }
+               else
+               {
+                       *r++ = pchar;
+               }
+               p++, plen--;
+       }
+
+       *r++ = '$';
+       
+       VARATT_SIZEP(result) = r - ((unsigned char *) result);
+
+       PG_RETURN_TEXT_P(result);
+}
index eb44f283b9197812b80de05070fb6692b66c3657..369da463e8a47cbe769e460a24a116b2bef63591 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: pg_proc.h,v 1.272 2002/09/18 21:35:23 tgl Exp $
+ * $Id: pg_proc.h,v 1.273 2002/09/22 17:27:23 tgl Exp $
  *
  * NOTES
  *       The script catalog/genbki.sh reads this file and generates .bki
@@ -2076,6 +2076,9 @@ DESCR("convert int4 to char");
 DATA(insert OID = 1622 (  repeat                       PGNSP PGUID 12 f f t f i 2 25 "25 23"  repeat - _null_ ));
 DESCR("replicate string int4 times");
 
+DATA(insert OID = 1623 (  similar_escape       PGNSP PGUID 12 f f f f i 2 25 "25 25" similar_escape - _null_ ));
+DESCR("convert SQL99 regexp pattern to POSIX style");
+
 DATA(insert OID = 1624 (  mul_d_interval       PGNSP PGUID 12 f f t f i 2 1186 "701 1186"      mul_d_interval - _null_ ));
 
 DATA(insert OID = 1633 (  texticlike           PGNSP PGUID 12 f f t f i 2 16 "25 25" texticlike - _null_ ));
@@ -2087,7 +2090,7 @@ DESCR("matches LIKE expression, case-insensitive");
 DATA(insert OID = 1636 (  nameicnlike          PGNSP PGUID 12 f f t f i 2 16 "19 25"  nameicnlike - _null_ ));
 DESCR("does not match LIKE expression, case-insensitive");
 DATA(insert OID = 1637 (  like_escape          PGNSP PGUID 12 f f t f i 2 25 "25 25" like_escape - _null_ ));
-DESCR("convert match pattern to use backslash escapes");
+DESCR("convert LIKE pattern to use backslash escapes");
 
 DATA(insert OID = 1689 (  update_pg_pwd_and_pg_group  PGNSP PGUID 12 f f t f v 0 2279  ""      update_pg_pwd_and_pg_group - _null_ ));
 DESCR("update pg_pwd and pg_group files");
@@ -2784,7 +2787,7 @@ DESCR("matches LIKE expression");
 DATA(insert OID = 2008 (  notlike                 PGNSP PGUID 12 f f t f i 2 16 "17 17"        byteanlike - _null_ ));
 DESCR("does not match LIKE expression");
 DATA(insert OID = 2009 (  like_escape     PGNSP PGUID 12 f f t f i 2 17 "17 17" like_escape_bytea - _null_ ));
-DESCR("convert match pattern to use backslash escapes");
+DESCR("convert LIKE pattern to use backslash escapes");
 DATA(insert OID = 2010 (  length                  PGNSP PGUID 12 f f t f i 1 23 "17"  byteaoctetlen - _null_ ));
 DESCR("octet length");
 DATA(insert OID = 2011 (  byteacat                PGNSP PGUID 12 f f t f i 2 17 "17 17"        byteacat - _null_ ));
@@ -2889,9 +2892,9 @@ DATA(insert OID = 2072 (  date_mi_interval        PGNSP PGUID 14 f f t f i 2 1114 "1082
 DESCR("subtract");
 
 DATA(insert OID = 2073 (  substring                    PGNSP PGUID 12 f f t f i 2 25 "25 25"   textregexsubstr - _null_ ));
-DESCR("substitutes regular expression");
-DATA(insert OID = 2074 (  substring                    PGNSP PGUID 14 f f t f i 3 25 "25 25 25"        "select substring($1, like_escape($2, $3))" - _null_ ));
-DESCR("substitutes regular expression with escape argument");
+DESCR("extracts text matching regular expression");
+DATA(insert OID = 2074 (  substring                    PGNSP PGUID 14 f f t f i 3 25 "25 25 25"        "select substring($1, similar_escape($2, $3))" - _null_ ));
+DESCR("extracts text matching SQL99 regular expression");
 
 DATA(insert OID = 2075 (  bit                          PGNSP PGUID 12 f f t f i 1 1560 "20"    bitfromint8 - _null_ ));
 DESCR("int8 to bitstring");
index bfa5fa8efc7de46552c7329aa5a20152f1a6f555..9f1c9ba8dbfa5df77455b3aa834b39f6ed73e595 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: builtins.h,v 1.201 2002/09/19 22:48:34 tgl Exp $
+ * $Id: builtins.h,v 1.202 2002/09/22 17:27:25 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -371,6 +371,7 @@ extern Datum nameicregexne(PG_FUNCTION_ARGS);
 extern Datum texticregexeq(PG_FUNCTION_ARGS);
 extern Datum texticregexne(PG_FUNCTION_ARGS);
 extern Datum textregexsubstr(PG_FUNCTION_ARGS);
+extern Datum similar_escape(PG_FUNCTION_ARGS);
 
 /* regproc.c */
 extern Datum regprocin(PG_FUNCTION_ARGS);
index 576fafb7729493308ae297abe0b2f7254d75868f..a73ca1aa84b17a9b5458618dc37ec664eca4877a 100644 (file)
@@ -142,15 +142,15 @@ SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456";
  t
 (1 row)
 
--- T581 regular expression substring
-SELECT SUBSTRING('abcdefg' FROM '(b|f).*(d)' FOR '#') AS "bcd";
+-- T581 regular expression substring (with SQL99's bizarre regexp syntax)
+SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd";
  bcd 
 -----
  bcd
 (1 row)
 
 -- No match should return NULL
-SELECT SUBSTRING('abcdefg' FROM '(1|2|3)' FOR '#') IS NULL AS "True";
+SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True";
  True 
 ------
  t
@@ -175,8 +175,16 @@ SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True";
  t
 (1 row)
 
--- PostgreSQL extention to allow omitting the escape character
-SELECT SUBSTRING('abcdefg' FROM '(c|d).e') AS "cde";
+-- PostgreSQL extension to allow omitting the escape character;
+-- here the regexp is taken as Posix syntax
+SELECT SUBSTRING('abcdefg' FROM 'c.e') AS "cde";
+ cde 
+-----
+ cde
+(1 row)
+
+-- With a parenthesized subexpression, return only what matches the subexpr
+SELECT SUBSTRING('abcdefg' FROM 'b(.*)f') AS "cde";
  cde 
 -----
  cde
index e5c15bc528fdfd516d12dea5821a4703cb2ed081..c0a18959cd391b5f471207917603838815da1acc 100644 (file)
@@ -62,19 +62,24 @@ SELECT SUBSTRING('1234567890' FROM 3) = '34567890' AS "34567890";
 
 SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456";
 
--- T581 regular expression substring
-SELECT SUBSTRING('abcdefg' FROM '(b|f).*(d)' FOR '#') AS "bcd";
+-- T581 regular expression substring (with SQL99's bizarre regexp syntax)
+SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd";
 
 -- No match should return NULL
-SELECT SUBSTRING('abcdefg' FROM '(1|2|3)' FOR '#') IS NULL AS "True";
+SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True";
 
 -- Null inputs should return NULL
 SELECT SUBSTRING('abcdefg' FROM '(b|c)' FOR NULL) IS NULL AS "True";
 SELECT SUBSTRING(NULL FROM '(b|c)' FOR '#') IS NULL AS "True";
 SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True";
 
--- PostgreSQL extention to allow omitting the escape character
-SELECT SUBSTRING('abcdefg' FROM '(c|d).e') AS "cde";
+-- PostgreSQL extension to allow omitting the escape character;
+-- here the regexp is taken as Posix syntax
+SELECT SUBSTRING('abcdefg' FROM 'c.e') AS "cde";
+
+-- With a parenthesized subexpression, return only what matches the subexpr
+SELECT SUBSTRING('abcdefg' FROM 'b(.*)f') AS "cde";
+
 
 -- E021-11 position expression
 SELECT POSITION('4' IN '1234567890') = '4' AS "4";