granicus.if.org Git - postgresql/commitdiff
Teach regular expression operators to honor collations.
author Tom Lane <tgl@sss.pgh.pa.us>
Sun, 10 Apr 2011 22:02:17 +0000 (18:02 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Sun, 10 Apr 2011 22:03:09 +0000 (18:03 -0400)
This involves getting the character classification and case-folding
functions in the regex library to use the collations infrastructure.
Most of this work had been done already in connection with the upper/lower
and LIKE logic, so it was a simple matter of transposition.

While at it, split out these functions into a separate source file
regc_pg_locale.c, so that they can be correctly labeled with the Postgres
project's license rather than the Scriptics license.  These functions are
100% Postgres-written code whereas what remains in regc_locale.c is still
mostly not ours, so lumping them both under the same copyright notice was
getting more and more misleading.

12 files changed:
doc/src/sgml/charset.sgml
src/backend/libpq/hba.c
src/backend/regex/Makefile
src/backend/regex/regc_locale.c
src/backend/regex/regc_pg_locale.c [new file with mode: 0644]
src/backend/regex/regcomp.c
src/backend/regex/regexec.c
src/backend/tsearch/spell.c
src/backend/utils/adt/regexp.c
src/include/regex/regex.h
src/test/regress/expected/collate.linux.utf8.out
src/test/regress/sql/collate.linux.utf8.sql

index 72ba333790223597978f25fc7c137024248559e0..6b0793e2197cf82ec08679029aac1d7ea41665ff 100644 (file)
@@ -221,17 +221,21 @@ initdb --locale=sv_SE
 
      <listitem>
       <para>
-       The ability to use indexes with <literal>LIKE</> clauses
-       <indexterm><primary>LIKE</><secondary>and locales</></indexterm>
+       The <function>upper</>, <function>lower</>, and <function>initcap</>
+       functions
+       <indexterm><primary>upper</><secondary>and locales</></indexterm>
+       <indexterm><primary>lower</><secondary>and locales</></indexterm>
       </para>
      </listitem>
 
      <listitem>
       <para>
-       The <function>upper</>,  <function>lower</>,  and <function>initcap</>
-       functions
-       <indexterm><primary>upper</><secondary>and locales</></indexterm>
-       <indexterm><primary>lower</><secondary>and locales</></indexterm>
+       Pattern matching operators (<literal>LIKE</>, <literal>SIMILAR TO</>,
+       and POSIX-style regular expressions); locales affect both case
+       insensitive matching and the classification of characters by
+       character-class regular expressions
+       <indexterm><primary>LIKE</><secondary>and locales</></indexterm>
+       <indexterm><primary>regular expressions</><secondary>and locales</></indexterm>
       </para>
      </listitem>
 
@@ -241,6 +245,12 @@ initdb --locale=sv_SE
        <indexterm><primary>to_char</><secondary>and locales</></indexterm>
       </para>
      </listitem>
+
+     <listitem>
+      <para>
+       The ability to use indexes with <literal>LIKE</> clauses
+      </para>
+     </listitem>
     </itemizedlist>
    </para>
 
@@ -319,8 +329,8 @@ initdb --locale=sv_SE
   <indexterm zone="collation"><primary>collation</></>
 
   <para>
-   The collation feature allows specifying the sort order and certain
-   other locale aspects of data per-column, or even per-operation.
+   The collation feature allows specifying the sort order and character
+   classification behavior of data per-column, or even per-operation.
    This alleviates the restriction that the
    <symbol>LC_COLLATE</symbol> and <symbol>LC_CTYPE</symbol> settings
    of a database cannot be changed after its creation.
@@ -351,8 +361,8 @@ initdb --locale=sv_SE
    </para>
 
    <para>
-    When the database system has to perform an ordering or a
-    comparison, it uses the collation of the input expression.  This
+    When the database system has to perform an ordering or a character
+    classification, it uses the collation of the input expression.  This
     happens, for example, with <literal>ORDER BY</literal> clauses
     and function or operator calls such as <literal>&lt;</literal>.
     The collation to apply for an <literal>ORDER BY</literal> clause
@@ -361,7 +371,8 @@ initdb --locale=sv_SE
     below.  In addition to comparison operators, collations are taken into
     account by functions that convert between lower and upper case
     letters, such as <function>lower</>, <function>upper</>, and
-    <function>initcap</>.
+    <function>initcap</>; by pattern matching operators; and by
+    <function>to_char</> and related functions.
    </para>
 
    <para>
index 1f79c8fe007ff9882287728eb734e933b0f9be1c..f25505feb385f62b54d9b29654f8fab93c1ad375 100644 (file)
@@ -25,6 +25,7 @@
 #include <arpa/inet.h>
 #include <unistd.h>
 
+#include "catalog/pg_collation.h"
 #include "libpq/ip.h"
 #include "libpq/libpq.h"
 #include "regex/regex.h"
@@ -1781,7 +1782,7 @@ parse_ident_usermap(List *line, int line_number, const char *usermap_name,
                 * XXX: Major room for optimization: regexps could be compiled when
                 * the file is loaded and then re-used in every connection.
                 */
-               r = pg_regcomp(&re, wstr, wlen, REG_ADVANCED);
+               r = pg_regcomp(&re, wstr, wlen, REG_ADVANCED, C_COLLATION_OID);
                if (r)
                {
                        char            errstr[100];
index b4146449dec7d929fae766cef564b6e1d8167c15..21e7fa5329b9384333d6a8c9912be81dd24a4867 100644 (file)
@@ -17,6 +17,7 @@ OBJS = regcomp.o regerror.o regexec.o regfree.o
 include $(top_srcdir)/src/backend/common.mk
 
 # mark inclusion dependencies between .c files explicitly
-regcomp.o: regcomp.c regc_lex.c regc_color.c regc_nfa.c regc_cvec.c regc_locale.c
+regcomp.o: regcomp.c regc_lex.c regc_color.c regc_nfa.c regc_cvec.c \
+        regc_locale.c regc_pg_locale.c
 
 regexec.o: regexec.c rege_dfa.c
index 4f891973643ae594db4a6c9d1eb23679bee1b001..0f70931b13edfef77b6a4908e3b038aa5542153d 100644 (file)
@@ -350,171 +350,6 @@ static const struct cname
 };
 
 
-/*
- * ctype functions adapted to work on pg_wchar (a/k/a chr)
- *
- * When working in UTF8 encoding, we use the <wctype.h> functions if
- * available.  This assumes that every platform uses Unicode codepoints
- * directly as the wchar_t representation of Unicode.  On some platforms
- * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
- *
- * In all other encodings, we use the <ctype.h> functions for pg_wchar
- * values up to 255, and punt for values above that.  This is only 100%
- * correct in single-byte encodings such as LATINn.  However, non-Unicode
- * multibyte encodings are mostly Far Eastern character sets for which the
- * properties being tested here aren't relevant for higher code values anyway.
- *
- * NB: the coding here assumes pg_wchar is an unsigned type.
- */
-
-static int
-pg_wc_isdigit(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-       if (GetDatabaseEncoding() == PG_UTF8)
-       {
-               if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-                       return iswdigit((wint_t) c);
-       }
-#endif
-       return (c <= (pg_wchar) UCHAR_MAX && isdigit((unsigned char) c));
-}
-
-static int
-pg_wc_isalpha(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-       if (GetDatabaseEncoding() == PG_UTF8)
-       {
-               if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-                       return iswalpha((wint_t) c);
-       }
-#endif
-       return (c <= (pg_wchar) UCHAR_MAX && isalpha((unsigned char) c));
-}
-
-static int
-pg_wc_isalnum(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-       if (GetDatabaseEncoding() == PG_UTF8)
-       {
-               if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-                       return iswalnum((wint_t) c);
-       }
-#endif
-       return (c <= (pg_wchar) UCHAR_MAX && isalnum((unsigned char) c));
-}
-
-static int
-pg_wc_isupper(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-       if (GetDatabaseEncoding() == PG_UTF8)
-       {
-               if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-                       return iswupper((wint_t) c);
-       }
-#endif
-       return (c <= (pg_wchar) UCHAR_MAX && isupper((unsigned char) c));
-}
-
-static int
-pg_wc_islower(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-       if (GetDatabaseEncoding() == PG_UTF8)
-       {
-               if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-                       return iswlower((wint_t) c);
-       }
-#endif
-       return (c <= (pg_wchar) UCHAR_MAX && islower((unsigned char) c));
-}
-
-static int
-pg_wc_isgraph(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-       if (GetDatabaseEncoding() == PG_UTF8)
-       {
-               if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-                       return iswgraph((wint_t) c);
-       }
-#endif
-       return (c <= (pg_wchar) UCHAR_MAX && isgraph((unsigned char) c));
-}
-
-static int
-pg_wc_isprint(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-       if (GetDatabaseEncoding() == PG_UTF8)
-       {
-               if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-                       return iswprint((wint_t) c);
-       }
-#endif
-       return (c <= (pg_wchar) UCHAR_MAX && isprint((unsigned char) c));
-}
-
-static int
-pg_wc_ispunct(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-       if (GetDatabaseEncoding() == PG_UTF8)
-       {
-               if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-                       return iswpunct((wint_t) c);
-       }
-#endif
-       return (c <= (pg_wchar) UCHAR_MAX && ispunct((unsigned char) c));
-}
-
-static int
-pg_wc_isspace(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-       if (GetDatabaseEncoding() == PG_UTF8)
-       {
-               if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-                       return iswspace((wint_t) c);
-       }
-#endif
-       return (c <= (pg_wchar) UCHAR_MAX && isspace((unsigned char) c));
-}
-
-static pg_wchar
-pg_wc_toupper(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-       if (GetDatabaseEncoding() == PG_UTF8)
-       {
-               if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-                       return towupper((wint_t) c);
-       }
-#endif
-       if (c <= (pg_wchar) UCHAR_MAX)
-               return toupper((unsigned char) c);
-       return c;
-}
-
-static pg_wchar
-pg_wc_tolower(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-       if (GetDatabaseEncoding() == PG_UTF8)
-       {
-               if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-                       return towlower((wint_t) c);
-       }
-#endif
-       if (c <= (pg_wchar) UCHAR_MAX)
-               return tolower((unsigned char) c);
-       return c;
-}
-
-
 /*
  * element - map collating-element name to celt
  */
diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c
new file mode 100644 (file)
index 0000000..ccfcb71
--- /dev/null
@@ -0,0 +1,649 @@
+/*-------------------------------------------------------------------------
+ *
+ * regc_pg_locale.c
+ *       ctype functions adapted to work on pg_wchar (a/k/a chr)
+ *
+ * This file is #included by regcomp.c; it's not meant to compile standalone.
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *       src/backend/regex/regc_pg_locale.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "catalog/pg_collation.h"
+#include "utils/pg_locale.h"
+
+/*
+ * To provide as much functionality as possible on a variety of platforms,
+ * without going so far as to implement everything from scratch, we use
+ * several implementation strategies depending on the situation:
+ *
+ * 1. In C/POSIX collations, we use hard-wired code.  We can't depend on
+ * the <ctype.h> functions since those will obey LC_CTYPE.  Note that these
+ * collations don't give a fig about multibyte characters.
+ *
+ * 2. In the "default" collation (which is supposed to obey LC_CTYPE):
+ *
+ * 2a. When working in UTF8 encoding, we use the <wctype.h> functions if
+ * available.  This assumes that every platform uses Unicode codepoints
+ * directly as the wchar_t representation of Unicode.  On some platforms
+ * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
+ *
+ * 2b. In all other encodings, or on machines that lack <wctype.h>, we use
+ * the <ctype.h> functions for pg_wchar values up to 255, and punt for values
+ * above that.  This is only 100% correct in single-byte encodings such as
+ * LATINn.  However, non-Unicode multibyte encodings are mostly Far Eastern
+ * character sets for which the properties being tested here aren't very
+ * relevant for higher code values anyway.  The difficulty with using the
+ * <wctype.h> functions with non-Unicode multibyte encodings is that we can
+ * have no certainty that the platform's wchar_t representation matches
+ * what we do in pg_wchar conversions.
+ *
+ * 3. Other collations are only supported on platforms that HAVE_LOCALE_T.
+ * Here, we use the locale_t-extended forms of the <wctype.h> and <ctype.h>
+ * functions, under exactly the same cases as #2.
+ *
+ * There is one notable difference between cases 2 and 3: in the "default"
+ * collation we force ASCII letters to follow ASCII upcase/downcase rules,
+ * while in a non-default collation we just let the library functions do what
+ * they will.  The case where this matters is treatment of I/i in Turkish,
+ * and the behavior is meant to match the upper()/lower() SQL functions.
+ *
+ * We store the active collation setting in static variables.  In principle
+ * it could be passed down to here via the regex library's "struct vars" data
+ * structure; but that would require somewhat invasive changes in the regex
+ * library, and right now there's no real benefit to be gained from that.
+ *
+ * NB: the coding here assumes pg_wchar is an unsigned type.
+ */
+
+typedef enum
+{
+       PG_REGEX_LOCALE_C,                      /* C/POSIX collation: hard-wired ASCII table, any encoding */
+       PG_REGEX_LOCALE_WIDE,           /* UTF8 encoding, no locale_t: use <wctype.h> functions */
+       PG_REGEX_LOCALE_1BYTE,          /* otherwise, no locale_t: use <ctype.h> functions */
+       PG_REGEX_LOCALE_WIDE_L,         /* UTF8 encoding with locale_t: use <wctype.h> *_l functions */
+       PG_REGEX_LOCALE_1BYTE_L         /* otherwise, with locale_t: use <ctype.h> *_l functions */
+} PG_Locale_Strategy;
+
+static PG_Locale_Strategy pg_regex_strategy;   /* set by pg_set_regex_collation() */
+static pg_locale_t pg_regex_locale;    /* 0 for the C and default collations */
+
+/*
+ * Hard-wired character properties for C locale (bit flags, OR'ed per char)
+ */
+#define PG_ISDIGIT     0x01
+#define PG_ISALPHA     0x02
+#define PG_ISALNUM     (PG_ISDIGIT | PG_ISALPHA)       /* a mask: either bit set means alnum */
+#define PG_ISUPPER     0x04
+#define PG_ISLOWER     0x08
+#define PG_ISGRAPH     0x10
+#define PG_ISPRINT     0x20
+#define PG_ISPUNCT     0x40
+#define PG_ISSPACE     0x80
+
+static const unsigned char pg_char_properties[128] = {
+       /* NUL */       0,
+       /* ^A */        0,
+       /* ^B */        0,
+       /* ^C */        0,
+       /* ^D */        0,
+       /* ^E */        0,
+       /* ^F */        0,
+       /* ^G */        0,
+       /* ^H */        0,
+       /* ^I */        PG_ISSPACE,
+       /* ^J */        PG_ISSPACE,
+       /* ^K */        PG_ISSPACE,
+       /* ^L */        PG_ISSPACE,
+       /* ^M */        PG_ISSPACE,
+       /* ^N */        0,
+       /* ^O */        0,
+       /* ^P */        0,
+       /* ^Q */        0,
+       /* ^R */        0,
+       /* ^S */        0,
+       /* ^T */        0,
+       /* ^U */        0,
+       /* ^V */        0,
+       /* ^W */        0,
+       /* ^X */        0,
+       /* ^Y */        0,
+       /* ^Z */        0,
+       /* ^[ */        0,
+       /* ^\ */        0,
+       /* ^] */        0,
+       /* ^^ */        0,
+       /* ^_ */        0,
+       /*    */        PG_ISPRINT | PG_ISSPACE,
+       /* !  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* "  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* #  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* $  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* %  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* &  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* '  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* (  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* )  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* *  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* +  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* ,  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* -  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* .  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* /  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* 0  */        PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+       /* 1  */        PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+       /* 2  */        PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+       /* 3  */        PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+       /* 4  */        PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+       /* 5  */        PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+       /* 6  */        PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+       /* 7  */        PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+       /* 8  */        PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+       /* 9  */        PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+       /* :  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* ;  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* <  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* =  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* >  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* ?  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* @  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* A  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* B  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* C  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* D  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* E  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* F  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* G  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* H  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* I  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* J  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* K  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* L  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* M  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* N  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* O  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* P  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* Q  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* R  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* S  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* T  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* U  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* V  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* W  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* X  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* Y  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* Z  */        PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+       /* [  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* \  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* ]  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* ^  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* _  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* `  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* a  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* b  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* c  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* d  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* e  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* f  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* g  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* h  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* i  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* j  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* k  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* l  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* m  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* n  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* o  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* p  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* q  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* r  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* s  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* t  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* u  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* v  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* w  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* x  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* y  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* z  */        PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+       /* {  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* |  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* }  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* ~  */        PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+       /* DEL */       0
+};
+
+
+/*
+ * pg_set_regex_collation: set collation for these functions to obey
+ *
+ * This is called when beginning compilation or execution of a regexp.
+ * It sets pg_regex_strategy and pg_regex_locale; static variables are okay
+ * here since there's no need for re-entrancy of regexp operations.
+ */
+void
+pg_set_regex_collation(Oid collation)
+{
+       if (lc_ctype_is_c(collation))
+       {
+               /* C/POSIX collations use this path regardless of database encoding */
+               pg_regex_strategy = PG_REGEX_LOCALE_C;
+               pg_regex_locale = 0;            /* the table lookup needs no locale_t */
+       }
+       else
+       {
+               if (collation == DEFAULT_COLLATION_OID)
+                       pg_regex_locale = 0;    /* selects the non-locale_t strategies below */
+               else if (OidIsValid(collation))
+               {
+                       /*
+                        * NB: pg_newlocale_from_collation will fail if not HAVE_LOCALE_T;
+                        * the case of pg_regex_locale != 0 but not HAVE_LOCALE_T does
+                        * not have to be considered below.
+                        */
+                       pg_regex_locale = pg_newlocale_from_collation(collation);
+               }
+               else
+               {
+                       /*
+                        * Invalid collation OID.  This typically means that the parser
+                        * could not resolve a conflict of implicit collations, so
+                        * report it that way.
+                        */
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_INDETERMINATE_COLLATION),
+                                        errmsg("could not determine which collation to use for regular expression"),
+                                        errhint("Use the COLLATE clause to set the collation explicitly.")));
+               }
+
+#ifdef USE_WIDE_UPPER_LOWER
+               if (GetDatabaseEncoding() == PG_UTF8)   /* wide-char functions are used only for UTF8 */
+               {
+                       if (pg_regex_locale)
+                               pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L;
+                       else
+                               pg_regex_strategy = PG_REGEX_LOCALE_WIDE;
+               }
+               else
+#endif   /* USE_WIDE_UPPER_LOWER */
+               {
+                       if (pg_regex_locale)
+                               pg_regex_strategy = PG_REGEX_LOCALE_1BYTE_L;
+                       else
+                               pg_regex_strategy = PG_REGEX_LOCALE_1BYTE;
+               }
+       }
+}
+
+static int
+pg_wc_isdigit(pg_wchar c)              /* nonzero if c is a digit under the active strategy */
+{
+       switch (pg_regex_strategy)      /* set by pg_set_regex_collation() */
+       {
+               case PG_REGEX_LOCALE_C: /* ASCII table; codes above 127 never match */
+                       return (c <= (pg_wchar) 127 &&
+                                       (pg_char_properties[c] & PG_ISDIGIT));
+               case PG_REGEX_LOCALE_WIDE:
+#ifdef USE_WIDE_UPPER_LOWER
+                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+                               return iswdigit((wint_t) c);
+#endif
+                       /* FALL THRU: wide path not compiled, or c too big for a narrow wchar_t */
+               case PG_REGEX_LOCALE_1BYTE:     /* values above UCHAR_MAX never match */
+                       return (c <= (pg_wchar) UCHAR_MAX &&
+                                       isdigit((unsigned char) c));
+               case PG_REGEX_LOCALE_WIDE_L:
+#if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
+                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+                               return iswdigit_l((wint_t) c, pg_regex_locale);
+#endif
+                       /* FALL THRU: wide path not compiled, or c too big for a narrow wchar_t */
+               case PG_REGEX_LOCALE_1BYTE_L:   /* values above UCHAR_MAX never match */
+#ifdef HAVE_LOCALE_T
+                       return (c <= (pg_wchar) UCHAR_MAX &&
+                                       isdigit_l((unsigned char) c, pg_regex_locale));
+#endif
+       }
+       return 0;                                       /* only an _L case built without HAVE_LOCALE_T lands here; keeps compiler quiet */
+}
+
+static int
+pg_wc_isalpha(pg_wchar c)              /* nonzero if c is alphabetic under the active strategy */
+{
+       switch (pg_regex_strategy)      /* set by pg_set_regex_collation() */
+       {
+               case PG_REGEX_LOCALE_C: /* ASCII table; codes above 127 never match */
+                       return (c <= (pg_wchar) 127 &&
+                                       (pg_char_properties[c] & PG_ISALPHA));
+               case PG_REGEX_LOCALE_WIDE:
+#ifdef USE_WIDE_UPPER_LOWER
+                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+                               return iswalpha((wint_t) c);
+#endif
+                       /* FALL THRU: wide path not compiled, or c too big for a narrow wchar_t */
+               case PG_REGEX_LOCALE_1BYTE:     /* values above UCHAR_MAX never match */
+                       return (c <= (pg_wchar) UCHAR_MAX &&
+                                       isalpha((unsigned char) c));
+               case PG_REGEX_LOCALE_WIDE_L:
+#if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
+                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+                               return iswalpha_l((wint_t) c, pg_regex_locale);
+#endif
+                       /* FALL THRU: wide path not compiled, or c too big for a narrow wchar_t */
+               case PG_REGEX_LOCALE_1BYTE_L:   /* values above UCHAR_MAX never match */
+#ifdef HAVE_LOCALE_T
+                       return (c <= (pg_wchar) UCHAR_MAX &&
+                                       isalpha_l((unsigned char) c, pg_regex_locale));
+#endif
+       }
+       return 0;                                       /* only an _L case built without HAVE_LOCALE_T lands here; keeps compiler quiet */
+}
+
+static int
+pg_wc_isalnum(pg_wchar c)              /* nonzero if c is a letter or digit under the active strategy */
+{
+       switch (pg_regex_strategy)      /* set by pg_set_regex_collation() */
+       {
+               case PG_REGEX_LOCALE_C: /* ASCII table; PG_ISALNUM is a two-bit mask, either bit matches */
+                       return (c <= (pg_wchar) 127 &&
+                                       (pg_char_properties[c] & PG_ISALNUM));
+               case PG_REGEX_LOCALE_WIDE:
+#ifdef USE_WIDE_UPPER_LOWER
+                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+                               return iswalnum((wint_t) c);
+#endif
+                       /* FALL THRU: wide path not compiled, or c too big for a narrow wchar_t */
+               case PG_REGEX_LOCALE_1BYTE:     /* values above UCHAR_MAX never match */
+                       return (c <= (pg_wchar) UCHAR_MAX &&
+                                       isalnum((unsigned char) c));
+               case PG_REGEX_LOCALE_WIDE_L:
+#if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
+                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+                               return iswalnum_l((wint_t) c, pg_regex_locale);
+#endif
+                       /* FALL THRU: wide path not compiled, or c too big for a narrow wchar_t */
+               case PG_REGEX_LOCALE_1BYTE_L:   /* values above UCHAR_MAX never match */
+#ifdef HAVE_LOCALE_T
+                       return (c <= (pg_wchar) UCHAR_MAX &&
+                                       isalnum_l((unsigned char) c, pg_regex_locale));
+#endif
+       }
+       return 0;                                       /* only an _L case built without HAVE_LOCALE_T lands here; keeps compiler quiet */
+}
+
+static int
+pg_wc_isupper(pg_wchar c)              /* nonzero if c is upper case under the active strategy */
+{
+       switch (pg_regex_strategy)      /* set by pg_set_regex_collation() */
+       {
+               case PG_REGEX_LOCALE_C: /* ASCII table; codes above 127 never match */
+                       return (c <= (pg_wchar) 127 &&
+                                       (pg_char_properties[c] & PG_ISUPPER));
+               case PG_REGEX_LOCALE_WIDE:
+#ifdef USE_WIDE_UPPER_LOWER
+                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+                               return iswupper((wint_t) c);
+#endif
+                       /* FALL THRU: wide path not compiled, or c too big for a narrow wchar_t */
+               case PG_REGEX_LOCALE_1BYTE:     /* values above UCHAR_MAX never match */
+                       return (c <= (pg_wchar) UCHAR_MAX &&
+                                       isupper((unsigned char) c));
+               case PG_REGEX_LOCALE_WIDE_L:
+#if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
+                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+                               return iswupper_l((wint_t) c, pg_regex_locale);
+#endif
+                       /* FALL THRU: wide path not compiled, or c too big for a narrow wchar_t */
+               case PG_REGEX_LOCALE_1BYTE_L:   /* values above UCHAR_MAX never match */
+#ifdef HAVE_LOCALE_T
+                       return (c <= (pg_wchar) UCHAR_MAX &&
+                                       isupper_l((unsigned char) c, pg_regex_locale));
+#endif
+       }
+       return 0;                                       /* only an _L case built without HAVE_LOCALE_T lands here; keeps compiler quiet */
+}
+
+static int
+pg_wc_islower(pg_wchar c)              /* nonzero if c is lower case under the active strategy */
+{
+       switch (pg_regex_strategy)      /* set by pg_set_regex_collation() */
+       {
+               case PG_REGEX_LOCALE_C: /* ASCII table; codes above 127 never match */
+                       return (c <= (pg_wchar) 127 &&
+                                       (pg_char_properties[c] & PG_ISLOWER));
+               case PG_REGEX_LOCALE_WIDE:
+#ifdef USE_WIDE_UPPER_LOWER
+                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+                               return iswlower((wint_t) c);
+#endif
+                       /* FALL THRU: wide path not compiled, or c too big for a narrow wchar_t */
+               case PG_REGEX_LOCALE_1BYTE:     /* values above UCHAR_MAX never match */
+                       return (c <= (pg_wchar) UCHAR_MAX &&
+                                       islower((unsigned char) c));
+               case PG_REGEX_LOCALE_WIDE_L:
+#if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
+                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+                               return iswlower_l((wint_t) c, pg_regex_locale);
+#endif
+                       /* FALL THRU: wide path not compiled, or c too big for a narrow wchar_t */
+               case PG_REGEX_LOCALE_1BYTE_L:   /* values above UCHAR_MAX never match */
+#ifdef HAVE_LOCALE_T
+                       return (c <= (pg_wchar) UCHAR_MAX &&
+                                       islower_l((unsigned char) c, pg_regex_locale));
+#endif
+       }
+       return 0;                                       /* only an _L case built without HAVE_LOCALE_T lands here; keeps compiler quiet */
+}
+
+static int
+pg_wc_isgraph(pg_wchar c)              /* nonzero if c is a graphic character under the active strategy */
+{
+       switch (pg_regex_strategy)      /* set by pg_set_regex_collation() */
+       {
+               case PG_REGEX_LOCALE_C: /* ASCII table; codes above 127 never match */
+                       return (c <= (pg_wchar) 127 &&
+                                       (pg_char_properties[c] & PG_ISGRAPH));
+               case PG_REGEX_LOCALE_WIDE:
+#ifdef USE_WIDE_UPPER_LOWER
+                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+                               return iswgraph((wint_t) c);
+#endif
+                       /* FALL THRU: wide path not compiled, or c too big for a narrow wchar_t */
+               case PG_REGEX_LOCALE_1BYTE:     /* values above UCHAR_MAX never match */
+                       return (c <= (pg_wchar) UCHAR_MAX &&
+                                       isgraph((unsigned char) c));
+               case PG_REGEX_LOCALE_WIDE_L:
+#if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
+                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+                               return iswgraph_l((wint_t) c, pg_regex_locale);
+#endif
+                       /* FALL THRU: wide path not compiled, or c too big for a narrow wchar_t */
+               case PG_REGEX_LOCALE_1BYTE_L:   /* values above UCHAR_MAX never match */
+#ifdef HAVE_LOCALE_T
+                       return (c <= (pg_wchar) UCHAR_MAX &&
+                                       isgraph_l((unsigned char) c, pg_regex_locale));
+#endif
+       }
+       return 0;                                       /* only an _L case built without HAVE_LOCALE_T lands here; keeps compiler quiet */
+}
+
+static int                                     /* nonzero if c is printable under pg_regex_strategy, else 0 */
+pg_wc_isprint(pg_wchar c)
+{
+       switch (pg_regex_strategy)
+       {
+               case PG_REGEX_LOCALE_C:         /* only codes 0..127 can qualify; looked up in pg_char_properties */
+                       return (c <= (pg_wchar) 127 &&
+                                       (pg_char_properties[c] & PG_ISPRINT));
+               case PG_REGEX_LOCALE_WIDE:      /* iswprint() if wchar_t is >= 4 bytes or c <= 0xFFFF */
+#ifdef USE_WIDE_UPPER_LOWER
+                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+                               return iswprint((wint_t) c);
+#endif
+                       /* FALL THRU */
+               case PG_REGEX_LOCALE_1BYTE:     /* isprint() for codes <= UCHAR_MAX; larger codes yield 0 */
+                       return (c <= (pg_wchar) UCHAR_MAX &&
+                                       isprint((unsigned char) c));
+               case PG_REGEX_LOCALE_WIDE_L:    /* same test as WIDE, but against pg_regex_locale */
+#if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
+                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+                               return iswprint_l((wint_t) c, pg_regex_locale);
+#endif
+                       /* FALL THRU */
+               case PG_REGEX_LOCALE_1BYTE_L:   /* same test as 1BYTE, but against pg_regex_locale; without HAVE_LOCALE_T this drops to the final return */
+#ifdef HAVE_LOCALE_T
+                       return (c <= (pg_wchar) UCHAR_MAX &&
+                                       isprint_l((unsigned char) c, pg_regex_locale));
+#endif
+       }
+       return 0;                                       /* can't get here, but keep compiler quiet */
+}
+
+static int                                     /* nonzero if c is punctuation under pg_regex_strategy, else 0 */
+pg_wc_ispunct(pg_wchar c)
+{
+       switch (pg_regex_strategy)
+       {
+               case PG_REGEX_LOCALE_C:         /* only codes 0..127 can qualify; looked up in pg_char_properties */
+                       return (c <= (pg_wchar) 127 &&
+                                       (pg_char_properties[c] & PG_ISPUNCT));
+               case PG_REGEX_LOCALE_WIDE:      /* iswpunct() if wchar_t is >= 4 bytes or c <= 0xFFFF */
+#ifdef USE_WIDE_UPPER_LOWER
+                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+                               return iswpunct((wint_t) c);
+#endif
+                       /* FALL THRU */
+               case PG_REGEX_LOCALE_1BYTE:     /* ispunct() for codes <= UCHAR_MAX; larger codes yield 0 */
+                       return (c <= (pg_wchar) UCHAR_MAX &&
+                                       ispunct((unsigned char) c));
+               case PG_REGEX_LOCALE_WIDE_L:    /* same test as WIDE, but against pg_regex_locale */
+#if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
+                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+                               return iswpunct_l((wint_t) c, pg_regex_locale);
+#endif
+                       /* FALL THRU */
+               case PG_REGEX_LOCALE_1BYTE_L:   /* same test as 1BYTE, but against pg_regex_locale; without HAVE_LOCALE_T this drops to the final return */
+#ifdef HAVE_LOCALE_T
+                       return (c <= (pg_wchar) UCHAR_MAX &&
+                                       ispunct_l((unsigned char) c, pg_regex_locale));
+#endif
+       }
+       return 0;                                       /* can't get here, but keep compiler quiet */
+}
+
+static int                                     /* nonzero if c is white space under pg_regex_strategy, else 0 */
+pg_wc_isspace(pg_wchar c)
+{
+       switch (pg_regex_strategy)
+       {
+               case PG_REGEX_LOCALE_C:         /* only codes 0..127 can qualify; looked up in pg_char_properties */
+                       return (c <= (pg_wchar) 127 &&
+                                       (pg_char_properties[c] & PG_ISSPACE));
+               case PG_REGEX_LOCALE_WIDE:      /* iswspace() if wchar_t is >= 4 bytes or c <= 0xFFFF */
+#ifdef USE_WIDE_UPPER_LOWER
+                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+                               return iswspace((wint_t) c);
+#endif
+                       /* FALL THRU */
+               case PG_REGEX_LOCALE_1BYTE:     /* isspace() for codes <= UCHAR_MAX; larger codes yield 0 */
+                       return (c <= (pg_wchar) UCHAR_MAX &&
+                                       isspace((unsigned char) c));
+               case PG_REGEX_LOCALE_WIDE_L:    /* same test as WIDE, but against pg_regex_locale */
+#if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
+                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+                               return iswspace_l((wint_t) c, pg_regex_locale);
+#endif
+                       /* FALL THRU */
+               case PG_REGEX_LOCALE_1BYTE_L:   /* same test as 1BYTE, but against pg_regex_locale; without HAVE_LOCALE_T this drops to the final return */
+#ifdef HAVE_LOCALE_T
+                       return (c <= (pg_wchar) UCHAR_MAX &&
+                                       isspace_l((unsigned char) c, pg_regex_locale));
+#endif
+       }
+       return 0;                                       /* can't get here, but keep compiler quiet */
+}
+
+static pg_wchar                                /* c folded to upper case per pg_regex_strategy; c itself where no branch converts it */
+pg_wc_toupper(pg_wchar c)
+{
+       switch (pg_regex_strategy)
+       {
+               case PG_REGEX_LOCALE_C:         /* only codes 0..127 are folded; everything else is returned as-is */
+                       if (c <= (pg_wchar) 127)
+                               return pg_ascii_toupper((unsigned char) c);
+                       return c;
+               case PG_REGEX_LOCALE_WIDE:      /* ASCII first, then towupper() if wchar_t is >= 4 bytes or c <= 0xFFFF */
+                       /* force C behavior for ASCII characters, per comments above */
+                       if (c <= (pg_wchar) 127)
+                               return pg_ascii_toupper((unsigned char) c);
+#ifdef USE_WIDE_UPPER_LOWER
+                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+                               return towupper((wint_t) c);
+#endif
+                       /* FALL THRU */
+               case PG_REGEX_LOCALE_1BYTE:     /* ASCII first, then toupper() up to UCHAR_MAX; larger codes unchanged */
+                       /* force C behavior for ASCII characters, per comments above */
+                       if (c <= (pg_wchar) 127)
+                               return pg_ascii_toupper((unsigned char) c);
+                       if (c <= (pg_wchar) UCHAR_MAX)
+                               return toupper((unsigned char) c);
+                       return c;
+               case PG_REGEX_LOCALE_WIDE_L:    /* towupper_l() against pg_regex_locale; this branch has no ASCII special case */
+#if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
+                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+                               return towupper_l((wint_t) c, pg_regex_locale);
+#endif
+                       /* FALL THRU */
+               case PG_REGEX_LOCALE_1BYTE_L:   /* toupper_l() up to UCHAR_MAX; otherwise, or without HAVE_LOCALE_T, c is returned unchanged */
+#ifdef HAVE_LOCALE_T
+                       if (c <= (pg_wchar) UCHAR_MAX)
+                               return toupper_l((unsigned char) c, pg_regex_locale);
+#endif
+                       return c;
+       }
+       return 0;                                       /* can't get here, but keep compiler quiet */
+}
+
+static pg_wchar                                /* c folded to lower case per pg_regex_strategy; c itself where no branch converts it */
+pg_wc_tolower(pg_wchar c)
+{
+       switch (pg_regex_strategy)
+       {
+               case PG_REGEX_LOCALE_C:         /* only codes 0..127 are folded; everything else is returned as-is */
+                       if (c <= (pg_wchar) 127)
+                               return pg_ascii_tolower((unsigned char) c);
+                       return c;
+               case PG_REGEX_LOCALE_WIDE:      /* ASCII first, then towlower() if wchar_t is >= 4 bytes or c <= 0xFFFF */
+                       /* force C behavior for ASCII characters, per comments above */
+                       if (c <= (pg_wchar) 127)
+                               return pg_ascii_tolower((unsigned char) c);
+#ifdef USE_WIDE_UPPER_LOWER
+                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+                               return towlower((wint_t) c);
+#endif
+                       /* FALL THRU */
+               case PG_REGEX_LOCALE_1BYTE:     /* ASCII first, then tolower() up to UCHAR_MAX; larger codes unchanged */
+                       /* force C behavior for ASCII characters, per comments above */
+                       if (c <= (pg_wchar) 127)
+                               return pg_ascii_tolower((unsigned char) c);
+                       if (c <= (pg_wchar) UCHAR_MAX)
+                               return tolower((unsigned char) c);
+                       return c;
+               case PG_REGEX_LOCALE_WIDE_L:    /* towlower_l() against pg_regex_locale; this branch has no ASCII special case */
+#if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
+                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+                               return towlower_l((wint_t) c, pg_regex_locale);
+#endif
+                       /* FALL THRU */
+               case PG_REGEX_LOCALE_1BYTE_L:   /* tolower_l() up to UCHAR_MAX; otherwise, or without HAVE_LOCALE_T, c is returned unchanged */
+#ifdef HAVE_LOCALE_T
+                       if (c <= (pg_wchar) UCHAR_MAX)
+                               return tolower_l((unsigned char) c, pg_regex_locale);
+#endif
+                       return c;
+       }
+       return 0;                                       /* can't get here, but keep compiler quiet */
+}
index 6ed466a9d9741695668fac9e8adcaa883e6e29a6..bd4d4c3761928b098c1e5aab030968ea8046b4ab 100644 (file)
@@ -172,7 +172,7 @@ static void addrange(struct cvec *, chr, chr);
 static struct cvec *getcvec(struct vars *, int, int);
 static void freecvec(struct cvec *);
 
-/* === regc_locale.c === */
+/* === regc_pg_locale.c === */
 static int     pg_wc_isdigit(pg_wchar c);
 static int     pg_wc_isalpha(pg_wchar c);
 static int     pg_wc_isalnum(pg_wchar c);
@@ -184,6 +184,8 @@ static int  pg_wc_ispunct(pg_wchar c);
 static int     pg_wc_isspace(pg_wchar c);
 static pg_wchar pg_wc_toupper(pg_wchar c);
 static pg_wchar pg_wc_tolower(pg_wchar c);
+
+/* === regc_locale.c === */
 static celt element(struct vars *, const chr *, const chr *);
 static struct cvec *range(struct vars *, celt, celt, int);
 static int     before(celt, celt);
@@ -281,7 +283,8 @@ int
 pg_regcomp(regex_t *re,
                   const chr *string,
                   size_t len,
-                  int flags)
+                  int flags,
+                  Oid collation)
 {
        struct vars var;
        struct vars *v = &var;
@@ -307,6 +310,9 @@ pg_regcomp(regex_t *re,
        if (!(flags & REG_EXTENDED) && (flags & REG_ADVF))
                return REG_INVARG;
 
+       /* Initialize locale-dependent support */
+       pg_set_regex_collation(collation);
+
        /* initial setup (after which freev() is callable) */
        v->re = re;
        v->now = string;
@@ -333,6 +339,7 @@ pg_regcomp(regex_t *re,
        re->re_magic = REMAGIC;
        re->re_info = 0;                        /* bits get set during parse */
        re->re_csize = sizeof(chr);
+       re->re_collation = collation;
        re->re_guts = NULL;
        re->re_fns = VS(&functions);
 
@@ -1987,4 +1994,5 @@ stid(struct subre * t,
 #include "regc_color.c"
 #include "regc_nfa.c"
 #include "regc_cvec.c"
+#include "regc_pg_locale.c"
 #include "regc_locale.c"
index 5642bdfedfd24a96f25b66d617cada72f390533c..7dc0ddba29ef2f785050ee3fc3677708fcdcb16a 100644 (file)
@@ -192,6 +192,9 @@ pg_regexec(regex_t *re,
        if (re->re_csize != sizeof(chr))
                return REG_MIXED;
 
+       /* Initialize locale-dependent support */
+       pg_set_regex_collation(re->re_collation);
+
        /* setup */
        v->re = re;
        v->g = (struct guts *) re->re_guts;
index ecc880f54de6f68eb1f4c39bab67e7cb9f1c88ba..8c0eaa78a7c4b7caafa8d22dc91eaa49fbb5628f 100644 (file)
@@ -14,6 +14,7 @@
 
 #include "postgres.h"
 
+#include "catalog/pg_collation.h"
 #include "tsearch/dicts/spell.h"
 #include "tsearch/ts_locale.h"
 #include "utils/memutils.h"
@@ -425,7 +426,9 @@ NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const c
                wmask = (pg_wchar *) tmpalloc((masklen + 1) * sizeof(pg_wchar));
                wmasklen = pg_mb2wchar_with_len(tmask, wmask, masklen);
 
-               err = pg_regcomp(&(Affix->reg.regex), wmask, wmasklen, REG_ADVANCED | REG_NOSUB);
+               err = pg_regcomp(&(Affix->reg.regex), wmask, wmasklen,
+                                                REG_ADVANCED | REG_NOSUB,
+                                                DEFAULT_COLLATION_OID);
                if (err)
                {
                        char            errstr[100];
index a4cb87915bba9a7d5188248f6a861d66f3afd71e..0dbbd6715c98b2c8b8ba598e73cd72915bb70036 100644 (file)
@@ -96,6 +96,7 @@ typedef struct cached_re_str
        char       *cre_pat;            /* original RE (not null terminated!) */
        int                     cre_pat_len;    /* length of original RE, in bytes */
        int                     cre_flags;              /* compile flags: extended,icase etc */
+       Oid                     cre_collation;  /* collation to use */
        regex_t         cre_re;                 /* the compiled regular expression */
 } cached_re_str;
 
@@ -106,6 +107,7 @@ static cached_re_str re_array[MAX_CACHED_RES];      /* cached re's */
 /* Local functions */
 static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
                                         text *flags,
+                                        Oid collation,
                                         bool force_glob,
                                         bool use_subpatterns,
                                         bool ignore_degenerate);
@@ -121,12 +123,13 @@ static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
  *
  *     text_re --- the pattern, expressed as a TEXT object
  *     cflags --- compile options for the pattern
+ *     collation --- collation to use for LC_CTYPE-dependent behavior
  *
  * Pattern is given in the database encoding.  We internally convert to
  * an array of pg_wchar, which is what Spencer's regex package wants.
  */
 static regex_t *
-RE_compile_and_cache(text *text_re, int cflags)
+RE_compile_and_cache(text *text_re, int cflags, Oid collation)
 {
        int                     text_re_len = VARSIZE_ANY_EXHDR(text_re);
        char       *text_re_val = VARDATA_ANY(text_re);
@@ -146,6 +149,7 @@ RE_compile_and_cache(text *text_re, int cflags)
        {
                if (re_array[i].cre_pat_len == text_re_len &&
                        re_array[i].cre_flags == cflags &&
+                       re_array[i].cre_collation == collation &&
                        memcmp(re_array[i].cre_pat, text_re_val, text_re_len) == 0)
                {
                        /*
@@ -176,7 +180,8 @@ RE_compile_and_cache(text *text_re, int cflags)
        regcomp_result = pg_regcomp(&re_temp.cre_re,
                                                                pattern,
                                                                pattern_len,
-                                                               cflags);
+                                                               cflags,
+                                                               collation);
 
        pfree(pattern);
 
@@ -207,6 +212,7 @@ RE_compile_and_cache(text *text_re, int cflags)
        memcpy(re_temp.cre_pat, text_re_val, text_re_len);
        re_temp.cre_pat_len = text_re_len;
        re_temp.cre_flags = cflags;
+       re_temp.cre_collation = collation;
 
        /*
         * Okay, we have a valid new item in re_temp; insert it into the storage
@@ -313,6 +319,7 @@ RE_execute(regex_t *re, char *dat, int dat_len,
  *     dat --- the data to match against (need not be null-terminated)
  *     dat_len --- the length of the data string
  *     cflags --- compile options for the pattern
+ *     collation --- collation to use for LC_CTYPE-dependent behavior
  *     nmatch, pmatch  --- optional return area for match details
  *
  * Both pattern and data are given in the database encoding.  We internally
@@ -320,12 +327,13 @@ RE_execute(regex_t *re, char *dat, int dat_len,
  */
 static bool
 RE_compile_and_execute(text *text_re, char *dat, int dat_len,
-                                          int cflags, int nmatch, regmatch_t *pmatch)
+                                          int cflags, Oid collation,
+                                          int nmatch, regmatch_t *pmatch)
 {
        regex_t    *re;
 
        /* Compile RE */
-       re = RE_compile_and_cache(text_re, cflags);
+       re = RE_compile_and_cache(text_re, cflags, collation);
 
        return RE_execute(re, dat, dat_len, nmatch, pmatch);
 }
@@ -424,6 +432,7 @@ nameregexeq(PG_FUNCTION_ARGS)
                                                                                  NameStr(*n),
                                                                                  strlen(NameStr(*n)),
                                                                                  REG_ADVANCED,
+                                                                                 PG_GET_COLLATION(),
                                                                                  0, NULL));
 }
 
@@ -437,6 +446,7 @@ nameregexne(PG_FUNCTION_ARGS)
                                                                                   NameStr(*n),
                                                                                   strlen(NameStr(*n)),
                                                                                   REG_ADVANCED,
+                                                                                  PG_GET_COLLATION(),
                                                                                   0, NULL));
 }
 
@@ -450,6 +460,7 @@ textregexeq(PG_FUNCTION_ARGS)
                                                                                  VARDATA_ANY(s),
                                                                                  VARSIZE_ANY_EXHDR(s),
                                                                                  REG_ADVANCED,
+                                                                                 PG_GET_COLLATION(),
                                                                                  0, NULL));
 }
 
@@ -463,6 +474,7 @@ textregexne(PG_FUNCTION_ARGS)
                                                                                   VARDATA_ANY(s),
                                                                                   VARSIZE_ANY_EXHDR(s),
                                                                                   REG_ADVANCED,
+                                                                                  PG_GET_COLLATION(),
                                                                                   0, NULL));
 }
 
@@ -483,6 +495,7 @@ nameicregexeq(PG_FUNCTION_ARGS)
                                                                                  NameStr(*n),
                                                                                  strlen(NameStr(*n)),
                                                                                  REG_ADVANCED | REG_ICASE,
+                                                                                 PG_GET_COLLATION(),
                                                                                  0, NULL));
 }
 
@@ -496,6 +509,7 @@ nameicregexne(PG_FUNCTION_ARGS)
                                                                                   NameStr(*n),
                                                                                   strlen(NameStr(*n)),
                                                                                   REG_ADVANCED | REG_ICASE,
+                                                                                  PG_GET_COLLATION(),
                                                                                   0, NULL));
 }
 
@@ -509,6 +523,7 @@ texticregexeq(PG_FUNCTION_ARGS)
                                                                                  VARDATA_ANY(s),
                                                                                  VARSIZE_ANY_EXHDR(s),
                                                                                  REG_ADVANCED | REG_ICASE,
+                                                                                 PG_GET_COLLATION(),
                                                                                  0, NULL));
 }
 
@@ -522,6 +537,7 @@ texticregexne(PG_FUNCTION_ARGS)
                                                                                   VARDATA_ANY(s),
                                                                                   VARSIZE_ANY_EXHDR(s),
                                                                                   REG_ADVANCED | REG_ICASE,
+                                                                                  PG_GET_COLLATION(),
                                                                                   0, NULL));
 }
 
@@ -541,7 +557,7 @@ textregexsubstr(PG_FUNCTION_ARGS)
                                eo;
 
        /* Compile RE */
-       re = RE_compile_and_cache(p, REG_ADVANCED);
+       re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
 
        /*
         * We pass two regmatch_t structs to get info about the overall match and
@@ -597,7 +613,7 @@ textregexreplace_noopt(PG_FUNCTION_ARGS)
        text       *r = PG_GETARG_TEXT_PP(2);
        regex_t    *re;
 
-       re = RE_compile_and_cache(p, REG_ADVANCED);
+       re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
 
        PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, false));
 }
@@ -618,7 +634,7 @@ textregexreplace(PG_FUNCTION_ARGS)
 
        parse_re_flags(&flags, opt);
 
-       re = RE_compile_and_cache(p, flags.cflags);
+       re = RE_compile_and_cache(p, flags.cflags, PG_GET_COLLATION());
 
        PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, flags.glob));
 }
@@ -781,7 +797,9 @@ regexp_matches(PG_FUNCTION_ARGS)
 
                /* be sure to copy the input string into the multi-call ctx */
                matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
-                                                                               flags, false, true, false);
+                                                                               flags,
+                                                                               PG_GET_COLLATION(),
+                                                                               false, true, false);
 
                /* Pre-create workspace that build_regexp_matches_result needs */
                matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
@@ -830,6 +848,7 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)
  */
 static regexp_matches_ctx *
 setup_regexp_matches(text *orig_str, text *pattern, text *flags,
+                                        Oid collation,
                                         bool force_glob, bool use_subpatterns,
                                         bool ignore_degenerate)
 {
@@ -868,7 +887,7 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
        }
 
        /* set up the compiled pattern */
-       cpattern = RE_compile_and_cache(pattern, re_flags.cflags);
+       cpattern = RE_compile_and_cache(pattern, re_flags.cflags, collation);
 
        /* do we want to remember subpatterns? */
        if (use_subpatterns && cpattern->re_nsub > 0)
@@ -1039,7 +1058,9 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
 
                /* be sure to copy the input string into the multi-call ctx */
                splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
-                                                                               flags, true, false, true);
+                                                                               flags,
+                                                                               PG_GET_COLLATION(),
+                                                                               true, false, true);
 
                MemoryContextSwitchTo(oldcontext);
                funcctx->user_fctx = (void *) splitctx;
@@ -1083,6 +1104,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
        splitctx = setup_regexp_matches(PG_GETARG_TEXT_PP(0),
                                                                        PG_GETARG_TEXT_PP(1),
                                                                        PG_GETARG_TEXT_PP_IF_EXISTS(2),
+                                                                       PG_GET_COLLATION(),
                                                                        true, false, true);
 
        while (splitctx->next_match <= splitctx->nmatches)
index cab439df78edbad88fa196e07e0e7f3712fa882e..cec4b837cd15665da8e800ac1729245c21f1b22d 100644 (file)
@@ -73,6 +73,7 @@ typedef struct
 #define  REG_USHORTEST          020000
        int                     re_csize;               /* sizeof(character) */
        char       *re_endp;            /* backward compatibility kludge */
+       Oid                     re_collation;   /* Collation that defines LC_CTYPE behavior */
        /* the rest is opaque pointers to hidden innards */
        char       *re_guts;            /* `char *' is more portable than `void *' */
        char       *re_fns;
@@ -161,9 +162,10 @@ typedef struct
 /*
  * the prototypes for exported functions
  */
-extern int     pg_regcomp(regex_t *, const pg_wchar *, size_t, int);
+extern int     pg_regcomp(regex_t *, const pg_wchar *, size_t, int, Oid);
 extern int     pg_regexec(regex_t *, const pg_wchar *, size_t, size_t, rm_detail_t *, size_t, regmatch_t[], int);
 extern void pg_regfree(regex_t *);
 extern size_t pg_regerror(int, const regex_t *, char *, size_t);
+extern void pg_set_regex_collation(Oid collation);
 
 #endif   /* _REGEX_H_ */
index 25c543c2bb8c8c008fdec1630f95785c34f5e6a0..f0008ddf14b242859da9461390310f0556ed9123 100644 (file)
@@ -319,6 +319,80 @@ SELECT relname FROM pg_class WHERE relname ILIKE 'abc%';
 ---------
 (0 rows)
 
+-- regular expressions
+SELECT * FROM collate_test1 WHERE b ~ '^abc$';
+ a |  b  
+---+-----
+ 1 | abc
+(1 row)
+
+SELECT * FROM collate_test1 WHERE b ~ '^abc';
+ a |  b  
+---+-----
+ 1 | abc
+(1 row)
+
+SELECT * FROM collate_test1 WHERE b ~ 'bc';
+ a |  b  
+---+-----
+ 1 | abc
+ 2 | äbc
+ 3 | bbc
+(3 rows)
+
+SELECT * FROM collate_test1 WHERE b ~* '^abc$';
+ a |  b  
+---+-----
+ 1 | abc
+ 4 | ABC
+(2 rows)
+
+SELECT * FROM collate_test1 WHERE b ~* '^abc';
+ a |  b  
+---+-----
+ 1 | abc
+ 4 | ABC
+(2 rows)
+
+SELECT * FROM collate_test1 WHERE b ~* 'bc';
+ a |  b  
+---+-----
+ 1 | abc
+ 2 | äbc
+ 3 | bbc
+ 4 | ABC
+(4 rows)
+
+SELECT 'Türkiye' COLLATE "en_US" ~* 'KI' AS "true";
+ true 
+------
+ t
+(1 row)
+
+SELECT 'Türkiye' COLLATE "tr_TR" ~* 'KI' AS "false";
+ false 
+-------
+ f
+(1 row)
+
+SELECT 'bıt' ~* 'BIT' COLLATE "en_US" AS "false";
+ false 
+-------
+ f
+(1 row)
+
+SELECT 'bıt' ~* 'BIT' COLLATE "tr_TR" AS "true";
+ true 
+------
+ t
+(1 row)
+
+-- The following actually exercises the selectivity estimation for ~*.
+SELECT relname FROM pg_class WHERE relname ~* '^abc';
+ relname 
+---------
+(0 rows)
+
 -- to_char
 SET lc_time TO 'tr_TR';
 SELECT to_char(date '2010-04-01', 'DD TMMON YYYY');
index b6d9368a063994fcd38d019af28f244fe6935f2f..51d65cf0da8c4538864288d9152ad40777acac1d 100644 (file)
@@ -124,6 +124,24 @@ SELECT 'bıt' ILIKE 'BIT' COLLATE "tr_TR" AS "true";
 -- The following actually exercises the selectivity estimation for ILIKE.
 SELECT relname FROM pg_class WHERE relname ILIKE 'abc%';
 
+-- regular expressions
+
+SELECT * FROM collate_test1 WHERE b ~ '^abc$';
+SELECT * FROM collate_test1 WHERE b ~ '^abc';
+SELECT * FROM collate_test1 WHERE b ~ 'bc';
+SELECT * FROM collate_test1 WHERE b ~* '^abc$';
+SELECT * FROM collate_test1 WHERE b ~* '^abc';
+SELECT * FROM collate_test1 WHERE b ~* 'bc';
+
+SELECT 'Türkiye' COLLATE "en_US" ~* 'KI' AS "true";
+SELECT 'Türkiye' COLLATE "tr_TR" ~* 'KI' AS "false";
+
+SELECT 'bıt' ~* 'BIT' COLLATE "en_US" AS "false";
+SELECT 'bıt' ~* 'BIT' COLLATE "tr_TR" AS "true";
+
+-- The following actually exercises the selectivity estimation for ~*.
+SELECT relname FROM pg_class WHERE relname ~* '^abc';
+
 
 -- to_char