From: Tom Lane Date: Sun, 22 Sep 2002 17:27:25 +0000 (+0000) Subject: Bring SIMILAR TO and SUBSTRING into some semblance of conformance with X-Git-Tag: REL7_3~429 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9946b83dedb629b9eff1c063b9fbcaab25c209df;p=postgresql Bring SIMILAR TO and SUBSTRING into some semblance of conformance with the SQL99 standard. (I'm not sure that the character-class features are quite right, but that can be fixed later.) Document SQL99 and POSIX regexps as being different features; provide variants of SUBSTRING for each. --- diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 544d070a85..7b12f80fb7 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -1,5 +1,5 @@ @@ -921,18 +921,31 @@ PostgreSQL documentation - substring(string from pattern for escape) + substring(string from pattern) text - extract regular expression + extract substring matching POSIX regular expression substring - substring('Thomas' from 'mas$' for escape '\\') + substring('Thomas' from '...$') mas + + substring(string from pattern for escape) + text + + extract substring matching SQL99 regular expression + + substring + + + substring('Thomas' from '%#"o_a#"_' for '#') + oma + + trim(leading | trailing | both @@ -960,6 +973,328 @@ PostgreSQL documentation + + Additional string manipulation functions are available and are + listed below. Some of them are used internally to implement the + SQL-standard string functions listed above. + + + + Other String Functions + + + + Function + Return Type + Description + Example + Result + + + + + + ascii(text) + integer + Returns the ASCII code of the first character of the argument. + ascii('x') + 120 + + + + btrim(string text, trim text) + text + + Remove (trim) the longest string consisting only of characters + in trim from the start and end of + string. + + btrim('xyxtrimyyx','xy') + trim + + + + chr(integer) + text + Returns the character with the given ASCII code. + chr(65) + A + + + + + convert(string + text, + src_encoding name, + dest_encoding name) + + text + + Converts string using dest_encoding. + The original encoding is specified by + src_encoding. If + src_encoding is omitted, database + encoding is assumed. + + convert('text_in_unicode', 'UNICODE', 'LATIN1') + text_in_unicode represented in ISO 8859-1 + + + + + decode(string text, + type text) + + bytea + + Decodes binary data from string previously + encoded with encode(). Parameter type is same as in encode(). + + decode('MTIzAAE=', 'base64') + 123\000\001 + + + + + encode(data bytea, + type text) + + text + + Encodes binary data to ASCII-only representation. Supported + types are: 'base64', 'hex', 'escape'. + + encode('123\\000\\001', 'base64') + MTIzAAE= + + + + initcap(text) + text + Converts first letter of each word (whitespace separated) to upper case. + initcap('hi thomas') + Hi Thomas + + + + length(string) + integer + + length of string + + character strings + length + + + length + character strings + character strings, length + + + length('jose') + 4 + + + + + lpad(string text, + length integer + , fill text) + + text + + Fills up the string to length + length by prepending the characters + fill (a space by default). If the + string is already longer than + length then it is truncated (on the + right). + + lpad('hi', 5, 'xy') + xyxhi + + + + ltrim(string text, text text) + text + + Removes the longest string containing only characters from + trim from the start of the string. + + ltrim('zzzytrim','xyz') + trim + + + + pg_client_encoding() + name + + Returns current client encoding name. + + pg_client_encoding() + SQL_ASCII + + + + quote_ident(string text) + text + + Returns the given string suitably quoted to be used as an identifier + in an SQL query string. + Quotes are added only if necessary (i.e., if the string contains + non-identifier characters or would be case-folded). + Embedded quotes are properly doubled. + + quote_ident('Foo') + "Foo" + + + + quote_literal(string text) + text + + Returns the given string suitably quoted to be used as a literal + in an SQL query string. + Embedded quotes and backslashes are properly doubled. + + quote_literal('O\'Reilly') + 'O''Reilly' + + + + repeat(text, integer) + text + Repeat text a number of times. + repeat('Pg', 4) + PgPgPgPg + + + + replace(string text, + from text, + to text) + text + Replace all occurrences in string of substring + from with substring to + + replace('abcdefabcdef', 'cd', 'XX') + abXXefabXXef + + + + + rpad(string text, + length integer + , fill text) + + text + + Fills up the string to length + length by appending the characters + fill (a space by default). If the + string is already longer than + length then it is truncated. + + rpad('hi', 5, 'xy') + hixyx + + + + rtrim(string + text, trim text) + text + + Removes the longest string containing only characters from + trim from the end of the string. + + rtrim('trimxxxx','x') + trim + + + + split_part(string text, + delimiter text, + column integer) + text + Split string on delimiter + returning the resulting (one based) column number. + + split_part('abc~@~def~@~ghi','~@~',2) + def + + + + strpos(string, substring) + text + + Locates specified substring. (same as + position(substring in + string), but note the reversed + argument order) + + strpos('high','ig') + 2 + + + + substr(string, from , count) + text + + Extracts specified substring. (same as + substring(string from from for count)) + + substr('alphabet', 3, 2) + ph + + + + to_ascii(text + , encoding) + text + Converts text from multibyte encoding to ASCII. + to_ascii('Karel') + Karel + + + + to_hex(number integer + or bigint) + text + Convert number to its equivalent hexadecimal + representation. + + to_hex(9223372036854775807::bigint) + 7fffffffffffffff + + + + + translate(string + text, + from text, + to text) + + text + + Any character in string that matches a + character in the from set is replaced by + the corresponding character in the to + set. + + translate('12345', '14', 'ax') + a23x5 + + + + +
+ + + The to_ascii function supports conversion from + LATIN1, LATIN2, WIN1250 (CP1250) only. + + + Available conversion names @@ -1579,407 +1914,87 @@ PostgreSQL documentation - win1250_to_utf_8 - WIN1250 - UNICODE - - - - win1251_to_iso_8859_5 - WIN - ISO_8859_5 - - - - win1251_to_koi8r - WIN - KOI8 - - - - win1251_to_mic - WIN - MULE_INTERNAL - - - - win1251_to_utf_8 - WIN - UNICODE - - - - win1251_to_win866 - WIN - ALT - - - - win1256_to_utf_8 - WIN1256 - UNICODE - - - - win866_to_iso_8859_5 - ALT - ISO_8859_5 - - - - win866_to_koi8r - ALT - KOI8 - - - - win866_to_mic - ALT - MULE_INTERNAL - - - - win866_to_utf_8 - ALT - UNICODE - - - - win866_to_win1251 - ALT - WIN - - - - win874_to_utf_8 - WIN874 - UNICODE - - - - -
- - - Additional string manipulation functions are available and are - listed below. Some of them are used internally to implement the - SQL-standard string functions listed above. - - - - Other String Functions - - - - Function - Return Type - Description - Example - Result - - - - - - ascii(text) - integer - Returns the ASCII code of the first character of the argument. - ascii('x') - 120 - - - - btrim(string text, trim text) - text - - Remove (trim) the longest string consisting only of characters - in trim from the start and end of - string. - - btrim('xyxtrimyyx','xy') - trim - - - - chr(integer) - text - Returns the character with the given ASCII code. - chr(65) - A - - - - - convert(string - text, - src_encoding name, - dest_encoding name) - - text - - Converts string using dest_encoding. - The original encoding is specified by - src_encoding. If - src_encoding is omitted, database - encoding is assumed. - - convert('text_in_unicode', 'UNICODE', 'LATIN1') - text_in_unicode represented in ISO 8859-1 - - - - - decode(string text, - type text) - - bytea - - Decodes binary data from string previously - encoded with encode(). Parameter type is same as in encode(). - - decode('MTIzAAE=', 'base64') - 123\000\001 - - - - - encode(data bytea, - type text) - - text - - Encodes binary data to ASCII-only representation. Supported - types are: 'base64', 'hex', 'escape'. - - encode('123\\000\\001', 'base64') - MTIzAAE= - - - - initcap(text) - text - Converts first letter of each word (whitespace separated) to upper case. - initcap('hi thomas') - Hi Thomas - - - - length(string) - integer - - length of string - - character strings - length - - - length - character strings - character strings, length - - - length('jose') - 4 - - - - - lpad(string text, - length integer - , fill text) - - text - - Fills up the string to length - length by prepending the characters - fill (a space by default). If the - string is already longer than - length then it is truncated (on the - right). - - lpad('hi', 5, 'xy') - xyxhi - - - - ltrim(string text, text text) - text - - Removes the longest string containing only characters from - trim from the start of the string. - - ltrim('zzzytrim','xyz') - trim + win1250_to_utf_8 + WIN1250 + UNICODE - pg_client_encoding() - name - - Returns current client encoding name. - - pg_client_encoding() - SQL_ASCII + win1251_to_iso_8859_5 + WIN + ISO_8859_5 - quote_ident(string text) - text - - Returns the given string suitably quoted to be used as an identifier - in an SQL query string. - Quotes are added only if necessary (i.e., if the string contains - non-identifier characters or would be case-folded). - Embedded quotes are properly doubled. - - quote_ident('Foo') - "Foo" + win1251_to_koi8r + WIN + KOI8 - quote_literal(string text) - text - - Returns the given string suitably quoted to be used as a literal - in an SQL query string. - Embedded quotes and backslashes are properly doubled. - - quote_literal('O\'Reilly') - 'O''Reilly' + win1251_to_mic + WIN + MULE_INTERNAL - repeat(text, integer) - text - Repeat text a number of times. - repeat('Pg', 4) - PgPgPgPg + win1251_to_utf_8 + WIN + UNICODE - replace(string text, - from text, - to text) - text - Replace all occurrences in string of substring - from with substring to - - replace('abcdefabcdef', 'cd', 'XX') - abXXefabXXef + win1251_to_win866 + WIN + ALT - - rpad(string text, - length integer - , fill text) - - text - - Fills up the string to length - length by appending the characters - fill (a space by default). If the - string is already longer than - length then it is truncated. - - rpad('hi', 5, 'xy') - hixyx + win1256_to_utf_8 + WIN1256 + UNICODE - rtrim(string - text, trim text) - text - - Removes the longest string containing only characters from - trim from the end of the string. - - rtrim('trimxxxx','x') - trim + win866_to_iso_8859_5 + ALT + ISO_8859_5 - split_part(string text, - delimiter text, - column integer) - text - Split string on delimiter - returning the resulting (one based) column number. - - split_part('abc~@~def~@~ghi','~@~',2) - def + win866_to_koi8r + ALT + KOI8 - strpos(string, substring) - text - - Locates specified substring. (same as - position(substring in - string), but note the reversed - argument order) - - strpos('high','ig') - 2 + win866_to_mic + ALT + MULE_INTERNAL - substr(string, from , count) - text - - Extracts specified substring. (same as - substring(string from from for count)) - - substr('alphabet', 3, 2) - ph + win866_to_utf_8 + ALT + UNICODE - to_ascii(text - , encoding) - text - Converts text from multibyte encoding to ASCII. - to_ascii('Karel') - Karel + win866_to_win1251 + ALT + WIN - to_hex(number integer - or bigint) - text - Convert number to its equivalent hexadecimal - representation. - - to_hex(9223372036854775807::bigint) - 7fffffffffffffff + win874_to_utf_8 + WIN874 + UNICODE - - - translate(string - text, - from text, - to text) - - text - - Any character in string that matches a - character in the from set is replaced by - the corresponding character in the to - set. - - translate('12345', '14', 'ax') - a23x5 - -
- - The to_ascii function supports conversion from - LATIN1, LATIN2, WIN1250 (CP1250) only. - @@ -2171,16 +2186,16 @@ PostgreSQL documentation Pattern Matching - There are two separate approaches to pattern matching provided by + There are three separate approaches to pattern matching provided by PostgreSQL: the traditional SQL - LIKE operator and the more recent + LIKE operator, the more recent SQL99 - SIMILAR TO operator implementing + SIMILAR TO operator, and POSIX-style regular expressions. Additionally, a pattern matching function, - SUBSTRING, is available, as defined in - SQL99. + SUBSTRING, is available, using either + SQL99-style or POSIX-style regular expressions. @@ -2190,13 +2205,6 @@ PostgreSQL documentation - - Both LIKE and SIMILAR TO - are SQL-standard operators which are also available in alternate - forms as PostgreSQL operators; look at - ~ and ~~ for examples. - - <function>LIKE</function> @@ -2296,11 +2304,142 @@ PostgreSQL documentation - - <function>SIMILAR TO</function> and <acronym>POSIX</acronym> + <sect2 id="functions-sql99-regexp"> + <title><function>SIMILAR TO</function> and <acronym>SQL99</acronym> Regular Expressions - + + regular expressions + pattern matching + + + + similar to + + + + substring + + + +string SIMILAR TO pattern ESCAPE escape-character +string NOT SIMILAR TO pattern ESCAPE escape-character + + + + The SIMILAR TO operator returns true or false + depending on whether its pattern matches the given string. It is + much like LIKE, except that it interprets the + pattern using SQL99's definition of a regular + expression. + SQL99's regular expressions are a curious cross + between LIKE notation and common regular expression + notation. + + + + Like LIKE, the SIMILAR TO + operator succeeds only if its pattern matches the entire string; + this is unlike common regular expression practice, wherein the pattern + may match any part of the string. + Also like + LIKE, SIMILAR TO uses + % and _ as wildcard characters denoting + any string and any single character, respectively (these are + comparable to .* and . in POSIX regular + expressions). + + + + In addition to these facilities borrowed from LIKE, + SIMILAR TO supports these pattern-matching + metacharacters borrowed from POSIX regular expressions: + + + + + | denotes alternation (either of two alternatives). + + + + + * denotes repetition of the previous item zero + or more times. + + + + + + denotes repetition of the previous item one + or more times. + + + + + Parentheses () may be used to group items into + a single logical item. + + + + + A bracket expression [...] specifies a character + class, just as in POSIX regular expressions. + + + + + Notice that bounded repetition (? and {...}) + are not provided, though they exist in POSIX. Also, dot (.) + is not a metacharacter. + + + + As with LIKE, a backslash disables the special meaning + of any of these metacharacters; or a different escape character can + be specified with ESCAPE. + + + + + Some examples: + +'abc' SIMILAR TO 'abc' true +'abc' SIMILAR TO 'a' false +'abc' SIMILAR TO '%(b|d)%' true +'abc' SIMILAR TO '(b|c)%' false + + + + + + The SUBSTRING function with three parameters, + SUBSTRING(string FROM + pattern FOR + escape), provides extraction of a substring + that matches a SQL99 regular expression pattern. As with SIMILAR TO, + the specified pattern must match to the entire data string, else the + function fails and returns NULL. To indicate the part of the pattern + that should be returned on success, SQL99 specifies that the pattern + must contain two occurrences of the escape character followed by + double quote ("). The text matching the portion of the + pattern between these markers is returned. + + + + + Some examples: + +SUBSTRING('foobar' FROM '%#"o_b#"%' FOR '#') oob +SUBSTRING('foobar' FROM '#"o_b#"%' FOR '#') NULL + + + + + + + + <acronym>POSIX</acronym> Regular Expressions + + regular expressions pattern matching @@ -2341,12 +2480,6 @@ PostgreSQL documentation Does not match regular expression, case insensitive 'thomas' !~* '.*vadim.*'
- - - SIMILAR TO - Matches regular expression, case sensitive - 'thomas' SIMILAR TO '.*thomas.*' - @@ -2354,7 +2487,8 @@ PostgreSQL documentation POSIX regular expressions provide a more powerful means for - pattern matching than the LIKE function. + pattern matching than the LIKE and + SIMILAR TO operators. Many Unix tools such as egrep, sed, or awk use a pattern matching language that is similar to the one described here. @@ -2379,10 +2513,34 @@ PostgreSQL documentation Some examples: -'abc' SIMILAR TO 'abc' true -'abc' SIMILAR TO '^a' true -'abc' SIMILAR TO '(b|d)' true -'abc' SIMILAR TO '^(b|c)' false +'abc' ~ 'abc' true +'abc' ~ '^a' true +'abc' ~ '(b|d)' true +'abc' ~ '^(b|c)' false + + + + + + The SUBSTRING function with two parameters, + SUBSTRING(string FROM + pattern), provides extraction of a substring + that matches a POSIX regular expression pattern. It returns NULL if + there is no match, otherwise the portion of the text that matched the + pattern. But if the pattern contains any parentheses, the portion + of the text that matched the first parenthesized subexpression (the + one whose left parenthesis comes first) is + returned. You can always put parentheses around the whole expression + if you want to use parentheses within it without triggering this + exception. + + + + + Some examples: + +SUBSTRING('foobar' FROM 'o.b') oob +SUBSTRING('foobar' FROM 'o(.)b') o diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 43597306d4..be45d7bde1 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -11,7 +11,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/parser/gram.y,v 2.367 2002/09/18 21:35:21 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/parser/gram.y,v 2.368 2002/09/22 17:27:23 tgl Exp $ * * HISTORY * AUTHOR DATE MAJOR EVENT @@ -5644,22 +5644,40 @@ a_expr: c_expr { $$ = $1; } } | a_expr SIMILAR TO a_expr %prec SIMILAR - { $$ = (Node *) makeSimpleA_Expr(OP, "~", $1, $4); } + { + A_Const *c = makeNode(A_Const); + FuncCall *n = makeNode(FuncCall); + c->val.type = T_Null; + n->funcname = SystemFuncName("similar_escape"); + n->args = makeList2($4, (Node *) c); + n->agg_star = FALSE; + n->agg_distinct = FALSE; + $$ = (Node *) makeSimpleA_Expr(OP, "~", $1, (Node *) n); + } | a_expr SIMILAR TO a_expr ESCAPE a_expr { FuncCall *n = makeNode(FuncCall); - n->funcname = SystemFuncName("like_escape"); + n->funcname = SystemFuncName("similar_escape"); n->args = makeList2($4, $6); n->agg_star = FALSE; n->agg_distinct = FALSE; $$ = (Node *) makeSimpleA_Expr(OP, "~", $1, (Node *) n); } | a_expr NOT SIMILAR TO a_expr %prec SIMILAR - { $$ = (Node *) makeSimpleA_Expr(OP, "!~", $1, $5); } + { + A_Const *c = makeNode(A_Const); + FuncCall *n = makeNode(FuncCall); + c->val.type = T_Null; + n->funcname = SystemFuncName("similar_escape"); + n->args = makeList2($5, (Node *) c); + n->agg_star = FALSE; + n->agg_distinct = FALSE; + $$ = (Node *) makeSimpleA_Expr(OP, "!~", $1, (Node *) n); + } | a_expr NOT SIMILAR TO a_expr ESCAPE a_expr { FuncCall *n = makeNode(FuncCall); - n->funcname = SystemFuncName("like_escape"); + n->funcname = SystemFuncName("similar_escape"); n->args = makeList2($5, $7); n->agg_star = FALSE; n->agg_distinct = FALSE; diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index b64d6ede65..ebbca8f040 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v 1.42 2002/09/04 20:31:28 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v 1.43 2002/09/22 17:27:23 tgl Exp $ * * Alistair Crooks added the code for the regex caching * agc - cached the regular expressions used - there's a good chance @@ -317,8 +317,7 @@ textregexsubstr(PG_FUNCTION_ARGS) char *sterm; int len; bool match; - int nmatch = 1; - regmatch_t pmatch; + regmatch_t pmatch[2]; /* be sure sterm is null-terminated */ len = VARSIZE(s) - VARHDRSZ; @@ -327,21 +326,131 @@ textregexsubstr(PG_FUNCTION_ARGS) sterm[len] = '\0'; /* - * We need the match info back from the pattern match to be able to - * actually extract the substring. It seems to be adequate to pass in - * a structure to return only one result. + * We pass two regmatch_t structs to get info about the overall match + * and the match for the first parenthesized subexpression (if any). + * If there is a parenthesized subexpression, we return what it matched; + * else return what the whole regexp matched. */ - match = RE_compile_and_execute(p, sterm, REG_EXTENDED, nmatch, &pmatch); + match = RE_compile_and_execute(p, sterm, REG_EXTENDED, 2, pmatch); + pfree(sterm); /* match? then return the substring matching the pattern */ if (match) { + int so, + eo; + + so = pmatch[1].rm_so; + eo = pmatch[1].rm_eo; + if (so < 0 || eo < 0) + { + /* no parenthesized subexpression */ + so = pmatch[0].rm_so; + eo = pmatch[0].rm_eo; + } + return (DirectFunctionCall3(text_substr, PointerGetDatum(s), - Int32GetDatum(pmatch.rm_so + 1), - Int32GetDatum(pmatch.rm_eo - pmatch.rm_so))); + Int32GetDatum(so + 1), + Int32GetDatum(eo - so))); } PG_RETURN_NULL(); } + +/* similar_escape() + * Convert a SQL99 regexp pattern to POSIX style, so it can be used by + * our regexp engine. + */ +Datum +similar_escape(PG_FUNCTION_ARGS) +{ + text *pat_text; + text *esc_text; + text *result; + unsigned char *p, + *e, + *r; + int plen, + elen; + bool afterescape = false; + int nquotes = 0; + + /* This function is not strict, so must test explicitly */ + if (PG_ARGISNULL(0)) + PG_RETURN_NULL(); + pat_text = PG_GETARG_TEXT_P(0); + p = VARDATA(pat_text); + plen = (VARSIZE(pat_text) - VARHDRSZ); + if (PG_ARGISNULL(1)) + { + /* No ESCAPE clause provided; default to backslash as escape */ + e = "\\"; + elen = 1; + } + else + { + esc_text = PG_GETARG_TEXT_P(1); + e = VARDATA(esc_text); + elen = (VARSIZE(esc_text) - VARHDRSZ); + if (elen == 0) + e = NULL; /* no escape character */ + else if (elen != 1) + elog(ERROR, "ESCAPE string must be empty or one character"); + } + + /* We need room for ^, $, and up to 2 output bytes per input byte */ + result = (text *) palloc(VARHDRSZ + 2 + 2 * plen); + r = VARDATA(result); + + *r++ = '^'; + + while (plen > 0) + { + unsigned char pchar = *p; + + if (afterescape) + { + if (pchar == '"') /* for SUBSTRING patterns */ + *r++ = ((nquotes++ % 2) == 0) ? '(' : ')'; + else + { + *r++ = '\\'; + *r++ = pchar; + } + afterescape = false; + } + else if (e && pchar == *e) + { + /* SQL99 escape character; do not send to output */ + afterescape = true; + } + else if (pchar == '%') + { + *r++ = '.'; + *r++ = '*'; + } + else if (pchar == '_') + { + *r++ = '.'; + } + else if (pchar == '\\' || pchar == '.' || pchar == '?' || + pchar == '{') + { + *r++ = '\\'; + *r++ = pchar; + } + else + { + *r++ = pchar; + } + p++, plen--; + } + + *r++ = '$'; + + VARATT_SIZEP(result) = r - ((unsigned char *) result); + + PG_RETURN_TEXT_P(result); +} diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index eb44f283b9..369da463e8 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: pg_proc.h,v 1.272 2002/09/18 21:35:23 tgl Exp $ + * $Id: pg_proc.h,v 1.273 2002/09/22 17:27:23 tgl Exp $ * * NOTES * The script catalog/genbki.sh reads this file and generates .bki @@ -2076,6 +2076,9 @@ DESCR("convert int4 to char"); DATA(insert OID = 1622 ( repeat PGNSP PGUID 12 f f t f i 2 25 "25 23" repeat - _null_ )); DESCR("replicate string int4 times"); +DATA(insert OID = 1623 ( similar_escape PGNSP PGUID 12 f f f f i 2 25 "25 25" similar_escape - _null_ )); +DESCR("convert SQL99 regexp pattern to POSIX style"); + DATA(insert OID = 1624 ( mul_d_interval PGNSP PGUID 12 f f t f i 2 1186 "701 1186" mul_d_interval - _null_ )); DATA(insert OID = 1633 ( texticlike PGNSP PGUID 12 f f t f i 2 16 "25 25" texticlike - _null_ )); @@ -2087,7 +2090,7 @@ DESCR("matches LIKE expression, case-insensitive"); DATA(insert OID = 1636 ( nameicnlike PGNSP PGUID 12 f f t f i 2 16 "19 25" nameicnlike - _null_ )); DESCR("does not match LIKE expression, case-insensitive"); DATA(insert OID = 1637 ( like_escape PGNSP PGUID 12 f f t f i 2 25 "25 25" like_escape - _null_ )); -DESCR("convert match pattern to use backslash escapes"); +DESCR("convert LIKE pattern to use backslash escapes"); DATA(insert OID = 1689 ( update_pg_pwd_and_pg_group PGNSP PGUID 12 f f t f v 0 2279 "" update_pg_pwd_and_pg_group - _null_ )); DESCR("update pg_pwd and pg_group files"); @@ -2784,7 +2787,7 @@ DESCR("matches LIKE expression"); DATA(insert OID = 2008 ( notlike PGNSP PGUID 12 f f t f i 2 16 "17 17" byteanlike - _null_ )); DESCR("does not match LIKE expression"); DATA(insert OID = 2009 ( like_escape PGNSP PGUID 12 f f t f i 2 17 "17 17" like_escape_bytea - _null_ )); -DESCR("convert match pattern to use backslash escapes"); +DESCR("convert LIKE pattern to use backslash escapes"); DATA(insert OID = 2010 ( length PGNSP PGUID 12 f f t f i 1 23 "17" byteaoctetlen - _null_ )); DESCR("octet length"); DATA(insert OID = 2011 ( byteacat PGNSP PGUID 12 f f t f i 2 17 "17 17" byteacat - _null_ )); @@ -2889,9 +2892,9 @@ DATA(insert OID = 2072 ( date_mi_interval PGNSP PGUID 14 f f t f i 2 1114 "1082 DESCR("subtract"); DATA(insert OID = 2073 ( substring PGNSP PGUID 12 f f t f i 2 25 "25 25" textregexsubstr - _null_ )); -DESCR("substitutes regular expression"); -DATA(insert OID = 2074 ( substring PGNSP PGUID 14 f f t f i 3 25 "25 25 25" "select substring($1, like_escape($2, $3))" - _null_ )); -DESCR("substitutes regular expression with escape argument"); +DESCR("extracts text matching regular expression"); +DATA(insert OID = 2074 ( substring PGNSP PGUID 14 f f t f i 3 25 "25 25 25" "select substring($1, similar_escape($2, $3))" - _null_ )); +DESCR("extracts text matching SQL99 regular expression"); DATA(insert OID = 2075 ( bit PGNSP PGUID 12 f f t f i 1 1560 "20" bitfromint8 - _null_ )); DESCR("int8 to bitstring"); diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index bfa5fa8efc..9f1c9ba8db 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: builtins.h,v 1.201 2002/09/19 22:48:34 tgl Exp $ + * $Id: builtins.h,v 1.202 2002/09/22 17:27:25 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -371,6 +371,7 @@ extern Datum nameicregexne(PG_FUNCTION_ARGS); extern Datum texticregexeq(PG_FUNCTION_ARGS); extern Datum texticregexne(PG_FUNCTION_ARGS); extern Datum textregexsubstr(PG_FUNCTION_ARGS); +extern Datum similar_escape(PG_FUNCTION_ARGS); /* regproc.c */ extern Datum regprocin(PG_FUNCTION_ARGS); diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index 576fafb772..a73ca1aa84 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -142,15 +142,15 @@ SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456"; t (1 row) --- T581 regular expression substring -SELECT SUBSTRING('abcdefg' FROM '(b|f).*(d)' FOR '#') AS "bcd"; +-- T581 regular expression substring (with SQL99's bizarre regexp syntax) +SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd"; bcd ----- bcd (1 row) -- No match should return NULL -SELECT SUBSTRING('abcdefg' FROM '(1|2|3)' FOR '#') IS NULL AS "True"; +SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True"; True ------ t @@ -175,8 +175,16 @@ SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True"; t (1 row) --- PostgreSQL extention to allow omitting the escape character -SELECT SUBSTRING('abcdefg' FROM '(c|d).e') AS "cde"; +-- PostgreSQL extension to allow omitting the escape character; +-- here the regexp is taken as Posix syntax +SELECT SUBSTRING('abcdefg' FROM 'c.e') AS "cde"; + cde +----- + cde +(1 row) + +-- With a parenthesized subexpression, return only what matches the subexpr +SELECT SUBSTRING('abcdefg' FROM 'b(.*)f') AS "cde"; cde ----- cde diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index e5c15bc528..c0a18959cd 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -62,19 +62,24 @@ SELECT SUBSTRING('1234567890' FROM 3) = '34567890' AS "34567890"; SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456"; --- T581 regular expression substring -SELECT SUBSTRING('abcdefg' FROM '(b|f).*(d)' FOR '#') AS "bcd"; +-- T581 regular expression substring (with SQL99's bizarre regexp syntax) +SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd"; -- No match should return NULL -SELECT SUBSTRING('abcdefg' FROM '(1|2|3)' FOR '#') IS NULL AS "True"; +SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True"; -- Null inputs should return NULL SELECT SUBSTRING('abcdefg' FROM '(b|c)' FOR NULL) IS NULL AS "True"; SELECT SUBSTRING(NULL FROM '(b|c)' FOR '#') IS NULL AS "True"; SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True"; --- PostgreSQL extention to allow omitting the escape character -SELECT SUBSTRING('abcdefg' FROM '(c|d).e') AS "cde"; +-- PostgreSQL extension to allow omitting the escape character; +-- here the regexp is taken as Posix syntax +SELECT SUBSTRING('abcdefg' FROM 'c.e') AS "cde"; + +-- With a parenthesized subexpression, return only what matches the subexpr +SELECT SUBSTRING('abcdefg' FROM 'b(.*)f') AS "cde"; + -- E021-11 position expression SELECT POSITION('4' IN '1234567890') = '4' AS "4";