From 5ada9ef088ae0151a2f6efe48203100ef5b51113 Mon Sep 17 00:00:00 2001
From: Tom Lane
Date: Wed, 25 Feb 2004 18:10:51 +0000
Subject: [PATCH] Teach plpgsql's lexer about dollar-quoted literals.

Andrew Dunstan, some help from Tom Lane.
---
Review note (text in this section is not part of the commit message):
  * plpgsql_get_string_value() returns malloc'd storage, and neither of its
    malloc() calls is checked for failure before the buffer is written.

 src/pl/plpgsql/src/gram.y    |   4 +-
 src/pl/plpgsql/src/pl_exec.c |  19 +-----
 src/pl/plpgsql/src/plpgsql.h |   3 +-
 src/pl/plpgsql/src/scan.l    | 122 ++++++++++++++++++++++++++++++++---
 4 files changed, 118 insertions(+), 30 deletions(-)

diff --git a/src/pl/plpgsql/src/gram.y b/src/pl/plpgsql/src/gram.y
index 45c50d088b..b6526ce47b 100644
--- a/src/pl/plpgsql/src/gram.y
+++ b/src/pl/plpgsql/src/gram.y
@@ -4,7 +4,7 @@
  *						  procedural language
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/pl/plpgsql/src/gram.y,v 1.50 2003/12/23 00:01:57 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/pl/plpgsql/src/gram.y,v 1.51 2004/02/25 18:10:51 tgl Exp $
  *
  *	  This software is copyrighted by Jan Wieck - Hamburg.
  *
@@ -1235,7 +1235,7 @@ stmt_raise : K_RAISE lno raise_level raise_msg raise_params ';'
 
 raise_msg		: T_STRING
 					{
-						$$ = strdup(yytext);
+						$$ = plpgsql_get_string_value();
 					}
 				;
 
diff --git a/src/pl/plpgsql/src/pl_exec.c b/src/pl/plpgsql/src/pl_exec.c
index 8c104f6363..0c409c0e64 100644
--- a/src/pl/plpgsql/src/pl_exec.c
+++ b/src/pl/plpgsql/src/pl_exec.c
@@ -3,7 +3,7 @@
  *			  procedural language
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/pl/plpgsql/src/pl_exec.c,v 1.96 2004/02/24 01:44:33 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/pl/plpgsql/src/pl_exec.c,v 1.97 2004/02/25 18:10:51 tgl Exp $
  *
  *	  This software is copyrighted by Jan Wieck - Hamburg.
  *
@@ -1805,7 +1805,7 @@ exec_stmt_raise(PLpgSQL_execstate * estate, PLpgSQL_stmt_raise * stmt)
 	for (cp = stmt->message; *cp; cp++)
 	{
 		/*
-		 * Occurences of a single % are replaced by the next argument's
+		 * Occurrences of a single % are replaced by the next argument's
 		 * external representation. Double %'s are converted to one %.
 		 */
 		if ((c[0] = *cp) == '%')
@@ -1834,21 +1834,6 @@ exec_stmt_raise(PLpgSQL_execstate * estate, PLpgSQL_stmt_raise * stmt)
 			continue;
 		}
 
-		/*
-		 * Occurrences of single ' are removed. double ' are reduced to
-		 * single ones. We must do this because the parameter stored by
-		 * the grammar is the raw T_STRING input literal, rather than the
-		 * de-lexed string as you might expect ...
-		 */
-		if (*cp == '\'')
-		{
-			cp++;
-			if (*cp == '\'')
-				plpgsql_dstring_append(&ds, c);
-			else
-				cp--;
-			continue;
-		}
 		plpgsql_dstring_append(&ds, c);
 	}
 
diff --git a/src/pl/plpgsql/src/plpgsql.h b/src/pl/plpgsql/src/plpgsql.h
index 5c48018c42..911e331adf 100644
--- a/src/pl/plpgsql/src/plpgsql.h
+++ b/src/pl/plpgsql/src/plpgsql.h
@@ -3,7 +3,7 @@
  *			  procedural language
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/pl/plpgsql/src/plpgsql.h,v 1.43 2003/11/29 19:52:12 pgsql Exp $
+ *	  $PostgreSQL: pgsql/src/pl/plpgsql/src/plpgsql.h,v 1.44 2004/02/25 18:10:51 tgl Exp $
  *
  *	  This software is copyrighted by Jan Wieck - Hamburg.
  *
@@ -694,5 +694,6 @@ extern void plpgsql_push_back_token(int token);
 extern int	plpgsql_scanner_lineno(void);
 extern void plpgsql_scanner_init(const char *str, int functype);
 extern void plpgsql_scanner_finish(void);
+extern char *plpgsql_get_string_value(void);
 
 #endif   /* PLPGSQL_H */
diff --git a/src/pl/plpgsql/src/scan.l b/src/pl/plpgsql/src/scan.l
index b891e2b9e1..de447e09f1 100644
--- a/src/pl/plpgsql/src/scan.l
+++ b/src/pl/plpgsql/src/scan.l
@@ -4,7 +4,7 @@
  *			  procedural language
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/pl/plpgsql/src/scan.l,v 1.31 2004/02/24 22:06:32 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/pl/plpgsql/src/scan.l,v 1.32 2004/02/25 18:10:51 tgl Exp $
  *
  *	  This software is copyrighted by Jan Wieck - Hamburg.
  *
@@ -57,6 +57,8 @@ static int lookahead_token;
 static bool have_lookahead_token;
 static const char *cur_line_start;
 static int	cur_line_num;
+static char	   *dolqstart;		/* current $foo$ quote start string */
+static int	dolqlen;			/* signal to plpgsql_get_string_value */
 
 int	plpgsql_SpaceScanned = 0;
 %}
@@ -70,7 +72,9 @@ int plpgsql_SpaceScanned = 0;
 
 %option case-insensitive
 
-%x	IN_STRING IN_COMMENT
+%x	IN_STRING
+%x	IN_COMMENT
+%x	IN_DOLLARQUOTE
 
 digit			[0-9]
 ident_start		[A-Za-z\200-\377_]
@@ -84,6 +88,14 @@ param \${digit}+
 
 space			[ \t\n\r\f]
 
+/* $foo$ style quotes ("dollar quoting")
+ * copied straight from the backend SQL parser
+ */
+dolq_start		[A-Za-z\200-\377_]
+dolq_cont		[A-Za-z\200-\377_0-9]
+dolqdelim		\$({dolq_start}{dolq_cont}*)?\$
+dolqinside		[^$]+
+
 %%
     /* ----------
      * Local variables in scanner to remember where
@@ -97,7 +109,7 @@ space [ \t\n\r\f]
      * Reset the state when entering the scanner
      * ----------
      */
-    BEGIN INITIAL;
+    BEGIN(INITIAL);
     plpgsql_SpaceScanned = 0;
 
     /* ----------
@@ -247,9 +259,9 @@ dump { return O_DUMP; }
 --[^\r\n]*	;
 
 \/\*		{ start_lineno = plpgsql_scanner_lineno();
-			  BEGIN IN_COMMENT;
+			  BEGIN(IN_COMMENT);
 			}
-<IN_COMMENT>\*\/	{ BEGIN INITIAL; plpgsql_SpaceScanned = 1; }
+<IN_COMMENT>\*\/	{ BEGIN(INITIAL); plpgsql_SpaceScanned = 1; }
 <IN_COMMENT>\n		;
 <IN_COMMENT>.		;
 <IN_COMMENT><<EOF>>	{
@@ -260,7 +272,7 @@ dump { return O_DUMP; }
 			}
 
     /* ----------
-     * Collect anything inside of ''s and return one STRING
+     * Collect anything inside of ''s and return one STRING token
      *
      * Hacking yytext/yyleng here lets us avoid using yymore(), which is
      * a win for performance. It's safe because we know the underlying
@@ -270,15 +282,18 @@ dump { return O_DUMP; }
 '		{
 			  start_lineno = plpgsql_scanner_lineno();
 			  start_charpos = yytext;
-			  BEGIN IN_STRING;
+			  BEGIN(IN_STRING);
 			}
 <IN_STRING>\\.		{ }
 <IN_STRING>\\		{ /* can only happen with \ at EOF */ }
 <IN_STRING>''		{ }
 <IN_STRING>'		{
-			  yyleng -= (yytext - start_charpos);
+			  /* tell plpgsql_get_string_value it's not a dollar quote */
+			  dolqlen = 0;
+			  /* adjust yytext/yyleng to describe whole string token */
+			  yyleng += (yytext - start_charpos);
 			  yytext = start_charpos;
-			  BEGIN INITIAL;
+			  BEGIN(INITIAL);
 			  return T_STRING;
 			}
 <IN_STRING>[^'\\]+	{ }
@@ -289,6 +304,43 @@ dump { return O_DUMP; }
 						 errmsg("unterminated string")));
 			}
 
+{dolqdelim}	{
+			  start_lineno = plpgsql_scanner_lineno();
+			  start_charpos = yytext;
+			  dolqstart = pstrdup(yytext);
+			  BEGIN(IN_DOLLARQUOTE);
+			}
+<IN_DOLLARQUOTE>{dolqdelim} {
+			  if (strcmp(yytext, dolqstart) == 0)
+			  {
+					pfree(dolqstart);
+					/* tell plpgsql_get_string_value it is a dollar quote */
+					dolqlen = yyleng;
+					/* adjust yytext/yyleng to describe whole string token */
+					yyleng += (yytext - start_charpos);
+					yytext = start_charpos;
+					BEGIN(INITIAL);
+					return T_STRING;
+			  }
+			  else
+			  {
+					/*
+					 * When we fail to match $...$ to dolqstart, transfer
+					 * the $... part to the output, but put back the final
+					 * $ for rescanning. Consider $delim$...$junk$delim$
+					 */
+					yyless(yyleng-1);
+			  }
+			}
+<IN_DOLLARQUOTE>{dolqinside} { }
+<IN_DOLLARQUOTE>.	{ /* needed for $ inside the quoted text */ }
+<IN_DOLLARQUOTE><<EOF>>	{
+			  plpgsql_error_lineno = start_lineno;
+			  ereport(ERROR,
+					  (errcode(ERRCODE_DATATYPE_MISMATCH),
+					   errmsg("unterminated dollar-quoted string")));
+			}
+
     /* ----------
      * Any unmatched character is returned as is
      * ----------
@@ -429,7 +481,6 @@ plpgsql_scanner_init(const char *str, int functype)
 	BEGIN(INITIAL);
 }
 
-
 /*
  * Called after parsing is done to clean up after plpgsql_scanner_init()
  */
@@ -439,3 +490,54 @@ plpgsql_scanner_finish(void)
 	yy_delete_buffer(scanbufhandle);
 	pfree(scanbuf);
 }
+
+/*
+ * Called after a T_STRING token is read to get the string literal's value
+ * as a malloc'd string. (We make this a separate call because in many
+ * scenarios there's no need to get the decoded value.)
+ *
+ * Note: we expect the literal to be the most recently lexed token. This
+ * would not work well if we supported multiple-token pushback or if
+ * plpgsql_yylex() wanted to read ahead beyond a T_STRING token.
+ */
+char *
+plpgsql_get_string_value(void)
+{
+	char	   *result;
+	const char *cp;
+	int			len;
+
+	if (dolqlen > 0)
+	{
+		/* Token is a $foo$...$foo$ string */
+		len = yyleng - 2 * dolqlen;
+		Assert(len >= 0);
+		result = (char *) malloc(len + 1);
+		memcpy(result, yytext + dolqlen, len);
+		result[len] = '\0';
+	}
+	else
+	{
+		/* Token is a '...' string */
+		result = (char *) malloc(yyleng + 1);	/* more than enough room */
+		len = 0;
+		for (cp = yytext; *cp; cp++)
+		{
+			if (*cp == '\'')
+			{
+				if (cp[1] == '\'')
+					result[len++] = *cp++;
+				/* else it must be string start or end quote */
+			}
+			else if (*cp == '\\')
+			{
+				if (cp[1] != '\0')	/* just a paranoid check */
+					result[len++] = *(++cp);
+			}
+			else
+				result[len++] = *cp;
+		}
+		result[len] = '\0';
+	}
+	return result;
+}
-- 
2.40.0