From 1ea47dd8cbcb485a06676f12a36244270e18e192 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 21 Oct 2007 22:29:56 +0000 Subject: [PATCH] Fix shared tsvector/tsquery input code so that we don't say "syntax error in tsvector" when we are really parsing a tsquery. Report the bogus input, too. Make styles of some related error messages more consistent. --- src/backend/utils/adt/tsquery.c | 16 ++-- src/backend/utils/adt/tsvector.c | 10 +-- src/backend/utils/adt/tsvector_parser.c | 115 +++++++++++++----------- src/include/tsearch/ts_utils.h | 8 +- 4 files changed, 81 insertions(+), 68 deletions(-) diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c index 7fd018915c..f8e84cb266 100644 --- a/src/backend/utils/adt/tsquery.c +++ b/src/backend/utils/adt/tsquery.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery.c,v 1.7 2007/09/11 16:01:40 teodor Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery.c,v 1.8 2007/10/21 22:29:56 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -141,7 +141,7 @@ gettoken_query(TSQueryParserState state, { ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error at start of operand in tsearch query: \"%s\"", + errmsg("syntax error in tsquery: \"%s\"", state->buffer))); } else if (!t_isspace(state->buf)) @@ -159,7 +159,7 @@ gettoken_query(TSQueryParserState state, else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("no operand in tsearch query: \"%s\"", + errmsg("no operand in tsquery: \"%s\"", state->buffer))); } break; @@ -232,12 +232,12 @@ pushValue_internal(TSQueryParserState state, pg_crc32 valcrc, int distance, int if (distance >= MAXSTRPOS) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("value is too big in tsearch query: \"%s\"", + errmsg("value is too big in tsquery: \"%s\"", state->buffer))); if (lenval >= MAXSTRLEN) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("operand is too 
long in tsearch query: \"%s\"", + errmsg("operand is too long in tsquery: \"%s\"", state->buffer))); tmp = (QueryOperand *) palloc(sizeof(QueryOperand)); @@ -264,7 +264,7 @@ pushValue(TSQueryParserState state, char *strval, int lenval, int2 weight) if (lenval >= MAXSTRLEN) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("word is too long in tsearch query: \"%s\"", + errmsg("word is too long in tsquery: \"%s\"", state->buffer))); INIT_CRC32(valcrc); @@ -372,7 +372,7 @@ makepol(TSQueryParserState state, default: ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error in tsearch query: \"%s\"", + errmsg("syntax error in tsquery: \"%s\"", state->buffer))); } } @@ -478,7 +478,7 @@ parse_tsquery(char *buf, state.polstr = NIL; /* init value parser's state */ - state.valstate = init_tsvector_parser(NULL, true); + state.valstate = init_tsvector_parser(state.buffer, true, true); /* init list of operand */ state.sumlen = 0; diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c index 992da4a9b4..0d82da1f90 100644 --- a/src/backend/utils/adt/tsvector.c +++ b/src/backend/utils/adt/tsvector.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.4 2007/09/07 16:03:40 teodor Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.5 2007/10/21 22:29:56 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -204,7 +204,7 @@ tsvectorin(PG_FUNCTION_ARGS) pg_verifymbstr(buf, strlen(buf), false); - state = init_tsvector_parser(buf, false); + state = init_tsvector_parser(buf, false, false); arrlen = 64; arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen); @@ -224,7 +224,7 @@ tsvectorin(PG_FUNCTION_ARGS) if (cur - tmpbuf > MAXSTRPOS) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("position value too large"))); + errmsg("position value is too large"))); /* * Enlarge buffers if needed @@ -496,7 +496,7 @@ tsvectorrecv(PG_FUNCTION_ARGS) 
datalen += lex_len; if (i > 0 && WordEntryCMP(&vec->entries[i], &vec->entries[i - 1], STRPTR(vec)) <= 0) - elog(ERROR, "lexemes are unordered"); + elog(ERROR, "lexemes are misordered"); /* Receive positions */ @@ -523,7 +523,7 @@ tsvectorrecv(PG_FUNCTION_ARGS) { wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos)); if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1])) - elog(ERROR, "position information is unordered"); + elog(ERROR, "position information is misordered"); } datalen += (npos + 1) * sizeof(WordEntry); diff --git a/src/backend/utils/adt/tsvector_parser.c b/src/backend/utils/adt/tsvector_parser.c index 26a271679d..5ee8bb7842 100644 --- a/src/backend/utils/adt/tsvector_parser.c +++ b/src/backend/utils/adt/tsvector_parser.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_parser.c,v 1.1 2007/09/07 15:09:56 teodor Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_parser.c,v 1.2 2007/10/21 22:29:56 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -20,35 +20,49 @@ #include "tsearch/ts_utils.h" #include "utils/memutils.h" + +/* + * Private state of tsvector parser. Note that tsquery also uses this code to + * parse its input, hence the boolean flags. The two flags are both true or + * both false in current usage, but we keep them separate for clarity. + * is_tsquery affects *only* the content of error messages. + */ struct TSVectorParseStateData { - char *prsbuf; - char *word; /* buffer to hold the current word */ - int len; /* size in bytes allocated for 'word' */ - bool oprisdelim; + char *prsbuf; /* next input character */ + char *bufstart; /* whole string (used only for errors) */ + char *word; /* buffer to hold the current word */ + int len; /* size in bytes allocated for 'word' */ + int eml; /* max bytes per character */ + bool oprisdelim; /* treat ! | & ( ) as delimiters?
*/ + bool is_tsquery; /* say "tsquery" not "tsvector" in errors? */ }; + /* * Initializes parser for the input string. If oprisdelim is set, the * following characters are treated as delimiters in addition to whitespace: * ! | & ( ) */ TSVectorParseState -init_tsvector_parser(char *input, bool oprisdelim) +init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery) { TSVectorParseState state; state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData)); state->prsbuf = input; + state->bufstart = input; state->len = 32; state->word = (char *) palloc(state->len); + state->eml = pg_database_encoding_max_length(); state->oprisdelim = oprisdelim; + state->is_tsquery = is_tsquery; return state; } /* - * Reinitializes parser for parsing 'input', instead of previous input. + * Reinitializes parser to parse 'input', instead of previous input. */ void reset_tsvector_parser(TSVectorParseState state, char *input) @@ -66,21 +80,21 @@ close_tsvector_parser(TSVectorParseState state) pfree(state); } +/* increase the size of 'word' if needed to hold one more character */ #define RESIZEPRSBUF \ do { \ - if ( curpos - state->word + pg_database_encoding_max_length() >= state->len ) \ + int clen = curpos - state->word; \ + if ( clen + state->eml >= state->len ) \ { \ - int clen = curpos - state->word; \ state->len *= 2; \ - state->word = (char*)repalloc( (void*)state->word, state->len ); \ + state->word = (char *) repalloc(state->word, state->len); \ curpos = state->word + clen; \ } \ } while (0) - #define ISOPERATOR(x) ( pg_mblen(x)==1 && ( *(x)=='!' 
|| *(x)=='&' || *(x)=='|' || *(x)=='(' || *(x)==')' ) ) -/* Fills the output parameters, and returns true */ +/* Fills gettoken_tsvector's output parameters, and returns true */ #define RETURN_TOKEN \ do { \ if (pos_ptr != NULL) \ @@ -111,18 +125,34 @@ do { \ #define WAITPOSDELIM 7 #define WAITCHARCMPLX 8 +#define PRSSYNTAXERROR prssyntaxerror(state) + +static void +prssyntaxerror(TSVectorParseState state) +{ + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + state->is_tsquery ? + errmsg("syntax error in tsquery: \"%s\"", state->bufstart) : + errmsg("syntax error in tsvector: \"%s\"", state->bufstart))); +} + + /* - * Get next token from string being parsed. Returns false if - * end of input string is reached, otherwise strval, lenval, pos_ptr - * and poslen output parameters are filled in: + * Get next token from string being parsed. Returns true if successful, + * false if end of input string is reached. On success, these output + * parameters are filled in: * - * *strval token - * *lenval length of*strval + * *strval pointer to token + * *lenval length of *strval * *pos_ptr pointer to a palloc'd array of positions and weights * associated with the token. If the caller is not interested * in the information, NULL can be supplied. Otherwise * the caller is responsible for pfreeing the array. * *poslen number of elements in *pos_ptr + * *endptr scan resumption point + * + * Pass NULL for unwanted output parameters. 
*/ bool gettoken_tsvector(TSVectorParseState state, @@ -155,9 +185,7 @@ gettoken_tsvector(TSVectorParseState state, oldstate = WAITENDWORD; } else if (state->oprisdelim && ISOPERATOR(state->prsbuf)) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error in tsvector"))); + PRSSYNTAXERROR; else if (!t_isspace(state->prsbuf)) { COPYCHAR(curpos, state->prsbuf); @@ -170,7 +198,8 @@ gettoken_tsvector(TSVectorParseState state, if (*(state->prsbuf) == '\0') ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("there is no escaped character"))); + errmsg("there is no escaped character: \"%s\"", + state->bufstart))); else { RESIZEPRSBUF; @@ -192,18 +221,14 @@ gettoken_tsvector(TSVectorParseState state, { RESIZEPRSBUF; if (curpos == state->word) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error in tsvector"))); + PRSSYNTAXERROR; *(curpos) = '\0'; RETURN_TOKEN; } else if (t_iseq(state->prsbuf, ':')) { if (curpos == state->word) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error in tsvector"))); + PRSSYNTAXERROR; *(curpos) = '\0'; if (state->oprisdelim) RETURN_TOKEN; @@ -229,9 +254,7 @@ gettoken_tsvector(TSVectorParseState state, oldstate = WAITENDCMPLX; } else if (*(state->prsbuf) == '\0') - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error in tsvector"))); + PRSSYNTAXERROR; else { RESIZEPRSBUF; @@ -253,9 +276,7 @@ gettoken_tsvector(TSVectorParseState state, RESIZEPRSBUF; *(curpos) = '\0'; if (curpos == state->word) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error in tsvector"))); + PRSSYNTAXERROR; if (state->oprisdelim) { /* state->prsbuf+=pg_mblen(state->prsbuf); */ @@ -290,17 +311,17 @@ gettoken_tsvector(TSVectorParseState state, } npos++; WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf))); + /* we cannot get here in tsquery, so no need for 2 errmsgs */ if (WEP_GETPOS(pos[npos - 1]) == 0) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("wrong 
position info in tsvector"))); + errmsg("wrong position info in tsvector: \"%s\"", + state->bufstart))); WEP_SETWEIGHT(pos[npos - 1], 0); statecode = WAITPOSDELIM; } else - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error in tsvector"))); + PRSSYNTAXERROR; } else if (statecode == WAITPOSDELIM) { @@ -309,42 +330,32 @@ gettoken_tsvector(TSVectorParseState state, else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*')) { if (WEP_GETWEIGHT(pos[npos - 1])) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error in tsvector"))); + PRSSYNTAXERROR; WEP_SETWEIGHT(pos[npos - 1], 3); } else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B')) { if (WEP_GETWEIGHT(pos[npos - 1])) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error in tsvector"))); + PRSSYNTAXERROR; WEP_SETWEIGHT(pos[npos - 1], 2); } else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C')) { if (WEP_GETWEIGHT(pos[npos - 1])) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error in tsvector"))); + PRSSYNTAXERROR; WEP_SETWEIGHT(pos[npos - 1], 1); } else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D')) { if (WEP_GETWEIGHT(pos[npos - 1])) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error in tsvector"))); + PRSSYNTAXERROR; WEP_SETWEIGHT(pos[npos - 1], 0); } else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0') RETURN_TOKEN; else if (!t_isdigit(state->prsbuf)) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error in tsvector"))); + PRSSYNTAXERROR; } else /* internal error */ elog(ERROR, "internal error in gettoken_tsvector"); diff --git a/src/include/tsearch/ts_utils.h b/src/include/tsearch/ts_utils.h index 60f176054a..ed9137b074 100644 --- a/src/include/tsearch/ts_utils.h +++ b/src/include/tsearch/ts_utils.h @@ -5,7 +5,7 @@ * * Copyright (c) 1998-2007, PostgreSQL Global Development Group * - * $PostgreSQL: 
pgsql/src/include/tsearch/ts_utils.h,v 1.5 2007/10/19 22:01:45 tgl Exp $ + * $PostgreSQL: pgsql/src/include/tsearch/ts_utils.h,v 1.6 2007/10/21 22:29:56 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -22,10 +22,12 @@ /* tsvector parser support. */ -struct TSVectorParseStateData; +struct TSVectorParseStateData; /* opaque struct in tsvector_parser.c */ typedef struct TSVectorParseStateData *TSVectorParseState; -extern TSVectorParseState init_tsvector_parser(char *input, bool oprisdelim); +extern TSVectorParseState init_tsvector_parser(char *input, + bool oprisdelim, + bool is_tsquery); extern void reset_tsvector_parser(TSVectorParseState state, char *input); extern bool gettoken_tsvector(TSVectorParseState state, char **token, int *len, -- 2.40.0