1 /*-------------------------------------------------------------------------
6 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
10 * src/backend/utils/adt/tsvector_parser.c
12 *-------------------------------------------------------------------------
17 #include "tsearch/ts_locale.h"
18 #include "tsearch/ts_utils.h"
22 * Private state of tsvector parser. Note that tsquery also uses this code to
23 * parse its input, hence the boolean flags. The two flags are both true or
24 * both false in current usage, but we keep them separate for clarity.
25 * is_tsquery affects *only* the content of error messages.
/*
 * State carried between calls of the parser functions in this file: a read
 * cursor into the input, the start of the input for error messages, and the
 * buffer in which the current token is assembled.
 */
27 struct TSVectorParseStateData
/* advanced one character at a time by gettoken_tsvector */
29 char *prsbuf; /* next input character */
/* quoted verbatim in the syntax error messages */
30 char *bufstart; /* whole string (used only for errors) */
/* palloc'd by init_tsvector_parser, enlarged through RESIZEPRSBUF */
31 char *word; /* buffer to hold the current word */
32 int len; /* size in bytes allocated for 'word' */
/* taken from pg_database_encoding_max_length(); used as headroom in RESIZEPRSBUF */
33 int eml; /* max bytes per character */
/* NOTE(review): the delimiter set is whatever ISOPERATOR tests; confirm the characters listed here match it */
34 bool oprisdelim; /* treat ! | * ( ) as delimiters? */
35 bool is_tsquery; /* say "tsquery" not "tsvector" in errors? */
40 * Initializes parser for the input string. If oprisdelim is set, the
41 * following characters are treated as delimiters in addition to whitespace:
/*
 * Allocates a parser state with palloc and points both prsbuf and bufstart
 * at input (the string itself is not copied). Also allocates the word
 * buffer, stores the value of pg_database_encoding_max_length() in eml, and
 * stores the two flags unchanged.
 */
45 init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery)
47 TSVectorParseState state;
49 state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
50 state->prsbuf = input;
51 state->bufstart = input;
/*
 * NOTE(review): state->len is read by the next line, so it has to be
 * assigned before it; that assignment is not visible in this listing.
 */
53 state->word = (char *) palloc(state->len);
54 state->eml = pg_database_encoding_max_length();
55 state->oprisdelim = oprisdelim;
56 state->is_tsquery = is_tsquery;
62 * Reinitializes parser to parse 'input', instead of previous input.
/*
 * The only visible statement repoints the read cursor (prsbuf) at input.
 * bufstart, the word buffer and the flags are not touched here, so syntax
 * error messages keep quoting the string given to init_tsvector_parser.
 * NOTE(review): confirm that leaving bufstart unchanged is intended.
 */
65 reset_tsvector_parser(TSVectorParseState state, char *input)
67 state->prsbuf = input;
71 * Shuts down a tsvector parser.
/*
 * NOTE(review): only the signature is visible in this listing. The body
 * presumably releases state->word and the state itself, both of which are
 * palloc'd by init_tsvector_parser; confirm against the full file.
 */
74 close_tsvector_parser(TSVectorParseState state)
80 /* increase the size of 'word' if needed to hold one more character */
/*
 * Uses the variables curpos and state of the function it is expanded in.
 * clen is the number of bytes already written to state->word. When
 * clen + state->eml reaches state->len, the buffer is reallocated to
 * state->len bytes, and curpos is re-derived from the saved offset because
 * state->word is replaced by the pointer that repalloc returns.
 * NOTE(review): the statement that enlarges state->len before the repalloc
 * is not visible in this listing.
 */
81 #define RESIZEPRSBUF \
83 int clen = curpos - state->word; \
84 if ( clen + state->eml >= state->len ) \
87 state->word = (char *) repalloc(state->word, state->len); \
88 curpos = state->word + clen; \
92 /* phrase operator begins with '<' */
/*
 * True when x points at a single-byte character (pg_mblen(x) == 1) that is
 * one of the operator characters. The argument x is evaluated more than
 * once. NOTE(review): only the first alternative of the character list is
 * visible in this listing.
 */
93 #define ISOPERATOR(x) \
94 ( pg_mblen(x) == 1 && ( *(x) == '!' || \
102 /* Fills gettoken_tsvector's output parameters, and returns true */
/*
 * Relies on the parameters and locals of gettoken_tsvector (state, curpos,
 * pos, pos_ptr, strval, lenval, endptr). Each output pointer is written only
 * when it is non-NULL: *strval receives state->word itself (the parser-owned
 * buffer, not a copy), *lenval the number of bytes between state->word and
 * curpos, and *endptr the current input position state->prsbuf.
 * NOTE(review): the bodies of the pos_ptr and pos branches are not visible
 * in this listing.
 */
103 #define RETURN_TOKEN \
105 if (pos_ptr != NULL) \
110 else if (pos != NULL) \
113 if (strval != NULL) \
114 *strval = state->word; \
115 if (lenval != NULL) \
116 *lenval = curpos - state->word; \
117 if (endptr != NULL) \
118 *endptr = state->prsbuf; \
123 /* State codes used in gettoken_tsvector */
/*
 * Meaning of each state, as used by gettoken_tsvector:
 *   WAITENDWORD   - inside an unquoted word
 *   WAITNEXTCHAR  - a backslash was seen; the next character is copied as is
 *   WAITENDCMPLX  - inside a single-quoted token
 *   WAITPOSINFO   - quoted token finished; a colon here starts position info
 *   WAITPOSDELIM  - a position number was read; a comma, a weight letter,
 *                   whitespace or end of input is expected
 *   WAITCHARCMPLX - a quote was seen inside a quoted token; a second quote
 *                   stands for one literal quote character
 * WAITWORD and INPOSINFO are used by gettoken_tsvector as well; their
 * definitions are not visible in this listing.
 */
125 #define WAITENDWORD 2
126 #define WAITNEXTCHAR 3
127 #define WAITENDCMPLX 4
128 #define WAITPOSINFO 5
130 #define WAITPOSDELIM 7
131 #define WAITCHARCMPLX 8
/* Shorthand for prssyntaxerror(state); needs a variable named state in scope where it is used. */
133 #define PRSSYNTAXERROR prssyntaxerror(state)
/*
 * Reports ERRCODE_SYNTAX_ERROR, quoting the whole input (state->bufstart).
 * Two message texts exist, one naming tsquery and one naming tsvector.
 * NOTE(review): the condition choosing between them is not visible in this
 * listing; presumably it is state->is_tsquery.
 */
136 prssyntaxerror(TSVectorParseState state)
139 (errcode(ERRCODE_SYNTAX_ERROR),
141 errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
142 errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
147 * Get next token from string being parsed. Returns true if successful,
148 * false if end of input string is reached. On success, these output
149 * parameters are filled in:
151 * *strval pointer to token
152 * *lenval length of *strval
153 * *pos_ptr pointer to a palloc'd array of positions and weights
154 * associated with the token. If the caller is not interested
155 * in the information, NULL can be supplied. Otherwise
156 * the caller is responsible for pfreeing the array.
157 * *poslen number of elements in *pos_ptr
158 * *endptr scan resumption point
160 * Pass NULL for unwanted output parameters.
/*
 * Character-driven state machine. Starting in WAITWORD, it examines the
 * character at state->prsbuf, copies token bytes into state->word through
 * curpos, and advances prsbuf by one (possibly multibyte) character per
 * step. oldstate records the state to resume after a backslash escape.
 *
 * NOTE(review): this listing omits a number of lines (braces, the loop
 * header, several error and return statements), so the comments below
 * describe only what the visible lines do.
 */
163 gettoken_tsvector(TSVectorParseState state,
164 char **strval, int *lenval,
165 WordEntryPos **pos_ptr, int *poslen,
/* curpos is the write cursor; the token is always built from the start of state->word */
169 char *curpos = state->word;
170 int statecode = WAITWORD;
173 * pos is for collecting the comma delimited list of positions followed by
176 WordEntryPos *pos = NULL;
177 int npos = 0; /* elements of pos used */
178 int posalen = 0; /* allocated size of pos */
/* WAITWORD: nothing copied yet; decide how the token starts */
182 if (statecode == WAITWORD)
/* end of input before any token started */
184 if (*(state->prsbuf) == '\0')
/* opening quote: switch to the quoted-token states */
186 else if (t_iseq(state->prsbuf, '\''))
187 statecode = WAITENDCMPLX;
/* backslash: copy the following character as is, then go on as a plain word */
188 else if (t_iseq(state->prsbuf, '\\'))
190 statecode = WAITNEXTCHAR;
191 oldstate = WAITENDWORD;
/* an operator character cannot start a token when operators are delimiters */
193 else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
/* any other non-space character starts a plain word; leading whitespace is stepped over */
195 else if (!t_isspace(state->prsbuf))
197 COPYCHAR(curpos, state->prsbuf);
198 curpos += pg_mblen(state->prsbuf);
199 statecode = WAITENDWORD;
/* WAITNEXTCHAR: the previous character was a backslash */
202 else if (statecode == WAITNEXTCHAR)
/* input ended right after the backslash */
204 if (*(state->prsbuf) == '\0')
206 (errcode(ERRCODE_SYNTAX_ERROR),
207 errmsg("there is no escaped character: \"%s\"",
/* copy the escaped character and resume the state saved in oldstate */
212 COPYCHAR(curpos, state->prsbuf);
213 curpos += pg_mblen(state->prsbuf);
214 Assert(oldstate != 0);
215 statecode = oldstate;
/* WAITENDWORD: inside an unquoted word */
218 else if (statecode == WAITENDWORD)
220 if (t_iseq(state->prsbuf, '\\'))
222 statecode = WAITNEXTCHAR;
223 oldstate = WAITENDWORD;
/* whitespace, end of input, or (with oprisdelim) an operator character ends the word */
225 else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
226 (state->oprisdelim && ISOPERATOR(state->prsbuf)))
/* curpos == state->word means no bytes were copied into the word */
229 if (curpos == state->word)
/* a colon after the word leads to position info */
234 else if (t_iseq(state->prsbuf, ':'))
236 if (curpos == state->word)
/* NOTE(review): the action taken when oprisdelim is set is not visible in this listing */
239 if (state->oprisdelim)
242 statecode = INPOSINFO;
/* ordinary character: append it to the word */
247 COPYCHAR(curpos, state->prsbuf);
248 curpos += pg_mblen(state->prsbuf);
/* WAITENDCMPLX: inside a single-quoted token */
251 else if (statecode == WAITENDCMPLX)
/* a quote either closes the token or is the first of a doubled quote; WAITCHARCMPLX decides */
253 if (t_iseq(state->prsbuf, '\''))
255 statecode = WAITCHARCMPLX;
/* backslash escape inside quotes; return to the quoted state afterwards */
257 else if (t_iseq(state->prsbuf, '\\'))
259 statecode = WAITNEXTCHAR;
260 oldstate = WAITENDCMPLX;
/* input ended before the closing quote */
262 else if (*(state->prsbuf) == '\0')
267 COPYCHAR(curpos, state->prsbuf);
268 curpos += pg_mblen(state->prsbuf);
/* WAITCHARCMPLX: the previous character was a quote inside a quoted token */
271 else if (statecode == WAITCHARCMPLX)
/* two quotes in a row: keep one quote character and stay inside the quoted token */
273 if (t_iseq(state->prsbuf, '\''))
276 COPYCHAR(curpos, state->prsbuf);
277 curpos += pg_mblen(state->prsbuf);
278 statecode = WAITENDCMPLX;
/* otherwise the previous quote closed the token */
284 if (curpos == state->word)
286 if (state->oprisdelim)
288 /* state->prsbuf+=pg_mblen(state->prsbuf); */
/* move to WAITPOSINFO and examine this same character again without advancing */
292 statecode = WAITPOSINFO;
293 continue; /* recheck current character */
/* WAITPOSINFO: after a quoted token, a colon introduces position info */
296 else if (statecode == WAITPOSINFO)
298 if (t_iseq(state->prsbuf, ':'))
299 statecode = INPOSINFO;
/* INPOSINFO: expecting the first digit of a position number */
303 else if (statecode == INPOSINFO)
305 if (t_isdigit(state->prsbuf))
/*
 * The pos array is allocated on first use and reallocated when full.
 * NOTE(review): the updates of npos and posalen are not visible in this
 * listing; pos[npos - 1] below is the entry being filled.
 */
310 pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
313 else if (npos + 1 >= posalen)
316 pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
/* atoi reads the whole run of digits starting here; the value is passed through LIMITPOS */
319 WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
320 /* we cannot get here in tsquery, so no need for 2 errmsgs */
/* a stored position of zero is rejected */
321 if (WEP_GETPOS(pos[npos - 1]) == 0)
323 (errcode(ERRCODE_SYNTAX_ERROR),
324 errmsg("wrong position info in tsvector: \"%s\"",
/* weight starts at 0 and may be overridden by a letter in WAITPOSDELIM */
326 WEP_SETWEIGHT(pos[npos - 1], 0);
327 statecode = WAITPOSDELIM;
/* WAITPOSDELIM: a position number has been read */
332 else if (statecode == WAITPOSDELIM)
/* comma: another position follows */
334 if (t_iseq(state->prsbuf, ','))
335 statecode = INPOSINFO;
/*
 * Weight markers: a, A or an asterisk store 3; b or B store 2; c or C store
 * 1; d or D store 0. Each branch first tests whether a nonzero weight is
 * already stored. NOTE(review): the action taken in that case is not
 * visible in this listing; presumably a syntax error.
 */
336 else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
338 if (WEP_GETWEIGHT(pos[npos - 1]))
340 WEP_SETWEIGHT(pos[npos - 1], 3);
342 else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
344 if (WEP_GETWEIGHT(pos[npos - 1]))
346 WEP_SETWEIGHT(pos[npos - 1], 2);
348 else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
350 if (WEP_GETWEIGHT(pos[npos - 1]))
352 WEP_SETWEIGHT(pos[npos - 1], 1);
354 else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
356 if (WEP_GETWEIGHT(pos[npos - 1]))
358 WEP_SETWEIGHT(pos[npos - 1], 0);
/* whitespace or end of input ends the position list */
360 else if (t_isspace(state->prsbuf) ||
361 *(state->prsbuf) == '\0')
/*
 * Digits match none of the branches in this state and are simply stepped
 * over: atoi in INPOSINFO already read the whole number from its first
 * digit. Anything else that is not a digit is caught by the test below.
 */
363 else if (!t_isdigit(state->prsbuf))
366 else /* internal error */
367 elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
/* step to the next input character (skipped by the continue in WAITCHARCMPLX) */
371 state->prsbuf += pg_mblen(state->prsbuf);