]> granicus.if.org Git - postgresql/blob - src/backend/utils/adt/tsvector_parser.c
Add support for EUI-64 MAC addresses as macaddr8
[postgresql] / src / backend / utils / adt / tsvector_parser.c
1 /*-------------------------------------------------------------------------
2  *
3  * tsvector_parser.c
4  *        Parser for tsvector
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *        src/backend/utils/adt/tsvector_parser.c
11  *
12  *-------------------------------------------------------------------------
13  */
14
15 #include "postgres.h"
16
17 #include "tsearch/ts_locale.h"
18 #include "tsearch/ts_utils.h"
19
20
/*
 * Private state of tsvector parser.  Note that tsquery also uses this code to
 * parse its input, hence the boolean flags.  The two flags are both true or
 * both false in current usage, but we keep them separate for clarity.
 * is_tsquery affects *only* the content of error messages.
 *
 * 'word' is a growable scratch buffer owned by the parser: it is allocated in
 * init_tsvector_parser, enlarged on demand by RESIZEPRSBUF, and released in
 * close_tsvector_parser.  Tokens handed back by gettoken_tsvector point into
 * this buffer.
 */
struct TSVectorParseStateData
{
	char	   *prsbuf;			/* next input character */
	char	   *bufstart;		/* whole string (used only for errors) */
	char	   *word;			/* buffer to hold the current word */
	int			len;			/* size in bytes allocated for 'word' */
	int			eml;			/* max bytes per character */
	bool		oprisdelim;		/* treat the ISOPERATOR characters
								 * (! & | ( ) <) as delimiters? */
	bool		is_tsquery;		/* say "tsquery" not "tsvector" in errors? */
};
37
38
39 /*
40  * Initializes parser for the input string. If oprisdelim is set, the
41  * following characters are treated as delimiters in addition to whitespace:
42  * ! | & ( )
43  */
44 TSVectorParseState
45 init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery)
46 {
47         TSVectorParseState state;
48
49         state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
50         state->prsbuf = input;
51         state->bufstart = input;
52         state->len = 32;
53         state->word = (char *) palloc(state->len);
54         state->eml = pg_database_encoding_max_length();
55         state->oprisdelim = oprisdelim;
56         state->is_tsquery = is_tsquery;
57
58         return state;
59 }
60
61 /*
62  * Reinitializes parser to parse 'input', instead of previous input.
63  */
64 void
65 reset_tsvector_parser(TSVectorParseState state, char *input)
66 {
67         state->prsbuf = input;
68 }
69
70 /*
71  * Shuts down a tsvector parser.
72  */
73 void
74 close_tsvector_parser(TSVectorParseState state)
75 {
76         pfree(state->word);
77         pfree(state);
78 }
79
/*
 * Make sure 'word' has room for one more character (at most state->eml
 * bytes) after 'curpos', with at least one byte left over.  If it does
 * not, the buffer size is doubled and 'curpos' is re-based onto the
 * reallocated buffer at the same offset.
 *
 * Expects 'state' and 'curpos' to be in scope at the point of use.
 */
#define RESIZEPRSBUF \
do { \
	int			wordused = curpos - state->word; \
	\
	if (wordused + state->eml >= state->len) \
	{ \
		state->len *= 2; \
		state->word = (char *) repalloc(state->word, state->len); \
		curpos = state->word + wordused; \
	} \
} while (0)
91
/*
 * True if the character at 'x' is a single-byte character that is one of
 * the operator symbols:  ! & | ( ) <
 * ('<' is included because the phrase operator begins with it.)
 *
 * Note that 'x' is evaluated more than once.
 */
#define ISOPERATOR(x) \
	(pg_mblen(x) == 1 && \
	 (*(x) == '!' || *(x) == '&' || *(x) == '|' || \
	  *(x) == '(' || *(x) == ')' || *(x) == '<'))
101
/*
 * Hands the finished token back to gettoken_tsvector's caller and returns
 * true from the enclosing function.
 *
 * The collected position array is passed out through pos_ptr/poslen when
 * the caller asked for it; otherwise it is freed here.  Each of the other
 * output parameters is filled only if non-NULL.
 */
#define RETURN_TOKEN \
do { \
	if (pos_ptr == NULL) \
	{ \
		if (pos != NULL) \
			pfree(pos); \
	} \
	else \
	{ \
		*pos_ptr = pos; \
		*poslen = npos; \
	} \
	\
	if (strval != NULL) \
		*strval = state->word; \
	if (lenval != NULL) \
		*lenval = curpos - state->word; \
	if (endptr != NULL) \
		*endptr = state->prsbuf; \
	return true; \
} while(0)
121
122
/* State codes used in gettoken_tsvector */
enum
{
	WAITWORD = 1,				/* skipping to the start of a word */
	WAITENDWORD = 2,			/* inside an unquoted word */
	WAITNEXTCHAR = 3,			/* just saw a backslash */
	WAITENDCMPLX = 4,			/* inside a quoted word */
	WAITPOSINFO = 5,			/* after a quoted word; ':' may follow */
	INPOSINFO = 6,				/* expecting a position number */
	WAITPOSDELIM = 7,			/* after a position number */
	WAITCHARCMPLX = 8			/* saw a quote inside a quoted word */
};
132
133 #define PRSSYNTAXERROR prssyntaxerror(state)
134
135 static void
136 prssyntaxerror(TSVectorParseState state)
137 {
138         ereport(ERROR,
139                         (errcode(ERRCODE_SYNTAX_ERROR),
140                          state->is_tsquery ?
141                          errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
142                          errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
143 }
144
145
146 /*
147  * Get next token from string being parsed. Returns true if successful,
148  * false if end of input string is reached.  On success, these output
149  * parameters are filled in:
150  *
151  * *strval              pointer to token
152  * *lenval              length of *strval
153  * *pos_ptr             pointer to a palloc'd array of positions and weights
154  *                              associated with the token. If the caller is not interested
155  *                              in the information, NULL can be supplied. Otherwise
156  *                              the caller is responsible for pfreeing the array.
157  * *poslen              number of elements in *pos_ptr
158  * *endptr              scan resumption point
159  *
160  * Pass NULL for unwanted output parameters.
161  */
162 bool
163 gettoken_tsvector(TSVectorParseState state,
164                                   char **strval, int *lenval,
165                                   WordEntryPos **pos_ptr, int *poslen,
166                                   char **endptr)
167 {
168         int                     oldstate = 0;
169         char       *curpos = state->word;
170         int                     statecode = WAITWORD;
171
172         /*
173          * pos is for collecting the comma delimited list of positions followed by
174          * the actual token.
175          */
176         WordEntryPos *pos = NULL;
177         int                     npos = 0;               /* elements of pos used */
178         int                     posalen = 0;    /* allocated size of pos */
179
180         while (1)
181         {
182                 if (statecode == WAITWORD)
183                 {
184                         if (*(state->prsbuf) == '\0')
185                                 return false;
186                         else if (t_iseq(state->prsbuf, '\''))
187                                 statecode = WAITENDCMPLX;
188                         else if (t_iseq(state->prsbuf, '\\'))
189                         {
190                                 statecode = WAITNEXTCHAR;
191                                 oldstate = WAITENDWORD;
192                         }
193                         else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
194                                 PRSSYNTAXERROR;
195                         else if (!t_isspace(state->prsbuf))
196                         {
197                                 COPYCHAR(curpos, state->prsbuf);
198                                 curpos += pg_mblen(state->prsbuf);
199                                 statecode = WAITENDWORD;
200                         }
201                 }
202                 else if (statecode == WAITNEXTCHAR)
203                 {
204                         if (*(state->prsbuf) == '\0')
205                                 ereport(ERROR,
206                                                 (errcode(ERRCODE_SYNTAX_ERROR),
207                                                  errmsg("there is no escaped character: \"%s\"",
208                                                                 state->bufstart)));
209                         else
210                         {
211                                 RESIZEPRSBUF;
212                                 COPYCHAR(curpos, state->prsbuf);
213                                 curpos += pg_mblen(state->prsbuf);
214                                 Assert(oldstate != 0);
215                                 statecode = oldstate;
216                         }
217                 }
218                 else if (statecode == WAITENDWORD)
219                 {
220                         if (t_iseq(state->prsbuf, '\\'))
221                         {
222                                 statecode = WAITNEXTCHAR;
223                                 oldstate = WAITENDWORD;
224                         }
225                         else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
226                                          (state->oprisdelim && ISOPERATOR(state->prsbuf)))
227                         {
228                                 RESIZEPRSBUF;
229                                 if (curpos == state->word)
230                                         PRSSYNTAXERROR;
231                                 *(curpos) = '\0';
232                                 RETURN_TOKEN;
233                         }
234                         else if (t_iseq(state->prsbuf, ':'))
235                         {
236                                 if (curpos == state->word)
237                                         PRSSYNTAXERROR;
238                                 *(curpos) = '\0';
239                                 if (state->oprisdelim)
240                                         RETURN_TOKEN;
241                                 else
242                                         statecode = INPOSINFO;
243                         }
244                         else
245                         {
246                                 RESIZEPRSBUF;
247                                 COPYCHAR(curpos, state->prsbuf);
248                                 curpos += pg_mblen(state->prsbuf);
249                         }
250                 }
251                 else if (statecode == WAITENDCMPLX)
252                 {
253                         if (t_iseq(state->prsbuf, '\''))
254                         {
255                                 statecode = WAITCHARCMPLX;
256                         }
257                         else if (t_iseq(state->prsbuf, '\\'))
258                         {
259                                 statecode = WAITNEXTCHAR;
260                                 oldstate = WAITENDCMPLX;
261                         }
262                         else if (*(state->prsbuf) == '\0')
263                                 PRSSYNTAXERROR;
264                         else
265                         {
266                                 RESIZEPRSBUF;
267                                 COPYCHAR(curpos, state->prsbuf);
268                                 curpos += pg_mblen(state->prsbuf);
269                         }
270                 }
271                 else if (statecode == WAITCHARCMPLX)
272                 {
273                         if (t_iseq(state->prsbuf, '\''))
274                         {
275                                 RESIZEPRSBUF;
276                                 COPYCHAR(curpos, state->prsbuf);
277                                 curpos += pg_mblen(state->prsbuf);
278                                 statecode = WAITENDCMPLX;
279                         }
280                         else
281                         {
282                                 RESIZEPRSBUF;
283                                 *(curpos) = '\0';
284                                 if (curpos == state->word)
285                                         PRSSYNTAXERROR;
286                                 if (state->oprisdelim)
287                                 {
288                                         /* state->prsbuf+=pg_mblen(state->prsbuf); */
289                                         RETURN_TOKEN;
290                                 }
291                                 else
292                                         statecode = WAITPOSINFO;
293                                 continue;               /* recheck current character */
294                         }
295                 }
296                 else if (statecode == WAITPOSINFO)
297                 {
298                         if (t_iseq(state->prsbuf, ':'))
299                                 statecode = INPOSINFO;
300                         else
301                                 RETURN_TOKEN;
302                 }
303                 else if (statecode == INPOSINFO)
304                 {
305                         if (t_isdigit(state->prsbuf))
306                         {
307                                 if (posalen == 0)
308                                 {
309                                         posalen = 4;
310                                         pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
311                                         npos = 0;
312                                 }
313                                 else if (npos + 1 >= posalen)
314                                 {
315                                         posalen *= 2;
316                                         pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
317                                 }
318                                 npos++;
319                                 WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
320                                 /* we cannot get here in tsquery, so no need for 2 errmsgs */
321                                 if (WEP_GETPOS(pos[npos - 1]) == 0)
322                                         ereport(ERROR,
323                                                         (errcode(ERRCODE_SYNTAX_ERROR),
324                                                          errmsg("wrong position info in tsvector: \"%s\"",
325                                                                         state->bufstart)));
326                                 WEP_SETWEIGHT(pos[npos - 1], 0);
327                                 statecode = WAITPOSDELIM;
328                         }
329                         else
330                                 PRSSYNTAXERROR;
331                 }
332                 else if (statecode == WAITPOSDELIM)
333                 {
334                         if (t_iseq(state->prsbuf, ','))
335                                 statecode = INPOSINFO;
336                         else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
337                         {
338                                 if (WEP_GETWEIGHT(pos[npos - 1]))
339                                         PRSSYNTAXERROR;
340                                 WEP_SETWEIGHT(pos[npos - 1], 3);
341                         }
342                         else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
343                         {
344                                 if (WEP_GETWEIGHT(pos[npos - 1]))
345                                         PRSSYNTAXERROR;
346                                 WEP_SETWEIGHT(pos[npos - 1], 2);
347                         }
348                         else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
349                         {
350                                 if (WEP_GETWEIGHT(pos[npos - 1]))
351                                         PRSSYNTAXERROR;
352                                 WEP_SETWEIGHT(pos[npos - 1], 1);
353                         }
354                         else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
355                         {
356                                 if (WEP_GETWEIGHT(pos[npos - 1]))
357                                         PRSSYNTAXERROR;
358                                 WEP_SETWEIGHT(pos[npos - 1], 0);
359                         }
360                         else if (t_isspace(state->prsbuf) ||
361                                          *(state->prsbuf) == '\0')
362                                 RETURN_TOKEN;
363                         else if (!t_isdigit(state->prsbuf))
364                                 PRSSYNTAXERROR;
365                 }
366                 else    /* internal error */
367                         elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
368                                  statecode);
369
370                 /* get next char */
371                 state->prsbuf += pg_mblen(state->prsbuf);
372         }
373 }