1 /*-------------------------------------------------------------------------
4 * I/O functions for tsquery
6 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
10 * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery.c,v 1.18 2008/05/16 16:31:01 tgl Exp $
12 *-------------------------------------------------------------------------
17 #include "libpq/pqformat.h"
18 #include "miscadmin.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_type.h"
21 #include "tsearch/ts_utils.h"
22 #include "utils/builtins.h"
23 #include "utils/memutils.h"
24 #include "utils/pg_crc.h"
27 struct TSQueryParserStateData
29 /* State for gettoken_query */
30 char *buffer; /* entire string we are scanning */
31 char *buf; /* current scan point */
33 int count; /* nesting count, incremented by (,
36 /* polish (prefix) notation in list, filled in by push* functions */
40 * Strings from operands are collected in op. curop is a pointer to the
41 * end of used space of op.
45 int lenop; /* allocated size of op */
46 int sumlen; /* used size of op */
48 /* state for value's parser */
49 TSVectorParseState valstate;
54 #define WAITOPERATOR 2
55 #define WAITFIRSTOPERAND 3
56 #define WAITSINGLEOPERAND 4
59 * subroutine to parse the modifiers (weight and prefix flag currently)
60 * part, like ':1AB' of a query.
63 get_modifiers(char *buf, int16 *weight, bool *prefix)
68 if (!t_iseq(buf, ':'))
72 while (*buf && pg_mblen(buf) == 1)
105 * token types for parsing
118 * get token from query string
120 * *operator is filled in with OP_* when return values is PT_OPR
121 * *strval, *lenval and *weight are filled in when return value is PT_VAL
124 gettoken_query(TSQueryParserState state,
126 int *lenval, char **strval, int16 *weight, bool *prefix)
133 switch (state->state)
135 case WAITFIRSTOPERAND:
137 if (t_iseq(state->buf, '!'))
139 (state->buf)++; /* can safely ++, t_iseq guarantee
140 * that pg_mblen()==1 */
142 state->state = WAITOPERAND;
145 else if (t_iseq(state->buf, '('))
149 state->state = WAITOPERAND;
152 else if (t_iseq(state->buf, ':'))
155 (errcode(ERRCODE_SYNTAX_ERROR),
156 errmsg("syntax error in tsquery: \"%s\"",
159 else if (!t_isspace(state->buf))
162 * We rely on the tsvector parser to parse the value for
165 reset_tsvector_parser(state->valstate, state->buf);
166 if (gettoken_tsvector(state->valstate, strval, lenval, NULL, NULL, &state->buf))
168 state->buf = get_modifiers(state->buf, weight, prefix);
169 state->state = WAITOPERATOR;
172 else if (state->state == WAITFIRSTOPERAND)
176 (errcode(ERRCODE_SYNTAX_ERROR),
177 errmsg("no operand in tsquery: \"%s\"",
182 if (t_iseq(state->buf, '&'))
184 state->state = WAITOPERAND;
189 if (t_iseq(state->buf, '|'))
191 state->state = WAITOPERAND;
196 else if (t_iseq(state->buf, ')'))
200 return (state->count < 0) ? PT_ERR : PT_CLOSE;
202 else if (*(state->buf) == '\0')
203 return (state->count) ? PT_ERR : PT_END;
204 else if (!t_isspace(state->buf))
207 case WAITSINGLEOPERAND:
208 if (*(state->buf) == '\0')
210 *strval = state->buf;
211 *lenval = strlen(state->buf);
212 state->buf += strlen(state->buf);
219 state->buf += pg_mblen(state->buf);
225 * Push an operator to state->polstr
228 pushOperator(TSQueryParserState state, int8 oper)
232 Assert(oper == OP_NOT || oper == OP_AND || oper == OP_OR);
234 tmp = (QueryOperator *) palloc0(sizeof(QueryOperator));
237 /* left is filled in later with findoprnd */
239 state->polstr = lcons(tmp, state->polstr);
243 pushValue_internal(TSQueryParserState state, pg_crc32 valcrc, int distance, int lenval, int weight, bool prefix)
247 if (distance >= MAXSTRPOS)
249 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
250 errmsg("value is too big in tsquery: \"%s\"",
252 if (lenval >= MAXSTRLEN)
254 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
255 errmsg("operand is too long in tsquery: \"%s\"",
258 tmp = (QueryOperand *) palloc0(sizeof(QueryOperand));
260 tmp->weight = weight;
261 tmp->prefix = prefix;
262 tmp->valcrc = (int32) valcrc;
263 tmp->length = lenval;
264 tmp->distance = distance;
266 state->polstr = lcons(tmp, state->polstr);
270 * Push an operand to state->polstr.
272 * strval must point to a string equal to state->curop. lenval is the length
276 pushValue(TSQueryParserState state, char *strval, int lenval, int2 weight, bool prefix)
280 if (lenval >= MAXSTRLEN)
282 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
283 errmsg("word is too long in tsquery: \"%s\"",
287 COMP_CRC32(valcrc, strval, lenval);
289 pushValue_internal(state, valcrc, state->curop - state->op, lenval, weight, prefix);
291 /* append the value string to state.op, enlarging buffer if needed first */
292 while (state->curop - state->op + lenval + 1 >= state->lenop)
294 int used = state->curop - state->op;
297 state->op = (char *) repalloc((void *) state->op, state->lenop);
298 state->curop = state->op + used;
300 memcpy((void *) state->curop, (void *) strval, lenval);
301 state->curop += lenval;
302 *(state->curop) = '\0';
304 state->sumlen += lenval + 1 /* \0 */ ;
309 * Push a stopword placeholder to state->polstr
312 pushStop(TSQueryParserState state)
316 tmp = (QueryOperand *) palloc0(sizeof(QueryOperand));
317 tmp->type = QI_VALSTOP;
319 state->polstr = lcons(tmp, state->polstr);
323 #define STACKDEPTH 32
326 * Make polish (prefix) notation of query.
328 * See parse_tsquery for explanation of pushval.
331 makepol(TSQueryParserState state,
332 PushFunction pushval,
339 int8 opstack[STACKDEPTH];
344 /* since this function recurses, it could be driven to stack overflow */
347 while ((type = gettoken_query(state, &operator, &lenval, &strval, &weight, &prefix)) != PT_END)
352 pushval(opaque, state, strval, lenval, weight, prefix);
353 while (lenstack && (opstack[lenstack - 1] == OP_AND ||
354 opstack[lenstack - 1] == OP_NOT))
357 pushOperator(state, opstack[lenstack]);
361 if (lenstack && operator == OP_OR)
362 pushOperator(state, OP_OR);
365 if (lenstack == STACKDEPTH) /* internal error */
366 elog(ERROR, "tsquery stack too small");
367 opstack[lenstack] = operator;
372 makepol(state, pushval, opaque);
374 if (lenstack && (opstack[lenstack - 1] == OP_AND ||
375 opstack[lenstack - 1] == OP_NOT))
378 pushOperator(state, opstack[lenstack]);
385 pushOperator(state, opstack[lenstack]);
391 (errcode(ERRCODE_SYNTAX_ERROR),
392 errmsg("syntax error in tsquery: \"%s\"",
399 pushOperator(state, opstack[lenstack]);
404 findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes)
406 /* since this function recurses, it could be driven to stack overflow. */
410 elog(ERROR, "malformed tsquery: operand not found");
412 if (ptr[*pos].type == QI_VAL ||
413 ptr[*pos].type == QI_VALSTOP) /* need to handle VALSTOP here, they
414 * haven't been cleaned away yet. */
420 Assert(ptr[*pos].type == QI_OPR);
422 if (ptr[*pos].operator.oper == OP_NOT)
424 ptr[*pos].operator.left = 1;
426 findoprnd_recurse(ptr, pos, nnodes);
430 QueryOperator *curitem = &ptr[*pos].operator;
433 Assert(curitem->oper == OP_AND || curitem->oper == OP_OR);
436 findoprnd_recurse(ptr, pos, nnodes);
437 curitem->left = *pos - tmp;
438 findoprnd_recurse(ptr, pos, nnodes);
445 * Fills in the left-fields previously left unfilled. The input
446 * QueryItems must be in polish (prefix) notation.
449 findoprnd(QueryItem *ptr, int size)
454 findoprnd_recurse(ptr, &pos, size);
457 elog(ERROR, "malformed tsquery: extra nodes");
462 * Each value (operand) in the query is be passed to pushval. pushval can
463 * transform the simple value to an arbitrarily complex expression using
464 * pushValue and pushOperator. It must push a single value with pushValue,
465 * a complete expression with all operands, or a a stopword placeholder
466 * with pushStop, otherwise the prefix notation representation will be broken,
467 * having an operator with no operand.
469 * opaque is passed on to pushval as is, pushval can use it to store its
472 * The returned query might contain QI_STOPVAL nodes. The caller is responsible
473 * for cleaning them up (with clean_fakeval)
476 parse_tsquery(char *buf,
477 PushFunction pushval,
481 struct TSQueryParserStateData state;
491 state.state = (isplain) ? WAITSINGLEOPERAND : WAITFIRSTOPERAND;
495 /* init value parser's state */
496 state.valstate = init_tsvector_parser(state.buffer, true, true);
498 /* init list of operand */
501 state.curop = state.op = (char *) palloc(state.lenop);
502 *(state.curop) = '\0';
504 /* parse query & make polish notation (postfix, but in reverse order) */
505 makepol(&state, pushval, opaque);
507 close_tsvector_parser(state.valstate);
509 if (list_length(state.polstr) == 0)
512 (errmsg("text-search query doesn't contain lexemes: \"%s\"",
514 query = (TSQuery) palloc(HDRSIZETQ);
515 SET_VARSIZE(query, HDRSIZETQ);
520 /* Pack the QueryItems in the final TSQuery struct to return to caller */
521 commonlen = COMPUTESIZE(list_length(state.polstr), state.sumlen);
522 query = (TSQuery) palloc0(commonlen);
523 SET_VARSIZE(query, commonlen);
524 query->size = list_length(state.polstr);
525 ptr = GETQUERY(query);
527 /* Copy QueryItems to TSQuery */
529 foreach(cell, state.polstr)
531 QueryItem *item = (QueryItem *) lfirst(cell);
536 memcpy(&ptr[i], item, sizeof(QueryOperand));
539 ptr[i].type = QI_VALSTOP;
542 memcpy(&ptr[i], item, sizeof(QueryOperator));
545 elog(ERROR, "unrecognized QueryItem type: %d", item->type);
550 /* Copy all the operand strings to TSQuery */
551 memcpy((void *) GETOPERAND(query), (void *) state.op, state.sumlen);
554 /* Set left operand pointers for every operator. */
555 findoprnd(ptr, query->size);
561 pushval_asis(Datum opaque, TSQueryParserState state, char *strval, int lenval,
562 int16 weight, bool prefix)
564 pushValue(state, strval, lenval, weight, prefix);
568 * in without morphology
571 tsqueryin(PG_FUNCTION_ARGS)
573 char *in = PG_GETARG_CSTRING(0);
575 pg_verifymbstr(in, strlen(in), false);
577 PG_RETURN_TSQUERY(parse_tsquery(in, pushval_asis, PointerGetDatum(NULL), false));
592 /* Makes sure inf->buf is large enough for adding 'addsize' bytes */
593 #define RESIZEBUF(inf, addsize) \
594 while( ( (inf)->cur - (inf)->buf ) + (addsize) + 1 >= (inf)->buflen ) \
596 int len = (inf)->cur - (inf)->buf; \
597 (inf)->buflen *= 2; \
598 (inf)->buf = (char*) repalloc( (void*)(inf)->buf, (inf)->buflen ); \
599 (inf)->cur = (inf)->buf + len; \
603 * recursive walk on tree and print it in
604 * infix (human-readable) view
607 infix(INFIX *in, bool first)
609 /* since this function recurses, it could be driven to stack overflow. */
612 if (in->curpol->type == QI_VAL)
614 QueryOperand *curpol = &in->curpol->operand;
615 char *op = in->op + curpol->distance;
618 RESIZEBUF(in, curpol->length * (pg_database_encoding_max_length() + 1) + 2 + 6);
623 if (t_iseq(op, '\''))
628 else if (t_iseq(op, '\\'))
633 COPYCHAR(in->cur, op);
641 if (curpol->weight || curpol->prefix)
645 if ( curpol->prefix )
650 if (curpol->weight & (1 << 3))
655 if (curpol->weight & (1 << 2))
660 if (curpol->weight & (1 << 1))
665 if (curpol->weight & 1)
674 else if (in->curpol->operator.oper == OP_NOT)
684 if (in->curpol->type == QI_OPR)
688 sprintf(in->cur, "( ");
689 in->cur = strchr(in->cur, '\0');
696 sprintf(in->cur, " )");
697 in->cur = strchr(in->cur, '\0');
702 int8 op = in->curpol->operator.oper;
706 if (op == OP_OR && !first)
709 sprintf(in->cur, "( ");
710 in->cur = strchr(in->cur, '\0');
713 nrm.curpol = in->curpol;
716 nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen);
718 /* get right operand */
721 /* get & print left operand */
722 in->curpol = nrm.curpol;
725 /* print operator & right operand */
726 RESIZEBUF(in, 3 + (nrm.cur - nrm.buf));
730 sprintf(in->cur, " | %s", nrm.buf);
733 sprintf(in->cur, " & %s", nrm.buf);
736 /* OP_NOT is handled in above if-branch */
737 elog(ERROR, "unrecognized operator type: %d", op);
739 in->cur = strchr(in->cur, '\0');
742 if (op == OP_OR && !first)
745 sprintf(in->cur, " )");
746 in->cur = strchr(in->cur, '\0');
753 tsqueryout(PG_FUNCTION_ARGS)
755 TSQuery query = PG_GETARG_TSQUERY(0);
758 if (query->size == 0)
763 PG_RETURN_POINTER(b);
765 nrm.curpol = GETQUERY(query);
767 nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen);
769 nrm.op = GETOPERAND(query);
772 PG_FREE_IF_COPY(query, 0);
773 PG_RETURN_CSTRING(nrm.buf);
777 * Binary Input / Output functions. The binary format is as follows:
779 * uint32 number of operators/operands in the query
781 * Followed by the operators and operands, in prefix notation. For each
786 * operand text in client encoding, null-terminated
791 * uint8 operator, one of OP_AND, OP_OR, OP_NOT.
794 tsquerysend(PG_FUNCTION_ARGS)
796 TSQuery query = PG_GETARG_TSQUERY(0);
799 QueryItem *item = GETQUERY(query);
801 pq_begintypsend(&buf);
803 pq_sendint(&buf, query->size, sizeof(uint32));
804 for (i = 0; i < query->size; i++)
806 pq_sendint(&buf, item->type, sizeof(item->type));
811 pq_sendint(&buf, item->operand.weight, sizeof(uint8));
812 pq_sendint(&buf, item->operand.prefix, sizeof(uint8));
813 pq_sendstring(&buf, GETOPERAND(query) + item->operand.distance);
816 pq_sendint(&buf, item->operator.oper, sizeof(item->operator.oper));
819 elog(ERROR, "unrecognized tsquery node type: %d", item->type);
824 PG_FREE_IF_COPY(query, 0);
826 PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
830 tsqueryrecv(PG_FUNCTION_ARGS)
832 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
840 const char **operands;
842 size = pq_getmsgint(buf, sizeof(uint32));
843 if (size > (MaxAllocSize / sizeof(QueryItem)))
844 elog(ERROR, "invalid size of tsquery");
846 /* Allocate space to temporarily hold operand strings */
847 operands = palloc(size * sizeof(char *));
849 /* Allocate space for all the QueryItems. */
850 len = HDRSIZETQ + sizeof(QueryItem) * size;
851 query = (TSQuery) palloc0(len);
853 item = GETQUERY(query);
856 for (i = 0; i < size; i++)
858 item->type = (int8) pq_getmsgint(buf, sizeof(int8));
860 if (item->type == QI_VAL)
862 size_t val_len; /* length after recoding to server encoding */
868 weight = (uint8) pq_getmsgint(buf, sizeof(uint8));
869 prefix = (uint8) pq_getmsgint(buf, sizeof(uint8));
870 val = pq_getmsgstring(buf);
871 val_len = strlen(val);
876 elog(ERROR, "invalid tsquery: invalid weight bitmap");
878 if (val_len > MAXSTRLEN)
879 elog(ERROR, "invalid tsquery: operand too long");
881 if (datalen > MAXSTRPOS)
882 elog(ERROR, "invalid tsquery: total operand length exceeded");
887 COMP_CRC32(valcrc, val, val_len);
890 item->operand.weight = weight;
891 item->operand.prefix = (prefix) ? true : false;
892 item->operand.valcrc = (int32) valcrc;
893 item->operand.length = val_len;
894 item->operand.distance = datalen;
897 * Operand strings are copied to the final struct after this loop;
898 * here we just collect them to an array
902 datalen += val_len + 1; /* + 1 for the '\0' terminator */
904 else if (item->type == QI_OPR)
908 oper = (int8) pq_getmsgint(buf, sizeof(int8));
909 if (oper != OP_NOT && oper != OP_OR && oper != OP_AND)
910 elog(ERROR, "invalid tsquery: unrecognized operator type %d",
913 elog(ERROR, "invalid pointer to right operand");
915 item->operator.oper = oper;
918 elog(ERROR, "unrecognized tsquery node type: %d", item->type);
923 /* Enlarge buffer to make room for the operand values. */
924 query = (TSQuery) repalloc(query, len + datalen);
925 item = GETQUERY(query);
926 ptr = GETOPERAND(query);
929 * Fill in the left-pointers. Checks that the tree is well-formed as a
932 findoprnd(item, size);
934 /* Copy operands to output struct */
935 for (i = 0; i < size; i++)
937 if (item->type == QI_VAL)
939 memcpy(ptr, operands[i], item->operand.length + 1);
940 ptr += item->operand.length + 1;
947 Assert(ptr - GETOPERAND(query) == datalen);
949 SET_VARSIZE(query, len + datalen);
951 PG_RETURN_TSVECTOR(query);
955 * debug function, used only for view query
956 * which will be executed in non-leaf pages in index
959 tsquerytree(PG_FUNCTION_ARGS)
961 TSQuery query = PG_GETARG_TSQUERY(0);
967 if (query->size == 0)
969 res = (text *) palloc(VARHDRSZ);
970 SET_VARSIZE(res, VARHDRSZ);
971 PG_RETURN_POINTER(res);
974 q = clean_NOT(GETQUERY(query), &len);
978 res = cstring_to_text("T");
984 nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen);
986 nrm.op = GETOPERAND(query);
988 res = cstring_to_text_with_len(nrm.buf, nrm.cur - nrm.buf);
992 PG_FREE_IF_COPY(query, 0);
994 PG_RETURN_TEXT_P(res);