1 /*-------------------------------------------------------------------------
4 * I/O functions for tsvector
6 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
10 * src/backend/utils/adt/tsvector.c
12 *-------------------------------------------------------------------------
17 #include "libpq/pqformat.h"
18 #include "tsearch/ts_locale.h"
19 #include "tsearch/ts_utils.h"
20 #include "utils/memutils.h"
24 WordEntry entry; /* must be first! */
26 int poslen; /* number of elements in pos */
30 /* Compare two WordEntryPos values for qsort */
/*
 * a and b point at WordEntryPos values.  Only the position extracted with
 * WEP_GETPOS() takes part in the comparison; the weight is not looked at.
 */
32 compareWordEntryPos(const void *a, const void *b)
34 int apos = WEP_GETPOS(*(const WordEntryPos *) a);
35 int bpos = WEP_GETPOS(*(const WordEntryPos *) b);
/*
 * NOTE(review): this listing omits the lines between the declarations and
 * the return below.  Presumably the apos == bpos case is handled there --
 * confirm, because the expression below on its own never yields 0.
 */
39 return (apos > bpos) ? 1 : -1;
43 * Removes duplicate pos entries. If there are two entries with the same pos
44 * but different weights, the higher weight is retained.
49 uniquePos(WordEntryPos *a, int l)
/*
 * a is an array of l WordEntryPos values; it is sorted in place by position
 * (compareWordEntryPos) before duplicates are collapsed.
 */
57 qsort((void *) a, l, sizeof(WordEntryPos), compareWordEntryPos);
/*
 * NOTE(review): the setup and advancement of ptr and res are not visible in
 * this listing; presumably ptr is the element being examined and res the
 * last element kept so far -- confirm.
 */
63 if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
/*
 * Limit check: MAXNUMPOS - 1 as the index of the kept element, or a kept
 * position equal to MAXENTRYPOS - 1.  The action taken is not visible here.
 */
67 if (res - a >= MAXNUMPOS - 1 ||
68 WEP_GETPOS(*res) == MAXENTRYPOS - 1)
/* Same position as *res: keep whichever weight is higher in *res. */
71 else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
72 WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr));
79 /* Compare two WordEntryIN values for qsort */
/*
 * arg is the character buffer that the entries' pos offsets index into; each
 * lexeme is the entry.len bytes starting at &BufferStr[entry.pos].
 *
 * Only the leading "entry" member of each argument is accessed, which is
 * what lets other code in this file hand plain WordEntry pointers to this
 * function (see WordEntryCMP and the qsort_arg call over ARRPTR(vec)).
 */
81 compareentry(const void *va, const void *vb, void *arg)
83 const WordEntryIN *a = (const WordEntryIN *) va;
84 const WordEntryIN *b = (const WordEntryIN *) vb;
85 char *BufferStr = (char *) arg;
/* The ordering itself is delegated to tsCompareString() on the two byte ranges. */
87 return tsCompareString(&BufferStr[a->entry.pos], a->entry.len,
88 &BufferStr[b->entry.pos], b->entry.len,
93 * Sort an array of WordEntryIN, remove duplicates.
94 * *outbuflen receives the amount of space needed for strings and positions.
97 uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
/*
 * a/l are the entries and their count; buf is the text buffer that each
 * entry.pos/entry.len pair indexes into.  The array is sorted with
 * compareentry() first, so equal lexemes are expected to be adjacent.
 */
106 qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry,
114 if (!(ptr->entry.len == res->entry.len &&
115 strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
116 res->entry.len) == 0))
118 /* ptr is a different lexeme: done accumulating data into *res, count space needed */
119 buflen += res->entry.len;
120 if (res->entry.haspos)
/* De-duplicate res's positions first, so only the surviving ones are counted. */
122 res->poslen = uniquePos(res->pos, res->poslen);
/* Position data is counted from a SHORTALIGN'd offset: a uint16 plus the positions. */
123 buflen = SHORTALIGN(buflen);
124 buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
/*
 * Copy ptr into *res to start accumulating the next distinct lexeme.
 * NOTE(review): the statement advancing res is not visible in this listing.
 */
128 memcpy(res, ptr, sizeof(WordEntryIN));
/* Same lexeme as *res: fold ptr's positions, if it has any, into *res. */
130 else if (ptr->entry.haspos)
132 if (res->entry.haspos)
134 /* append ptr's positions to res's positions */
135 int newlen = ptr->poslen + res->poslen;
137 res->pos = (WordEntryPos *)
138 repalloc(res->pos, newlen * sizeof(WordEntryPos));
139 memcpy(&res->pos[res->poslen], ptr->pos,
140 ptr->poslen * sizeof(WordEntryPos));
141 res->poslen = newlen;
146 /* res had no positions of its own: just give ptr's positions to res */
147 res->entry.haspos = 1;
149 res->poslen = ptr->poslen;
155 /* count space needed for last item (same accounting as for earlier items) */
156 buflen += res->entry.len;
157 if (res->entry.haspos)
159 res->poslen = uniquePos(res->pos, res->poslen);
160 buflen = SHORTALIGN(buflen);
161 buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
/*
 * Compare two WordEntry values whose lexeme bytes live in buf.
 *
 * This simply forwards to compareentry().  That function casts its arguments
 * to WordEntryIN but reads only the "entry" member, which WordEntryIN
 * declares first ("must be first!"), so passing WordEntry pointers works.
 */
169 WordEntryCMP(WordEntry *a, WordEntry *b, char *buf)
171 return compareentry(a, b, buf);
/*
 * Text input function for tsvector.
 *
 * Runs the tsvector parser over the cstring argument, recording one
 * WordEntryIN per token in arr[] and appending the token bytes to tmpbuf.
 * The entries are then sorted and de-duplicated by uniqueentry() and packed
 * into a zero-filled TSVector of exactly the computed size.
 */
176 tsvectorin(PG_FUNCTION_ARGS)
178 char *buf = PG_GETARG_CSTRING(0);
179 TSVectorParseState state;
182 int arrlen; /* allocated size of arr */
195 * Tokens are appended to tmpbuf, cur is a pointer to the end of used
200 int buflen = 256; /* allocated size of tmpbuf */
202 state = init_tsvector_parser(buf, false, false);
205 arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
206 cur = tmpbuf = (char *) palloc(buflen);
/* One iteration per token handed back by the parser. */
208 while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
/* A single lexeme of MAXSTRLEN bytes or more is rejected. */
210 if (toklen >= MAXSTRLEN)
212 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
213 errmsg("word is too long (%ld bytes, max %ld bytes)",
215 (long) (MAXSTRLEN - 1))));
/* So is input whose accumulated lexeme bytes already exceed MAXSTRPOS. */
217 if (cur - tmpbuf > MAXSTRPOS)
219 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
220 errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)",
221 (long) (cur - tmpbuf), (long) MAXSTRPOS)));
224 * Enlarge buffers if needed
229 arr = (WordEntryIN *)
230 repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);
/*
 * Grow tmpbuf until the token fits.  dist saves cur's offset into tmpbuf;
 * presumably cur is re-derived from it after repalloc (that statement is
 * not visible in this listing).
 */
232 while ((cur - tmpbuf) + toklen >= buflen)
234 int dist = cur - tmpbuf;
237 tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
/* Record the token: its length, its offset within tmpbuf, and its bytes. */
240 arr[len].entry.len = toklen;
241 arr[len].entry.pos = cur - tmpbuf;
242 memcpy((void *) cur, (void *) token, toklen);
247 arr[len].entry.haspos = 1;
249 arr[len].poslen = poslen;
253 arr[len].entry.haspos = 0;
260 close_tsvector_parser(state);
/*
 * Sort and de-duplicate.  len becomes the value returned by uniqueentry()
 * and buflen receives the space needed for strings and positions.
 */
263 len = uniqueentry(arr, len, tmpbuf, &buflen);
267 if (buflen > MAXSTRPOS)
269 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
270 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS)));
/* Allocate the zero-filled result and stamp its varlena size. */
272 totallen = CALCDATASIZE(len, buflen);
273 in = (TSVector) palloc0(totallen);
274 SET_VARSIZE(in, totallen);
/*
 * Copy every lexeme, plus its position data if any, into strbuf.  Each
 * entry.pos is rewritten from an offset in tmpbuf to the offset in strbuf.
 */
279 for (i = 0; i < len; i++)
281 memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
282 arr[i].entry.pos = stroff;
283 stroff += arr[i].entry.len;
284 if (arr[i].entry.haspos)
/* The count is stored as a uint16 below, so it has to fit in 16 bits. */
286 if (arr[i].poslen > 0xFFFF)
287 elog(ERROR, "positions array too long");
289 /* Copy number of positions */
290 stroff = SHORTALIGN(stroff);
291 *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
292 stroff += sizeof(uint16);
295 memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
296 stroff += arr[i].poslen * sizeof(WordEntryPos);
300 inarr[i] = arr[i].entry;
/* Cross-check: the bytes written must add up to the size allocated above. */
303 Assert((strbuf + stroff - (char *) in) == totallen);
305 PG_RETURN_TSVECTOR(in);
/*
 * Text output function for tsvector.
 *
 * Works in two passes over the entries: the first computes an upper bound on
 * the output length, the second writes the lexemes and their position lists
 * into a palloc'd buffer that is returned as a cstring.
 */
309 tsvectorout(PG_FUNCTION_ARGS)
311 TSVector out = PG_GETARG_TSVECTOR(0);
316 WordEntry *ptr = ARRPTR(out);
/* Pass 1: fixed overhead per lexeme, then per-lexeme text and position costs. */
321 lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
322 for (i = 0; i < out->size; i++)
324 lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
326 lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
/* Pass 2: write the output; curout is the write cursor into outbuf. */
329 curout = outbuf = (char *) palloc(lenbuf);
330 for (i = 0; i < out->size; i++)
332 curbegin = curin = STRPTR(out) + ptr->pos;
/* Walk the lexeme's ptr->len bytes one multibyte character at a time. */
336 while (curin - curbegin < ptr->len)
338 int len = pg_mblen(curin);
/*
 * Quote and backslash characters are singled out here.
 * NOTE(review): what is emitted for them is not visible in this listing.
 */
340 if (t_iseq(curin, '\''))
342 else if (t_iseq(curin, '\\'))
346 *curout++ = *curin++;
/* If the entry carries positions, print each one and then switch on its weight. */
350 if ((pp = POSDATALEN(out, ptr)) != 0)
355 wptr = POSDATAPTR(out, ptr);
358 curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
359 switch (WEP_GETWEIGHT(*wptr))
385 PG_FREE_IF_COPY(out, 0);
386 PG_RETURN_CSTRING(outbuf);
390 * Binary Input / Output functions. The binary format is as follows:
392 * uint32 number of lexemes
395 * lexeme text in client encoding, null-terminated
396 * uint16 number of positions
398 * uint16 WordEntryPos
402 tsvectorsend(PG_FUNCTION_ARGS)
/*
 * Serializes the tsvector argument into a send buffer: first the entry
 * count, then one record per entry, returned as a bytea.
 */
404 TSVector vec = PG_GETARG_TSVECTOR(0);
408 WordEntry *weptr = ARRPTR(vec);
410 pq_begintypsend(&buf);
/* Leading field: the number of entries, sent as a 4-byte integer. */
412 pq_sendint(&buf, vec->size, sizeof(int32));
413 for (i = 0; i < vec->size; i++)
418 * the strings in the TSVector array are not null-terminated, so we
419 * have to send the null-terminator separately
421 pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
422 pq_sendbyte(&buf, '\0');
/* Position count as a uint16, followed by each WordEntryPos value. */
424 npos = POSDATALEN(vec, weptr);
425 pq_sendint(&buf, npos, sizeof(uint16));
429 WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
431 for (j = 0; j < npos; j++)
432 pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos));
437 PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
/*
 * Binary receive function for tsvector.
 *
 * Reads the entry count, then for each entry a null-terminated lexeme, a
 * uint16 position count and that many WordEntryPos values.  Everything is
 * appended to a TSVector buffer that is enlarged with repalloc when the next
 * entry would not fit.  datalen tracks how many bytes of the variable-size
 * area are in use; it supplies each entry's offset and, added to hdrlen, the
 * final varlena size, so it must advance by exactly what was written.
 *
 * Malformed input (bad entry count, over-long lexeme, too many positions,
 * misordered positions) is rejected with elog(ERROR).
 */
441 tsvectorrecv(PG_FUNCTION_ARGS)
443 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
447 int datalen; /* number of bytes used in the variable size
448 * area after fixed size TSVector header and
451 Size len; /* allocated size of vec */
452 bool needSort = false;
/* The entry count comes first; reject counts that are negative or unallocatable. */
454 nentries = pq_getmsgint(buf, sizeof(int32));
455 if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
456 elog(ERROR, "invalid size of tsvector");
458 hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
460 len = hdrlen * 2; /* times two to make room for lexemes */
461 vec = (TSVector) palloc0(len);
462 vec->size = nentries;
465 for (i = 0; i < nentries; i++)
471 lexeme = pq_getmsgstring(buf);
472 npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
476 lex_len = strlen(lexeme);
477 if (lex_len > MAXSTRLEN)
478 elog(ERROR, "invalid tsvector: lexeme too long");
480 if (datalen > MAXSTRPOS)
481 elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded");
483 if (npos > MAXNUMPOS)
484 elog(ERROR, "unexpected number of tsvector positions");
487 * Looks valid. Fill the WordEntry struct, and copy lexeme.
489 * But make sure the buffer is large enough first.
491 while (hdrlen + SHORTALIGN(datalen + lex_len) +
492 (npos + 1) * sizeof(WordEntryPos) >= len)
495 vec = (TSVector) repalloc(vec, len);
/* The lexeme is stored at the current end of the data area. */
498 vec->entries[i].haspos = (npos > 0) ? 1 : 0;
499 vec->entries[i].len = lex_len;
500 vec->entries[i].pos = datalen;
502 memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
506 if (i > 0 && WordEntryCMP(&vec->entries[i],
507 &vec->entries[i - 1],
511 /* Receive positions */
515 WordEntryPos *wepptr;
518 * Pad to 2-byte alignment if necessary. Though we used palloc0
519 * for the initial allocation, subsequent repalloc'd memory areas
520 * are not initialized to zero.
522 if (datalen != SHORTALIGN(datalen))
524 *(STRPTR(vec) + datalen) = '\0';
525 datalen = SHORTALIGN(datalen);
/* The uint16 position count goes first, then the positions themselves. */
528 memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
530 wepptr = POSDATAPTR(vec, &vec->entries[i]);
531 for (j = 0; j < npos; j++)
533 wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
/* Positions must arrive strictly increasing. */
534 if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
535 elog(ERROR, "position information is misordered");
/*
 * Advance past what was just written: one uint16 count plus npos
 * WordEntryPos values.  This has to agree with the space reserved by the
 * enlargement check above; counting sizeof(WordEntry) per element would
 * overstate the used size and could make hdrlen + datalen, the varlena
 * size set below, larger than the allocation.
 */
538 datalen += sizeof(uint16) + npos * sizeof(WordEntryPos);
542 SET_VARSIZE(vec, hdrlen + datalen);
545 qsort_arg((void *) ARRPTR(vec), vec->size, sizeof(WordEntry),
546 compareentry, (void *) STRPTR(vec));
548 PG_RETURN_TSVECTOR(vec);