]> granicus.if.org Git - postgresql/blob - src/backend/utils/adt/tsvector.c
Stamp copyrights for year 2011.
[postgresql] / src / backend / utils / adt / tsvector.c
1 /*-------------------------------------------------------------------------
2  *
3  * tsvector.c
4  *        I/O functions for tsvector
5  *
6  * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *        src/backend/utils/adt/tsvector.c
11  *
12  *-------------------------------------------------------------------------
13  */
14
15 #include "postgres.h"
16
17 #include "libpq/pqformat.h"
18 #include "tsearch/ts_type.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_utils.h"
21 #include "utils/memutils.h"
22
23 typedef struct
24 {
25         WordEntry       entry;                  /* must be first! */
26         WordEntryPos *pos;
27         int                     poslen;                 /* number of elements in pos */
28 } WordEntryIN;
29
30
31 /* Compare two WordEntryPos values for qsort */
32 static int
33 comparePos(const void *a, const void *b)
34 {
35         int                     apos = WEP_GETPOS(*(const WordEntryPos *) a);
36         int                     bpos = WEP_GETPOS(*(const WordEntryPos *) b);
37
38         if (apos == bpos)
39                 return 0;
40         return (apos > bpos) ? 1 : -1;
41 }
42
43 /*
44  * Removes duplicate pos entries. If there's two entries with same pos
45  * but different weight, the higher weight is retained.
46  *
47  * Returns new length.
48  */
49 static int
50 uniquePos(WordEntryPos *a, int l)
51 {
52         WordEntryPos *ptr,
53                            *res;
54
55         if (l <= 1)
56                 return l;
57
58         qsort((void *) a, l, sizeof(WordEntryPos), comparePos);
59
60         res = a;
61         ptr = a + 1;
62         while (ptr - a < l)
63         {
64                 if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
65                 {
66                         res++;
67                         *res = *ptr;
68                         if (res - a >= MAXNUMPOS - 1 ||
69                                 WEP_GETPOS(*res) == MAXENTRYPOS - 1)
70                                 break;
71                 }
72                 else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
73                         WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr));
74                 ptr++;
75         }
76
77         return res + 1 - a;
78 }
79
80 /* Compare two WordEntryIN values for qsort */
81 static int
82 compareentry(const void *va, const void *vb, void *arg)
83 {
84         const WordEntryIN *a = (const WordEntryIN *) va;
85         const WordEntryIN *b = (const WordEntryIN *) vb;
86         char       *BufferStr = (char *) arg;
87
88         return tsCompareString(&BufferStr[a->entry.pos], a->entry.len,
89                                                    &BufferStr[b->entry.pos], b->entry.len,
90                                                    false);
91 }
92
93 /*
94  * Sort an array of WordEntryIN, remove duplicates.
95  * *outbuflen receives the amount of space needed for strings and positions.
96  */
97 static int
98 uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
99 {
100         int                     buflen;
101         WordEntryIN *ptr,
102                            *res;
103
104         Assert(l >= 1);
105
106         if (l > 1)
107                 qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry,
108                                   (void *) buf);
109
110         buflen = 0;
111         res = a;
112         ptr = a + 1;
113         while (ptr - a < l)
114         {
115                 if (!(ptr->entry.len == res->entry.len &&
116                           strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
117                                           res->entry.len) == 0))
118                 {
119                         /* done accumulating data into *res, count space needed */
120                         buflen += res->entry.len;
121                         if (res->entry.haspos)
122                         {
123                                 res->poslen = uniquePos(res->pos, res->poslen);
124                                 buflen = SHORTALIGN(buflen);
125                                 buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
126                         }
127                         res++;
128                         memcpy(res, ptr, sizeof(WordEntryIN));
129                 }
130                 else if (ptr->entry.haspos)
131                 {
132                         if (res->entry.haspos)
133                         {
134                                 /* append ptr's positions to res's positions */
135                                 int                     newlen = ptr->poslen + res->poslen;
136
137                                 res->pos = (WordEntryPos *)
138                                         repalloc(res->pos, newlen * sizeof(WordEntryPos));
139                                 memcpy(&res->pos[res->poslen], ptr->pos,
140                                            ptr->poslen * sizeof(WordEntryPos));
141                                 res->poslen = newlen;
142                                 pfree(ptr->pos);
143                         }
144                         else
145                         {
146                                 /* just give ptr's positions to pos */
147                                 res->entry.haspos = 1;
148                                 res->pos = ptr->pos;
149                                 res->poslen = ptr->poslen;
150                         }
151                 }
152                 ptr++;
153         }
154
155         /* count space needed for last item */
156         buflen += res->entry.len;
157         if (res->entry.haspos)
158         {
159                 res->poslen = uniquePos(res->pos, res->poslen);
160                 buflen = SHORTALIGN(buflen);
161                 buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
162         }
163
164         *outbuflen = buflen;
165         return res + 1 - a;
166 }
167
168 static int
169 WordEntryCMP(WordEntry *a, WordEntry *b, char *buf)
170 {
171         return compareentry(a, b, buf);
172 }
173
174
175 Datum
176 tsvectorin(PG_FUNCTION_ARGS)
177 {
178         char       *buf = PG_GETARG_CSTRING(0);
179         TSVectorParseState state;
180         WordEntryIN *arr;
181         int                     totallen;
182         int                     arrlen;                 /* allocated size of arr */
183         WordEntry  *inarr;
184         int                     len = 0;
185         TSVector        in;
186         int                     i;
187         char       *token;
188         int                     toklen;
189         WordEntryPos *pos;
190         int                     poslen;
191         char       *strbuf;
192         int                     stroff;
193
194         /*
195          * Tokens are appended to tmpbuf, cur is a pointer to the end of used
196          * space in tmpbuf.
197          */
198         char       *tmpbuf;
199         char       *cur;
200         int                     buflen = 256;   /* allocated size of tmpbuf */
201
202         pg_verifymbstr(buf, strlen(buf), false);
203
204         state = init_tsvector_parser(buf, false, false);
205
206         arrlen = 64;
207         arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
208         cur = tmpbuf = (char *) palloc(buflen);
209
210         while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
211         {
212                 if (toklen >= MAXSTRLEN)
213                         ereport(ERROR,
214                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
215                                          errmsg("word is too long (%ld bytes, max %ld bytes)",
216                                                         (long) toklen,
217                                                         (long) (MAXSTRLEN - 1))));
218
219                 if (cur - tmpbuf > MAXSTRPOS)
220                         ereport(ERROR,
221                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
222                                          errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)",
223                                                         (long) (cur - tmpbuf), (long) MAXSTRPOS)));
224
225                 /*
226                  * Enlarge buffers if needed
227                  */
228                 if (len >= arrlen)
229                 {
230                         arrlen *= 2;
231                         arr = (WordEntryIN *)
232                                 repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);
233                 }
234                 while ((cur - tmpbuf) + toklen >= buflen)
235                 {
236                         int                     dist = cur - tmpbuf;
237
238                         buflen *= 2;
239                         tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
240                         cur = tmpbuf + dist;
241                 }
242                 arr[len].entry.len = toklen;
243                 arr[len].entry.pos = cur - tmpbuf;
244                 memcpy((void *) cur, (void *) token, toklen);
245                 cur += toklen;
246
247                 if (poslen != 0)
248                 {
249                         arr[len].entry.haspos = 1;
250                         arr[len].pos = pos;
251                         arr[len].poslen = poslen;
252                 }
253                 else
254                 {
255                         arr[len].entry.haspos = 0;
256                         arr[len].pos = NULL;
257                         arr[len].poslen = 0;
258                 }
259                 len++;
260         }
261
262         close_tsvector_parser(state);
263
264         if (len > 0)
265                 len = uniqueentry(arr, len, tmpbuf, &buflen);
266         else
267                 buflen = 0;
268
269         if (buflen > MAXSTRPOS)
270                 ereport(ERROR,
271                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
272                                  errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS)));
273
274         totallen = CALCDATASIZE(len, buflen);
275         in = (TSVector) palloc0(totallen);
276         SET_VARSIZE(in, totallen);
277         in->size = len;
278         inarr = ARRPTR(in);
279         strbuf = STRPTR(in);
280         stroff = 0;
281         for (i = 0; i < len; i++)
282         {
283                 memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
284                 arr[i].entry.pos = stroff;
285                 stroff += arr[i].entry.len;
286                 if (arr[i].entry.haspos)
287                 {
288                         if (arr[i].poslen > 0xFFFF)
289                                 elog(ERROR, "positions array too long");
290
291                         /* Copy number of positions */
292                         stroff = SHORTALIGN(stroff);
293                         *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
294                         stroff += sizeof(uint16);
295
296                         /* Copy positions */
297                         memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
298                         stroff += arr[i].poslen * sizeof(WordEntryPos);
299
300                         pfree(arr[i].pos);
301                 }
302                 inarr[i] = arr[i].entry;
303         }
304
305         Assert((strbuf + stroff - (char *) in) == totallen);
306
307         PG_RETURN_TSVECTOR(in);
308 }
309
310 Datum
311 tsvectorout(PG_FUNCTION_ARGS)
312 {
313         TSVector        out = PG_GETARG_TSVECTOR(0);
314         char       *outbuf;
315         int4            i,
316                                 lenbuf = 0,
317                                 pp;
318         WordEntry  *ptr = ARRPTR(out);
319         char       *curbegin,
320                            *curin,
321                            *curout;
322
323         lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
324         for (i = 0; i < out->size; i++)
325         {
326                 lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
327                 if (ptr[i].haspos)
328                         lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
329         }
330
331         curout = outbuf = (char *) palloc(lenbuf);
332         for (i = 0; i < out->size; i++)
333         {
334                 curbegin = curin = STRPTR(out) + ptr->pos;
335                 if (i != 0)
336                         *curout++ = ' ';
337                 *curout++ = '\'';
338                 while (curin - curbegin < ptr->len)
339                 {
340                         int                     len = pg_mblen(curin);
341
342                         if (t_iseq(curin, '\''))
343                                 *curout++ = '\'';
344                         else if (t_iseq(curin, '\\'))
345                                 *curout++ = '\\';
346
347                         while (len--)
348                                 *curout++ = *curin++;
349                 }
350
351                 *curout++ = '\'';
352                 if ((pp = POSDATALEN(out, ptr)) != 0)
353                 {
354                         WordEntryPos *wptr;
355
356                         *curout++ = ':';
357                         wptr = POSDATAPTR(out, ptr);
358                         while (pp)
359                         {
360                                 curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
361                                 switch (WEP_GETWEIGHT(*wptr))
362                                 {
363                                         case 3:
364                                                 *curout++ = 'A';
365                                                 break;
366                                         case 2:
367                                                 *curout++ = 'B';
368                                                 break;
369                                         case 1:
370                                                 *curout++ = 'C';
371                                                 break;
372                                         case 0:
373                                         default:
374                                                 break;
375                                 }
376
377                                 if (pp > 1)
378                                         *curout++ = ',';
379                                 pp--;
380                                 wptr++;
381                         }
382                 }
383                 ptr++;
384         }
385
386         *curout = '\0';
387         PG_FREE_IF_COPY(out, 0);
388         PG_RETURN_CSTRING(outbuf);
389 }
390
391 /*
392  * Binary Input / Output functions. The binary format is as follows:
393  *
394  * uint32       number of lexemes
395  *
396  * for each lexeme:
397  *              lexeme text in client encoding, null-terminated
398  *              uint16  number of positions
399  *              for each position:
400  *                      uint16 WordEntryPos
401  */
402
403 Datum
404 tsvectorsend(PG_FUNCTION_ARGS)
405 {
406         TSVector        vec = PG_GETARG_TSVECTOR(0);
407         StringInfoData buf;
408         int                     i,
409                                 j;
410         WordEntry  *weptr = ARRPTR(vec);
411
412         pq_begintypsend(&buf);
413
414         pq_sendint(&buf, vec->size, sizeof(int32));
415         for (i = 0; i < vec->size; i++)
416         {
417                 uint16          npos;
418
419                 /*
420                  * the strings in the TSVector array are not null-terminated, so we
421                  * have to send the null-terminator separately
422                  */
423                 pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
424                 pq_sendbyte(&buf, '\0');
425
426                 npos = POSDATALEN(vec, weptr);
427                 pq_sendint(&buf, npos, sizeof(uint16));
428
429                 if (npos > 0)
430                 {
431                         WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
432
433                         for (j = 0; j < npos; j++)
434                                 pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos));
435                 }
436                 weptr++;
437         }
438
439         PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
440 }
441
442 Datum
443 tsvectorrecv(PG_FUNCTION_ARGS)
444 {
445         StringInfo      buf = (StringInfo) PG_GETARG_POINTER(0);
446         TSVector        vec;
447         int                     i;
448         int32           nentries;
449         int                     datalen;                /* number of bytes used in the variable size
450                                                                  * area after fixed size TSVector header and
451                                                                  * WordEntries */
452         Size            hdrlen;
453         Size            len;                    /* allocated size of vec */
454         bool            needSort = false;
455
456         nentries = pq_getmsgint(buf, sizeof(int32));
457         if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
458                 elog(ERROR, "invalid size of tsvector");
459
460         hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
461
462         len = hdrlen * 2;                       /* times two to make room for lexemes */
463         vec = (TSVector) palloc0(len);
464         vec->size = nentries;
465
466         datalen = 0;
467         for (i = 0; i < nentries; i++)
468         {
469                 const char *lexeme;
470                 uint16          npos;
471                 size_t          lex_len;
472
473                 lexeme = pq_getmsgstring(buf);
474                 npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
475
476                 /* sanity checks */
477
478                 lex_len = strlen(lexeme);
479                 if (lex_len > MAXSTRLEN)
480                         elog(ERROR, "invalid tsvector: lexeme too long");
481
482                 if (datalen > MAXSTRPOS)
483                         elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded");
484
485                 if (npos > MAXNUMPOS)
486                         elog(ERROR, "unexpected number of tsvector positions");
487
488                 /*
489                  * Looks valid. Fill the WordEntry struct, and copy lexeme.
490                  *
491                  * But make sure the buffer is large enough first.
492                  */
493                 while (hdrlen + SHORTALIGN(datalen + lex_len) +
494                            (npos + 1) * sizeof(WordEntryPos) >= len)
495                 {
496                         len *= 2;
497                         vec = (TSVector) repalloc(vec, len);
498                 }
499
500                 vec->entries[i].haspos = (npos > 0) ? 1 : 0;
501                 vec->entries[i].len = lex_len;
502                 vec->entries[i].pos = datalen;
503
504                 memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
505
506                 datalen += lex_len;
507
508                 if (i > 0 && WordEntryCMP(&vec->entries[i],
509                                                                   &vec->entries[i - 1],
510                                                                   STRPTR(vec)) <= 0)
511                         needSort = true;
512
513                 /* Receive positions */
514                 if (npos > 0)
515                 {
516                         uint16          j;
517                         WordEntryPos *wepptr;
518
519                         /*
520                          * Pad to 2-byte alignment if necessary. Though we used palloc0
521                          * for the initial allocation, subsequent repalloc'd memory areas
522                          * are not initialized to zero.
523                          */
524                         if (datalen != SHORTALIGN(datalen))
525                         {
526                                 *(STRPTR(vec) + datalen) = '\0';
527                                 datalen = SHORTALIGN(datalen);
528                         }
529
530                         memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
531
532                         wepptr = POSDATAPTR(vec, &vec->entries[i]);
533                         for (j = 0; j < npos; j++)
534                         {
535                                 wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
536                                 if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
537                                         elog(ERROR, "position information is misordered");
538                         }
539
540                         datalen += (npos + 1) * sizeof(WordEntry);
541                 }
542         }
543
544         SET_VARSIZE(vec, hdrlen + datalen);
545
546         if (needSort)
547                 qsort_arg((void *) ARRPTR(vec), vec->size, sizeof(WordEntry),
548                                   compareentry, (void *) STRPTR(vec));
549
550         PG_RETURN_TSVECTOR(vec);
551 }