*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/tsearch/to_tsany.c,v 1.4 2007/09/26 10:09:57 teodor Exp $
+ * $PostgreSQL: pgsql/src/backend/tsearch/to_tsany.c,v 1.5 2007/10/23 00:51:23 tgl Exp $
*
*-------------------------------------------------------------------------
*/
TSVector
make_tsvector(ParsedText *prs)
{
- int4 i,
+ int i,
j,
lenstr = 0,
totallen;
TSVector in;
WordEntry *ptr;
- char *str,
- *cur;
+ char *str;
+ int stroff;
prs->curwords = uniqueWORD(prs->words, prs->curwords);
for (i = 0; i < prs->curwords; i++)
{
- lenstr += SHORTALIGN(prs->words[i].len);
-
+ lenstr += prs->words[i].len;
if (prs->words[i].alen)
+ {
+ lenstr = SHORTALIGN(lenstr);
lenstr += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos);
+ }
}
+ if (lenstr > MAXSTRPOS)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("string is too long for tsvector")));
+
totallen = CALCDATASIZE(prs->curwords, lenstr);
in = (TSVector) palloc0(totallen);
SET_VARSIZE(in, totallen);
in->size = prs->curwords;
ptr = ARRPTR(in);
- cur = str = STRPTR(in);
+ str = STRPTR(in);
+ stroff = 0;
for (i = 0; i < prs->curwords; i++)
{
ptr->len = prs->words[i].len;
- if (cur - str > MAXSTRPOS)
- ereport(ERROR,
- (errcode(ERRCODE_SYNTAX_ERROR),
- errmsg("string is too long for tsvector")));
- ptr->pos = cur - str;
- memcpy((void *) cur, (void *) prs->words[i].word, prs->words[i].len);
+ ptr->pos = stroff;
+ memcpy(str + stroff, prs->words[i].word, prs->words[i].len);
+ stroff += prs->words[i].len;
pfree(prs->words[i].word);
- cur += SHORTALIGN(prs->words[i].len);
if (prs->words[i].alen)
{
+ int k = prs->words[i].pos.apos[0];
WordEntryPos *wptr;
+ if (k > 0xFFFF)
+ elog(ERROR, "positions array too long");
+
ptr->haspos = 1;
- *(uint16 *) cur = prs->words[i].pos.apos[0];
+ stroff = SHORTALIGN(stroff);
+ *(uint16 *) (str + stroff) = (uint16) k;
wptr = POSDATAPTR(in, ptr);
- for (j = 0; j < *(uint16 *) cur; j++)
+ for (j = 0; j < k; j++)
{
WEP_SETWEIGHT(wptr[j], 0);
WEP_SETPOS(wptr[j], prs->words[i].pos.apos[j + 1]);
}
- cur += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos);
+ stroff += sizeof(uint16) + k * sizeof(WordEntryPos);
pfree(prs->words[i].pos.apos);
}
else
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.5 2007/10/21 22:29:56 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.6 2007/10/23 00:51:23 tgl Exp $
*
*-------------------------------------------------------------------------
*/
typedef struct
{
- WordEntry entry; /* should be first ! */
+ WordEntry entry; /* must be first! */
WordEntryPos *pos;
int poslen; /* number of elements in pos */
} WordEntryIN;
+
+/* Compare two WordEntryPos values for qsort */
static int
comparePos(const void *a, const void *b)
{
- int apos = WEP_GETPOS(*(WordEntryPos *) a);
- int bpos = WEP_GETPOS(*(WordEntryPos *) b);
+ int apos = WEP_GETPOS(*(const WordEntryPos *) a);
+ int bpos = WEP_GETPOS(*(const WordEntryPos *) b);
if (apos == bpos)
return 0;
if (l <= 1)
return l;
- res = a;
qsort((void *) a, l, sizeof(WordEntryPos), comparePos);
+ res = a;
ptr = a + 1;
while (ptr - a < l)
{
{
res++;
*res = *ptr;
- if (res - a >= MAXNUMPOS - 1 || WEP_GETPOS(*res) == MAXENTRYPOS - 1)
+ if (res - a >= MAXNUMPOS - 1 ||
+ WEP_GETPOS(*res) == MAXENTRYPOS - 1)
break;
}
else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
return res + 1 - a;
}
+/* Compare two WordEntryIN values for qsort */
static int
compareentry(const void *va, const void *vb, void *arg)
{
+ const WordEntryIN *a = (const WordEntryIN *) va;
+ const WordEntryIN *b = (const WordEntryIN *) vb;
char *BufferStr = (char *) arg;
- WordEntryIN *a = (WordEntryIN *) va;
- WordEntryIN *b = (WordEntryIN *) vb;
if (a->entry.len == b->entry.len)
{
return (a->entry.len > b->entry.len) ? 1 : -1;
}
+/*
+ * Sort an array of WordEntryIN, remove duplicates.
+ * *outbuflen receives the amount of space needed for strings and positions.
+ */
static int
uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen)
{
+ int buflen;
WordEntryIN *ptr,
*res;
Assert(l >= 1);
- if (l == 1)
- {
- if (a->entry.haspos)
- {
- a->poslen = uniquePos(a->pos, a->poslen);
- *outbuflen = SHORTALIGN(a->entry.len) + (a->poslen + 1) * sizeof(WordEntryPos);
- }
- else
- *outbuflen = a->entry.len;
+ if (l > 1)
+ qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry,
+ (void *) buf);
- return l;
- }
+ buflen = 0;
res = a;
-
ptr = a + 1;
- qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry, (void *) buf);
-
while (ptr - a < l)
{
if (!(ptr->entry.len == res->entry.len &&
- strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos], res->entry.len) == 0))
+ strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
+ res->entry.len) == 0))
{
+ /* done accumulating data into *res, count space needed */
+ buflen += res->entry.len;
if (res->entry.haspos)
{
- *outbuflen += SHORTALIGN(res->entry.len);
res->poslen = uniquePos(res->pos, res->poslen);
- *outbuflen += res->poslen * sizeof(WordEntryPos);
+ buflen = SHORTALIGN(buflen);
+ buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
}
- else
- *outbuflen += res->entry.len;
res++;
memcpy(res, ptr, sizeof(WordEntryIN));
}
{
if (res->entry.haspos)
{
+ /* append ptr's positions to res's positions */
int newlen = ptr->poslen + res->poslen;
- /* Append res to pos */
-
- res->pos = (WordEntryPos *) repalloc(res->pos, newlen * sizeof(WordEntryPos));
- memcpy(&res->pos[res->poslen],
- ptr->pos, ptr->poslen * sizeof(WordEntryPos));
+ res->pos = (WordEntryPos *)
+ repalloc(res->pos, newlen * sizeof(WordEntryPos));
+ memcpy(&res->pos[res->poslen], ptr->pos,
+ ptr->poslen * sizeof(WordEntryPos));
res->poslen = newlen;
pfree(ptr->pos);
}
else
{
+ /* just give ptr's positions to res */
res->entry.haspos = 1;
res->pos = ptr->pos;
+ res->poslen = ptr->poslen;
}
}
ptr++;
}
- /* add last item */
-
+ /* count space needed for last item */
+ buflen += res->entry.len;
if (res->entry.haspos)
{
- *outbuflen += SHORTALIGN(res->entry.len);
-
res->poslen = uniquePos(res->pos, res->poslen);
- *outbuflen += res->poslen * sizeof(WordEntryPos);
+ buflen = SHORTALIGN(buflen);
+ buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
}
- else
- *outbuflen += res->entry.len;
+ *outbuflen = buflen;
return res + 1 - a;
}
int toklen;
WordEntryPos *pos;
int poslen;
+ char *strbuf;
+ int stroff;
/*
* Tokens are appended to tmpbuf, cur is a pointer
while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
{
-
if (toklen >= MAXSTRLEN)
ereport(ERROR,
- (errcode(ERRCODE_SYNTAX_ERROR),
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("word is too long (%ld bytes, max %ld bytes)",
(long) toklen,
- (long) MAXSTRLEN)));
-
+ (long) (MAXSTRLEN-1))));
if (cur - tmpbuf > MAXSTRPOS)
ereport(ERROR,
- (errcode(ERRCODE_SYNTAX_ERROR),
- errmsg("position value is too large")));
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("string is too long for tsvector")));
/*
* Enlarge buffers if needed
if (len >= arrlen)
{
arrlen *= 2;
- arr = (WordEntryIN *) repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);
+ arr = (WordEntryIN *)
+ repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);
}
while ((cur - tmpbuf) + toklen >= buflen)
{
arr[len].poslen = poslen;
}
else
+ {
arr[len].entry.haspos = 0;
+ arr[len].pos = NULL;
+ arr[len].poslen = 0;
+ }
len++;
}
len = uniqueentry(arr, len, tmpbuf, &buflen);
else
buflen = 0;
+
+ if (buflen > MAXSTRPOS)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("string is too long for tsvector")));
+
totallen = CALCDATASIZE(len, buflen);
in = (TSVector) palloc0(totallen);
-
SET_VARSIZE(in, totallen);
in->size = len;
- cur = STRPTR(in);
inarr = ARRPTR(in);
+ strbuf = STRPTR(in);
+ stroff = 0;
for (i = 0; i < len; i++)
{
- memcpy((void *) cur, (void *) &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
- arr[i].entry.pos = cur - STRPTR(in);
- cur += SHORTALIGN(arr[i].entry.len);
+ memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
+ arr[i].entry.pos = stroff;
+ stroff += arr[i].entry.len;
if (arr[i].entry.haspos)
{
- uint16 tmplen;
-
- if(arr[i].poslen > 0xFFFF)
+ if (arr[i].poslen > 0xFFFF)
elog(ERROR, "positions array too long");
- tmplen = (uint16) arr[i].poslen;
-
- /* Copy length to output struct */
- memcpy(cur, &tmplen, sizeof(uint16));
- cur += sizeof(uint16);
+ /* Copy number of positions */
+ stroff = SHORTALIGN(stroff);
+ *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
+ stroff += sizeof(uint16);
/* Copy positions */
- memcpy(cur, arr[i].pos, (arr[i].poslen) * sizeof(WordEntryPos));
- cur += arr[i].poslen * sizeof(WordEntryPos);
+ memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
+ stroff += arr[i].poslen * sizeof(WordEntryPos);
pfree(arr[i].pos);
}
inarr[i] = arr[i].entry;
}
+ Assert((strbuf + stroff - (char *) in) == totallen);
+
PG_RETURN_TSVECTOR(in);
}
datalen += lex_len;
- if (i > 0 && WordEntryCMP(&vec->entries[i], &vec->entries[i - 1], STRPTR(vec)) <= 0)
+ if (i > 0 && WordEntryCMP(&vec->entries[i],
+ &vec->entries[i - 1],
+ STRPTR(vec)) <= 0)
elog(ERROR, "lexemes are misordered");
/* Receive positions */
-
if (npos > 0)
{
uint16 j;
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_op.c,v 1.5 2007/09/11 08:46:29 teodor Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_op.c,v 1.6 2007/10/23 00:51:23 tgl Exp $
*
*-------------------------------------------------------------------------
*/
return (a->len > b->len) ? 1 : -1;
}
+/*
+ * Add positions from src to dest after offsetting them by maxpos.
+ * Return the number added (might be less than expected due to overflow).
+ */
static int4
-add_pos(TSVector src, WordEntry * srcptr, TSVector dest, WordEntry * destptr, int4 maxpos)
+add_pos(TSVector src, WordEntry * srcptr,
+ TSVector dest, WordEntry * destptr,
+ int4 maxpos)
{
uint16 *clen = &_POSVECPTR(dest, destptr)->npos;
int i;
*clen = 0;
startlen = *clen;
- for (i = 0; i < slen && *clen < MAXNUMPOS && (*clen == 0 || WEP_GETPOS(dpos[*clen - 1]) != MAXENTRYPOS - 1); i++)
+ for (i = 0;
+ i < slen && *clen < MAXNUMPOS &&
+ (*clen == 0 || WEP_GETPOS(dpos[*clen - 1]) != MAXENTRYPOS - 1);
+ i++)
{
WEP_SETWEIGHT(dpos[*clen], WEP_GETWEIGHT(spos[i]));
WEP_SETPOS(dpos[*clen], LIMITPOS(WEP_GETPOS(spos[i]) + maxpos));
i,
j,
i1,
- i2;
- char *cur;
+ i2,
+ dataoff;
char *data,
*data1,
*data2;
data2 = STRPTR(in2);
i1 = in1->size;
i2 = in2->size;
+ /* conservative estimate of space needed */
out = (TSVector) palloc0(VARSIZE(in1) + VARSIZE(in2));
SET_VARSIZE(out, VARSIZE(in1) + VARSIZE(in2));
out->size = in1->size + in2->size;
- data = cur = STRPTR(out);
ptr = ARRPTR(out);
+ data = STRPTR(out);
+ dataoff = 0;
while (i1 && i2)
{
int cmp = compareEntry(data1, ptr1, data2, ptr2);
{ /* in1 first */
ptr->haspos = ptr1->haspos;
ptr->len = ptr1->len;
- memcpy(cur, data1 + ptr1->pos, ptr1->len);
- ptr->pos = cur - data;
+ memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
+ ptr->pos = dataoff;
+ dataoff += ptr1->len;
if (ptr->haspos)
{
- cur += SHORTALIGN(ptr1->len);
- memcpy(cur, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
- cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
+ dataoff = SHORTALIGN(dataoff);
+ memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
+ dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
}
- else
- cur += ptr1->len;
ptr++;
ptr1++;
{ /* in2 first */
ptr->haspos = ptr2->haspos;
ptr->len = ptr2->len;
- memcpy(cur, data2 + ptr2->pos, ptr2->len);
- ptr->pos = cur - data;
+ memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len);
+ ptr->pos = dataoff;
+ dataoff += ptr2->len;
if (ptr->haspos)
{
int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
- cur += SHORTALIGN(ptr2->len);
-
if (addlen == 0)
ptr->haspos = 0;
else
- cur += addlen * sizeof(WordEntryPos) + sizeof(uint16);
+ {
+ dataoff = SHORTALIGN(dataoff);
+ dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
+ }
}
- else
- cur += ptr2->len;
ptr++;
ptr2++;
{
ptr->haspos = ptr1->haspos | ptr2->haspos;
ptr->len = ptr1->len;
- memcpy(cur, data1 + ptr1->pos, ptr1->len);
- ptr->pos = cur - data;
+ memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
+ ptr->pos = dataoff;
+ dataoff += ptr1->len;
if (ptr->haspos)
{
- cur += SHORTALIGN(ptr1->len);
if (ptr1->haspos)
{
- memcpy(cur, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
- cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
+ dataoff = SHORTALIGN(dataoff);
+ memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
+ dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
if (ptr2->haspos)
- cur += add_pos(in2, ptr2, out, ptr, maxpos) * sizeof(WordEntryPos);
+ dataoff += add_pos(in2, ptr2, out, ptr, maxpos) * sizeof(WordEntryPos);
}
- else if (ptr2->haspos)
+ else /* must have ptr2->haspos */
{
int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
if (addlen == 0)
ptr->haspos = 0;
else
- cur += addlen * sizeof(WordEntryPos) + sizeof(uint16);
+ {
+ dataoff = SHORTALIGN(dataoff);
+ dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
+ }
}
}
- else
- cur += ptr1->len;
ptr++;
ptr1++;
{
ptr->haspos = ptr1->haspos;
ptr->len = ptr1->len;
- memcpy(cur, data1 + ptr1->pos, ptr1->len);
- ptr->pos = cur - data;
+ memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
+ ptr->pos = dataoff;
+ dataoff += ptr1->len;
if (ptr->haspos)
{
- cur += SHORTALIGN(ptr1->len);
- memcpy(cur, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
- cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
+ dataoff = SHORTALIGN(dataoff);
+ memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
+ dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
}
- else
- cur += ptr1->len;
ptr++;
ptr1++;
{
ptr->haspos = ptr2->haspos;
ptr->len = ptr2->len;
- memcpy(cur, data2 + ptr2->pos, ptr2->len);
- ptr->pos = cur - data;
+ memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len);
+ ptr->pos = dataoff;
+ dataoff += ptr2->len;
if (ptr->haspos)
{
int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
- cur += SHORTALIGN(ptr2->len);
-
if (addlen == 0)
ptr->haspos = 0;
else
- cur += addlen * sizeof(WordEntryPos) + sizeof(uint16);
+ {
+ dataoff = SHORTALIGN(dataoff);
+ dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
+ }
}
- else
- cur += ptr2->len;
ptr++;
ptr2++;
i2--;
}
+ /*
+ * Instead of checking each offset individually, we check for overflow
+ * of pos fields once at the end.
+ */
+ if (dataoff > MAXSTRPOS)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("string is too long for tsvector")));
+
out->size = ptr - ARRPTR(out);
- SET_VARSIZE(out, CALCDATASIZE(out->size, cur - data));
+ SET_VARSIZE(out, CALCDATASIZE(out->size, dataoff));
if (data != STRPTR(out))
- memmove(STRPTR(out), data, cur - data);
+ memmove(STRPTR(out), data, dataoff);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);