]> granicus.if.org Git - postgresql/blob - contrib/tsearch2/rank.c
Standard pgindent run for 8.1.
[postgresql] / contrib / tsearch2 / rank.c
1 /*
2  * Relevation
3  * Teodor Sigaev <teodor@sigaev.ru>
4  */
5 #include "postgres.h"
6 #include <math.h>
7
8 #include "access/gist.h"
9 #include "access/itup.h"
10 #include "utils/builtins.h"
11 #include "fmgr.h"
12 #include "funcapi.h"
13 #include "storage/bufpage.h"
14 #include "executor/spi.h"
15 #include "commands/trigger.h"
16 #include "nodes/pg_list.h"
17 #include "catalog/namespace.h"
18
19 #include "utils/array.h"
20
21 #include "tsvector.h"
22 #include "query.h"
23 #include "common.h"
24
25 PG_FUNCTION_INFO_V1(rank);
26 Datum           rank(PG_FUNCTION_ARGS);
27
28 PG_FUNCTION_INFO_V1(rank_def);
29 Datum           rank_def(PG_FUNCTION_ARGS);
30
31 PG_FUNCTION_INFO_V1(rank_cd);
32 Datum           rank_cd(PG_FUNCTION_ARGS);
33
34 PG_FUNCTION_INFO_V1(rank_cd_def);
35 Datum           rank_cd_def(PG_FUNCTION_ARGS);
36
37 PG_FUNCTION_INFO_V1(get_covers);
38 Datum           get_covers(PG_FUNCTION_ARGS);
39
40 static float weights[] = {0.1, 0.2, 0.4, 1.0};
41
42 #define wpos(wep)       ( w[ WEP_GETWEIGHT(wep) ] )
43
44 #define DEF_NORM_METHOD 0
45
46 static float calc_rank_or(float *w, tsvector * t, QUERYTYPE * q);
47 static float calc_rank_and(float *w, tsvector * t, QUERYTYPE * q);
48
49 /*
50  * Returns a weight of a word collocation
51  */
52 static float4
53 word_distance(int4 w)
54 {
55         if (w > 100)
56                 return 1e-30;
57
58         return 1.0 / (1.005 + 0.05 * exp(((float4) w) / 1.5 - 2));
59 }
60
61 static int
62 cnt_length(tsvector * t)
63 {
64         WordEntry  *ptr = ARRPTR(t),
65                            *end = (WordEntry *) STRPTR(t);
66         int                     len = 0,
67                                 clen;
68
69         while (ptr < end)
70         {
71                 if ((clen = POSDATALEN(t, ptr)) == 0)
72                         len += 1;
73                 else
74                         len += clen;
75                 ptr++;
76         }
77
78         return len;
79 }
80
81 static int4
82 WordECompareITEM(char *eval, char *qval, WordEntry * ptr, ITEM * item)
83 {
84         if (ptr->len == item->length)
85                 return strncmp(
86                                            eval + ptr->pos,
87                                            qval + item->distance,
88                                            item->length);
89
90         return (ptr->len > item->length) ? 1 : -1;
91 }
92
93 static WordEntry *
94 find_wordentry(tsvector * t, QUERYTYPE * q, ITEM * item)
95 {
96         WordEntry  *StopLow = ARRPTR(t);
97         WordEntry  *StopHigh = (WordEntry *) STRPTR(t);
98         WordEntry  *StopMiddle;
99         int                     difference;
100
101         /* Loop invariant: StopLow <= item < StopHigh */
102
103         while (StopLow < StopHigh)
104         {
105                 StopMiddle = StopLow + (StopHigh - StopLow) / 2;
106                 difference = WordECompareITEM(STRPTR(t), GETOPERAND(q), StopMiddle, item);
107                 if (difference == 0)
108                         return StopMiddle;
109                 else if (difference < 0)
110                         StopLow = StopMiddle + 1;
111                 else
112                         StopHigh = StopMiddle;
113         }
114
115         return NULL;
116 }
117
118
119 static char *SortAndUniqOperand = NULL;
120
121 static int
122 compareITEM(const void *a, const void *b)
123 {
124         if ((*(ITEM **) a)->length == (*(ITEM **) b)->length)
125                 return strncmp(SortAndUniqOperand + (*(ITEM **) a)->distance,
126                                            SortAndUniqOperand + (*(ITEM **) b)->distance,
127                                            (*(ITEM **) b)->length);
128
129         return ((*(ITEM **) a)->length > (*(ITEM **) b)->length) ? 1 : -1;
130 }
131
132 static ITEM **
133 SortAndUniqItems(char *operand, ITEM * item, int *size)
134 {
135         ITEM      **res,
136                           **ptr,
137                           **prevptr;
138
139         ptr = res = (ITEM **) palloc(sizeof(ITEM *) * *size);
140
141         while ((*size)--)
142         {
143                 if (item->type == VAL)
144                 {
145                         *ptr = item;
146                         ptr++;
147                 }
148                 item++;
149         }
150
151         *size = ptr - res;
152         if (*size < 2)
153                 return res;
154
155         SortAndUniqOperand = operand;
156         qsort(res, *size, sizeof(ITEM **), compareITEM);
157
158         ptr = res + 1;
159         prevptr = res;
160
161         while (ptr - res < *size)
162         {
163                 if (compareITEM((void *) ptr, (void *) prevptr) != 0)
164                 {
165                         prevptr++;
166                         *prevptr = *ptr;
167                 }
168                 ptr++;
169         }
170
171         *size = prevptr + 1 - res;
172         return res;
173 }
174
175 static WordEntryPos POSNULL[] = {
176         0,
177         0
178 };
179
180 static float
181 calc_rank_and(float *w, tsvector * t, QUERYTYPE * q)
182 {
183         uint16    **pos;
184         int                     i,
185                                 k,
186                                 l,
187                                 p;
188         WordEntry  *entry;
189         WordEntryPos *post,
190                            *ct;
191         int4            dimt,
192                                 lenct,
193                                 dist;
194         float           res = -1.0;
195         ITEM      **item;
196         int                     size = q->size;
197
198         item = SortAndUniqItems(GETOPERAND(q), GETQUERY(q), &size);
199         if (size < 2)
200         {
201                 pfree(item);
202                 return calc_rank_or(w, t, q);
203         }
204         pos = (uint16 **) palloc(sizeof(uint16 *) * q->size);
205         memset(pos, 0, sizeof(uint16 *) * q->size);
206         *(uint16 *) POSNULL = lengthof(POSNULL) - 1;
207         WEP_SETPOS(POSNULL[1], MAXENTRYPOS - 1);
208
209         for (i = 0; i < size; i++)
210         {
211                 entry = find_wordentry(t, q, item[i]);
212                 if (!entry)
213                         continue;
214
215                 if (entry->haspos)
216                         pos[i] = (uint16 *) _POSDATAPTR(t, entry);
217                 else
218                         pos[i] = (uint16 *) POSNULL;
219
220
221                 dimt = *(uint16 *) (pos[i]);
222                 post = (WordEntryPos *) (pos[i] + 1);
223                 for (k = 0; k < i; k++)
224                 {
225                         if (!pos[k])
226                                 continue;
227                         lenct = *(uint16 *) (pos[k]);
228                         ct = (WordEntryPos *) (pos[k] + 1);
229                         for (l = 0; l < dimt; l++)
230                         {
231                                 for (p = 0; p < lenct; p++)
232                                 {
233                                         dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p]));
234                                         if (dist || (dist == 0 && (pos[i] == (uint16 *) POSNULL || pos[k] == (uint16 *) POSNULL)))
235                                         {
236                                                 float           curw;
237
238                                                 if (!dist)
239                                                         dist = MAXENTRYPOS;
240                                                 curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist));
241                                                 res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw);
242                                         }
243                                 }
244                         }
245                 }
246         }
247         pfree(pos);
248         pfree(item);
249         return res;
250 }
251
252 static float
253 calc_rank_or(float *w, tsvector * t, QUERYTYPE * q)
254 {
255         WordEntry  *entry;
256         WordEntryPos *post;
257         int4            dimt,
258                                 j,
259                                 i;
260         float           res = -1.0;
261         ITEM      **item;
262         int                     size = q->size;
263
264         *(uint16 *) POSNULL = lengthof(POSNULL) - 1;
265         item = SortAndUniqItems(GETOPERAND(q), GETQUERY(q), &size);
266
267         for (i = 0; i < size; i++)
268         {
269                 entry = find_wordentry(t, q, item[i]);
270                 if (!entry)
271                         continue;
272
273                 if (entry->haspos)
274                 {
275                         dimt = POSDATALEN(t, entry);
276                         post = POSDATAPTR(t, entry);
277                 }
278                 else
279                 {
280                         dimt = *(uint16 *) POSNULL;
281                         post = POSNULL + 1;
282                 }
283
284                 for (j = 0; j < dimt; j++)
285                 {
286                         if (res < 0)
287                                 res = wpos(post[j]);
288                         else
289                                 res = 1.0 - (1.0 - res) * (1.0 - wpos(post[j]));
290                 }
291         }
292         pfree(item);
293         return res;
294 }
295
296 static float
297 calc_rank(float *w, tsvector * t, QUERYTYPE * q, int4 method)
298 {
299         ITEM       *item = GETQUERY(q);
300         float           res = 0.0;
301         int                     len;
302
303         if (!t->size || !q->size)
304                 return 0.0;
305
306         res = (item->type != VAL && item->val == (int4) '&') ?
307                 calc_rank_and(w, t, q) : calc_rank_or(w, t, q);
308
309         if (res < 0)
310                 res = 1e-20;
311
312         switch (method)
313         {
314                 case 0:
315                         break;
316                 case 1:
317                         res /= log((float) (cnt_length(t) + 1)) / log(2.0);
318                         break;
319                 case 2:
320                         len = cnt_length(t);
321                         if (len > 0)
322                                 res /= (float) len;
323                         break;
324                 default:
325                         /* internal error */
326                         elog(ERROR, "unrecognized normalization method: %d", method);
327         }
328
329         return res;
330 }
331
332 Datum
333 rank(PG_FUNCTION_ARGS)
334 {
335         ArrayType  *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
336         tsvector   *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
337         QUERYTYPE  *query = (QUERYTYPE *) PG_DETOAST_DATUM(PG_GETARG_DATUM(2));
338         int                     method = DEF_NORM_METHOD;
339         float           res = 0.0;
340         float           ws[lengthof(weights)];
341         int                     i;
342
343         if (ARR_NDIM(win) != 1)
344                 ereport(ERROR,
345                                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
346                                  errmsg("array of weight must be one-dimensional")));
347
348         if (ARRNELEMS(win) < lengthof(weights))
349                 ereport(ERROR,
350                                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
351                                  errmsg("array of weight is too short")));
352
353         for (i = 0; i < lengthof(weights); i++)
354         {
355                 ws[i] = (((float4 *) ARR_DATA_PTR(win))[i] >= 0) ? ((float4 *) ARR_DATA_PTR(win))[i] : weights[i];
356                 if (ws[i] > 1.0)
357                         ereport(ERROR,
358                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
359                                          errmsg("weight out of range")));
360         }
361
362         if (PG_NARGS() == 4)
363                 method = PG_GETARG_INT32(3);
364
365         res = calc_rank(ws, txt, query, method);
366
367         PG_FREE_IF_COPY(win, 0);
368         PG_FREE_IF_COPY(txt, 1);
369         PG_FREE_IF_COPY(query, 2);
370         PG_RETURN_FLOAT4(res);
371 }
372
373 Datum
374 rank_def(PG_FUNCTION_ARGS)
375 {
376         tsvector   *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
377         QUERYTYPE  *query = (QUERYTYPE *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
378         float           res = 0.0;
379         int                     method = DEF_NORM_METHOD;
380
381         if (PG_NARGS() == 3)
382                 method = PG_GETARG_INT32(2);
383
384         res = calc_rank(weights, txt, query, method);
385
386         PG_FREE_IF_COPY(txt, 0);
387         PG_FREE_IF_COPY(query, 1);
388         PG_RETURN_FLOAT4(res);
389 }
390
391
392 typedef struct
393 {
394         ITEM       *item;
395         int32           pos;
396 }       DocRepresentation;
397
398 static int
399 compareDocR(const void *a, const void *b)
400 {
401         if (((DocRepresentation *) a)->pos == ((DocRepresentation *) b)->pos)
402                 return 0;
403         return (((DocRepresentation *) a)->pos > ((DocRepresentation *) b)->pos) ? 1 : -1;
404 }
405
406
407 typedef struct
408 {
409         DocRepresentation *doc;
410         int                     len;
411 }       ChkDocR;
412
413 static bool
414 checkcondition_DR(void *checkval, ITEM * val)
415 {
416         DocRepresentation *ptr = ((ChkDocR *) checkval)->doc;
417
418         while (ptr - ((ChkDocR *) checkval)->doc < ((ChkDocR *) checkval)->len)
419         {
420                 if (val == ptr->item || compareITEM(&val, &(ptr->item)) == 0)
421                         return true;
422                 ptr++;
423         }
424
425         return false;
426 }
427
428
429 static bool
430 Cover(DocRepresentation * doc, int len, QUERYTYPE * query, int *pos, int *p, int *q)
431 {
432         int                     i;
433         DocRepresentation *ptr,
434                            *f = (DocRepresentation *) 0xffffffff;
435         ITEM       *item = GETQUERY(query);
436         int                     lastpos = *pos;
437         int                     oldq = *q;
438
439         *p = 0x7fffffff;
440         *q = 0;
441
442         for (i = 0; i < query->size; i++)
443         {
444                 if (item->type != VAL)
445                 {
446                         item++;
447                         continue;
448                 }
449                 ptr = doc + *pos;
450
451                 while (ptr - doc < len)
452                 {
453                         if (ptr->item == item)
454                         {
455                                 if (ptr->pos > *q)
456                                 {
457                                         *q = ptr->pos;
458                                         lastpos = ptr - doc;
459                                 }
460                                 break;
461                         }
462                         ptr++;
463                 }
464
465                 item++;
466         }
467
468         if (*q == 0)
469                 return false;
470
471         if (*q == oldq)
472         {                                                       /* already check this pos */
473                 (*pos)++;
474                 return Cover(doc, len, query, pos, p, q);
475         }
476
477         item = GETQUERY(query);
478         for (i = 0; i < query->size; i++)
479         {
480                 if (item->type != VAL)
481                 {
482                         item++;
483                         continue;
484                 }
485                 ptr = doc + lastpos;
486
487                 while (ptr >= doc + *pos)
488                 {
489                         if (ptr->item == item)
490                         {
491                                 if (ptr->pos < *p)
492                                 {
493                                         *p = ptr->pos;
494                                         f = ptr;
495                                 }
496                                 break;
497                         }
498                         ptr--;
499                 }
500                 item++;
501         }
502
503         if (*p <= *q)
504         {
505                 ChkDocR         ch;
506
507                 ch.doc = f;
508                 ch.len = (doc + lastpos) - f + 1;
509                 *pos = f - doc + 1;
510                 SortAndUniqOperand = GETOPERAND(query);
511                 if (TS_execute(GETQUERY(query), &ch, false, checkcondition_DR))
512                 {
513                         /*
514                          * elog(NOTICE,"OP:%d NP:%d P:%d Q:%d", *pos, lastpos, *p, *q);
515                          */
516                         return true;
517                 }
518                 else
519                         return Cover(doc, len, query, pos, p, q);
520         }
521
522         return false;
523 }
524
525 static DocRepresentation *
526 get_docrep(tsvector * txt, QUERYTYPE * query, int *doclen)
527 {
528         ITEM       *item = GETQUERY(query);
529         WordEntry  *entry;
530         WordEntryPos *post;
531         int4            dimt,
532                                 j,
533                                 i;
534         int                     len = query->size * 4,
535                                 cur = 0;
536         DocRepresentation *doc;
537
538         *(uint16 *) POSNULL = lengthof(POSNULL) - 1;
539         doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len);
540         for (i = 0; i < query->size; i++)
541         {
542                 if (item[i].type != VAL)
543                         continue;
544
545                 entry = find_wordentry(txt, query, &(item[i]));
546                 if (!entry)
547                         continue;
548
549                 if (entry->haspos)
550                 {
551                         dimt = POSDATALEN(txt, entry);
552                         post = POSDATAPTR(txt, entry);
553                 }
554                 else
555                 {
556                         dimt = *(uint16 *) POSNULL;
557                         post = POSNULL + 1;
558                 }
559
560                 while (cur + dimt >= len)
561                 {
562                         len *= 2;
563                         doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len);
564                 }
565
566                 for (j = 0; j < dimt; j++)
567                 {
568                         doc[cur].item = &(item[i]);
569                         doc[cur].pos = WEP_GETPOS(post[j]);
570                         cur++;
571                 }
572         }
573
574         *doclen = cur;
575
576         if (cur > 0)
577         {
578                 if (cur > 1)
579                         qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR);
580                 return doc;
581         }
582
583         pfree(doc);
584         return NULL;
585 }
586
587
588 Datum
589 rank_cd(PG_FUNCTION_ARGS)
590 {
591         int                     K = PG_GETARG_INT32(0);
592         tsvector   *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
593         QUERYTYPE  *query = (QUERYTYPE *) PG_DETOAST_DATUM(PG_GETARG_DATUM(2));
594         int                     method = DEF_NORM_METHOD;
595         DocRepresentation *doc;
596         float           res = 0.0;
597         int                     p = 0,
598                                 q = 0,
599                                 len,
600                                 cur;
601
602         doc = get_docrep(txt, query, &len);
603         if (!doc)
604         {
605                 PG_FREE_IF_COPY(txt, 1);
606                 PG_FREE_IF_COPY(query, 2);
607                 PG_RETURN_FLOAT4(0.0);
608         }
609
610         cur = 0;
611         if (K <= 0)
612                 K = 4;
613         while (Cover(doc, len, query, &cur, &p, &q))
614                 res += (q - p + 1 > K) ? ((float) K) / ((float) (q - p + 1)) : 1.0;
615
616         if (PG_NARGS() == 4)
617                 method = PG_GETARG_INT32(3);
618
619         switch (method)
620         {
621                 case 0:
622                         break;
623                 case 1:
624                         res /= log((float) (cnt_length(txt) + 1));
625                         break;
626                 case 2:
627                         len = cnt_length(txt);
628                         if (len > 0)
629                                 res /= (float) len;
630                         break;
631                 default:
632                         /* internal error */
633                         elog(ERROR, "unrecognized normalization method: %d", method);
634         }
635
636         pfree(doc);
637         PG_FREE_IF_COPY(txt, 1);
638         PG_FREE_IF_COPY(query, 2);
639
640         PG_RETURN_FLOAT4(res);
641 }
642
643
644 Datum
645 rank_cd_def(PG_FUNCTION_ARGS)
646 {
647         PG_RETURN_DATUM(DirectFunctionCall4(
648                                                                                 rank_cd,
649                                                                                 Int32GetDatum(-1),
650                                                                                 PG_GETARG_DATUM(0),
651                                                                                 PG_GETARG_DATUM(1),
652           (PG_NARGS() == 3) ? PG_GETARG_DATUM(2) : Int32GetDatum(DEF_NORM_METHOD)
653                                                                                 ));
654 }
655
656 /**************debug*************/
657
658 typedef struct
659 {
660         char       *w;
661         int2            len;
662         int2            pos;
663         int2            start;
664         int2            finish;
665 }       DocWord;
666
667 static int
668 compareDocWord(const void *a, const void *b)
669 {
670         if (((DocWord *) a)->pos == ((DocWord *) b)->pos)
671                 return 0;
672         return (((DocWord *) a)->pos > ((DocWord *) b)->pos) ? 1 : -1;
673 }
674
675
676 Datum
677 get_covers(PG_FUNCTION_ARGS)
678 {
679         tsvector   *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
680         QUERYTYPE  *query = (QUERYTYPE *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
681         WordEntry  *pptr = ARRPTR(txt);
682         int                     i,
683                                 dlen = 0,
684                                 j,
685                                 cur = 0,
686                                 len = 0,
687                                 rlen;
688         DocWord    *dw,
689                            *dwptr;
690         text       *out;
691         char       *cptr;
692         DocRepresentation *doc;
693         int                     pos = 0,
694                                 p,
695                                 q,
696                                 olddwpos = 0;
697         int                     ncover = 1;
698
699         doc = get_docrep(txt, query, &rlen);
700
701         if (!doc)
702         {
703                 out = palloc(VARHDRSZ);
704                 VARATT_SIZEP(out) = VARHDRSZ;
705                 PG_FREE_IF_COPY(txt, 0);
706                 PG_FREE_IF_COPY(query, 1);
707                 PG_RETURN_POINTER(out);
708         }
709
710         for (i = 0; i < txt->size; i++)
711         {
712                 if (!pptr[i].haspos)
713                         ereport(ERROR,
714                                         (errcode(ERRCODE_SYNTAX_ERROR),
715                                          errmsg("no pos info")));
716                 dlen += POSDATALEN(txt, &(pptr[i]));
717         }
718
719         dwptr = dw = palloc(sizeof(DocWord) * dlen);
720         memset(dw, 0, sizeof(DocWord) * dlen);
721
722         for (i = 0; i < txt->size; i++)
723         {
724                 WordEntryPos *posdata = POSDATAPTR(txt, &(pptr[i]));
725
726                 for (j = 0; j < POSDATALEN(txt, &(pptr[i])); j++)
727                 {
728                         dw[cur].w = STRPTR(txt) + pptr[i].pos;
729                         dw[cur].len = pptr[i].len;
730                         dw[cur].pos = WEP_GETPOS(posdata[j]);
731                         cur++;
732                 }
733                 len += (pptr[i].len + 1) * (int) POSDATALEN(txt, &(pptr[i]));
734         }
735         qsort((void *) dw, dlen, sizeof(DocWord), compareDocWord);
736
737         while (Cover(doc, rlen, query, &pos, &p, &q))
738         {
739                 dwptr = dw + olddwpos;
740                 while (dwptr->pos < p && dwptr - dw < dlen)
741                         dwptr++;
742                 olddwpos = dwptr - dw;
743                 dwptr->start = ncover;
744                 while (dwptr->pos < q + 1 && dwptr - dw < dlen)
745                         dwptr++;
746                 (dwptr - 1)->finish = ncover;
747                 len += 4 /* {}+two spaces */ + 2 * 16 /* numbers */ ;
748                 ncover++;
749         }
750
751         out = palloc(VARHDRSZ + len);
752         cptr = ((char *) out) + VARHDRSZ;
753         dwptr = dw;
754
755         while (dwptr - dw < dlen)
756         {
757                 if (dwptr->start)
758                 {
759                         sprintf(cptr, "{%d ", dwptr->start);
760                         cptr = strchr(cptr, '\0');
761                 }
762                 memcpy(cptr, dwptr->w, dwptr->len);
763                 cptr += dwptr->len;
764                 *cptr = ' ';
765                 cptr++;
766                 if (dwptr->finish)
767                 {
768                         sprintf(cptr, "}%d ", dwptr->finish);
769                         cptr = strchr(cptr, '\0');
770                 }
771                 dwptr++;
772         }
773
774         VARATT_SIZEP(out) = cptr - ((char *) out);
775
776         pfree(dw);
777         pfree(doc);
778
779         PG_FREE_IF_COPY(txt, 0);
780         PG_FREE_IF_COPY(query, 1);
781         PG_RETURN_POINTER(out);
782 }