From: Teodor Sigaev Date: Fri, 28 May 2004 15:36:49 +0000 (+0000) Subject: Stat function now can show statistics per weight of lexemes X-Git-Tag: REL8_0_0BETA1~499 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=a6ea6457facb28743371d0d33d34b767ccf4b4cc;p=postgresql Stat function now can show statistics per weight of lexemes --- diff --git a/contrib/tsearch2/expected/tsearch2.out b/contrib/tsearch2/expected/tsearch2.out index 47e51f7c32..658d51dd3f 100644 --- a/contrib/tsearch2/expected/tsearch2.out +++ b/contrib/tsearch2/expected/tsearch2.out @@ -782,6 +782,7 @@ select rank(' a:1 s:2 d g'::tsvector, 'a & s'); (1 row) insert into test_tsvector (t) values ('foo bar foo the over foo qq bar'); +drop trigger tsvectorupdate on test_tsvector; select * from stat('select a from test_tsvector') order by ndoc desc, nentry desc, word; word | ndoc | nentry -----------+------+-------- @@ -1933,6 +1934,55 @@ select * from stat('select a from test_tsvector') order by ndoc desc, nentry des qwerti | 1 | 1 (1146 rows) +insert into test_tsvector values ('1', 'a:1a,2,3b b:5a,6a,7c,8'); +insert into test_tsvector values ('1', 'a:1a,2,3c b:5a,6b,7c,8b'); +select * from stat('select a from test_tsvector','a') order by ndoc desc, nentry desc, word; + word | ndoc | nentry +------+------+-------- + b | 2 | 3 + a | 2 | 2 +(2 rows) + +select * from stat('select a from test_tsvector','b') order by ndoc desc, nentry desc, word; + word | ndoc | nentry +------+------+-------- + b | 1 | 2 + a | 1 | 1 +(2 rows) + +select * from stat('select a from test_tsvector','c') order by ndoc desc, nentry desc, word; + word | ndoc | nentry +------+------+-------- + b | 2 | 2 + a | 1 | 1 +(2 rows) + +select * from stat('select a from test_tsvector','d') order by ndoc desc, nentry desc, word; + word | ndoc | nentry +-----------+------+-------- + a | 2 | 2 + copyright | 2 | 2 + foo | 1 | 3 + bar | 1 | 2 + 345 | 1 | 1 + b | 1 | 1 + qq | 1 | 1 + qwerti | 1 | 1 +(8 rows) + +select * from 
stat('select a from test_tsvector','ad') order by ndoc desc, nentry desc, word; + word | ndoc | nentry +-----------+------+-------- + a | 2 | 4 + b | 2 | 4 + copyright | 2 | 2 + foo | 1 | 3 + bar | 1 | 2 + 345 | 1 | 1 + qq | 1 | 1 + qwerti | 1 | 1 +(8 rows) + select reset_tsearch(); NOTICE: TSearch cache cleaned reset_tsearch @@ -2092,7 +2142,6 @@ select * from ts_debug('Tsearch module for PostgreSQL 7.3.3'); (5 rows) --check ordering -drop trigger tsvectorupdate on test_tsvector; insert into test_tsvector values (null, null); select a is null, a from test_tsvector order by a; ?column? | a @@ -2108,6 +2157,8 @@ select a is null, a from test_tsvector order by a; f | f | '345':1 'qwerti':2 'copyright':3 f | 'qq':7 'bar':2,8 'foo':1,3,6 'copyright':9 + f | 'a':1A,2,3C 'b':5A,6B,7C,8B + f | 'a':1A,2,3B 'b':5A,6A,7C,8 f | '7w' 'ch' 'd7' 'eo' 'gw' 'i4' 'lq' 'o6' 'qt' 'y0' f | 'ar' 'ei' 'kq' 'ma' 'qa' 'qh' 'qq' 'qz' 'rx' 'st' f | 'gs' 'i6' 'i9' 'j2' 'l0' 'oq' 'qx' 'sc' 'xe' 'yu' @@ -2609,5 +2660,5 @@ select a is null, a from test_tsvector order by a; f | '1b' '42' 'a7' 'ab' 'ak' 'ap' 'at' 'av' 'ay' 'b0' 'b9' 'bb' 'bp' 'bu' 'bz' 'cq' 'da' 'de' 'dn' 'e0' 'eb' 'ef' 'eg' 'ek' 'eq' 'er' 'eu' 'ey' 'fn' 'ft' 'gg' 'h4' 'hk' 'hl' 'i7' 'ig' 'ik' 'ip' 'ir' 'iu' 'iw' 'jr' 'jw' 'jx' 'kg' 'lc' 'lg' 'm0' 'na' 'np' 'om' 'on' 'oz' 'pg' 'pn' 'ps' 'pt' 'pz' 'q3' 'q6' 'qa' 'qb' 'ql' 'qq' 'qt' 'qv' 'qw' 'qy' 'r8' 'rf' 'ri' 'rk' 'rl' 'rw' 'sg' 'si' 'sp' 'sw' 'ta' 'th' 'ua' 'uj' 'uu' 'uv' 'uz' 'vj' 'vk' 'vm' 'wc' 'wf' 'wh' 'wn' 'wo' 'ww' 'xb' 'xk' 'xt' 'xw' 'y7' 'ye' 'yl' 'yt' 'yw' 'z4' 'z7' 'zc' 'zw' f | '1h' '3s' 'ab' 'ae' 'ax' 'b1' 'bz' 'cy' 'dk' 'dq' 'ds' 'du' 'e8' 'ef' 'ej' 'ek' 'ex' 'f1' 'fe' 'ff' 'fn' 'fo' 'ft' 'fx' 'ge' 'go' 'gz' 'h6' 'hz' 'i2' 'iv' 'iy' 'j5' 'j6' 'ke' 'kf' 'lh' 'lr' 'mc' 'mj' 'na' 'ng' 'oh' 'om' 'oy' 'p2' 'pi' 'pk' 'py' 'q3' 'qb' 'qc' 'qg' 'qn' 'qo' 'qq' 'qu' 'qw' 'qx' 'qy' 'qz' 'r1' 'rk' 'rl' 'rq' 'rs' 'rt' 'ry' 'rz' 'sk' 'sl' 'so' 't9' 'td' 'te' 'tn' 'tw' 'tz' 'ud' 
'uk' 'uo' 'uq' 'uw' 'ux' 'uy' 'v1' 'vg' 'vq' 'w4' 'w9' 'wa' 'wg' 'wj' 'wm' 'wo' 'wr' 'ww' 'wy' 'xf' 'xg' 'y9' 'yh' 'yi' 'yk' 'ym' 'yq' 'yv' 'zm' t | -(512 rows) +(514 rows) diff --git a/contrib/tsearch2/sql/tsearch2.sql b/contrib/tsearch2/sql/tsearch2.sql index 6e8edc3fb2..745d35b73e 100644 --- a/contrib/tsearch2/sql/tsearch2.sql +++ b/contrib/tsearch2/sql/tsearch2.sql @@ -150,7 +150,15 @@ select rank(' a:1 s:2B d g'::tsvector, 'a & s'); select rank(' a:1 s:2 d g'::tsvector, 'a & s'); insert into test_tsvector (t) values ('foo bar foo the over foo qq bar'); +drop trigger tsvectorupdate on test_tsvector; select * from stat('select a from test_tsvector') order by ndoc desc, nentry desc, word; +insert into test_tsvector values ('1', 'a:1a,2,3b b:5a,6a,7c,8'); +insert into test_tsvector values ('1', 'a:1a,2,3c b:5a,6b,7c,8b'); +select * from stat('select a from test_tsvector','a') order by ndoc desc, nentry desc, word; +select * from stat('select a from test_tsvector','b') order by ndoc desc, nentry desc, word; +select * from stat('select a from test_tsvector','c') order by ndoc desc, nentry desc, word; +select * from stat('select a from test_tsvector','d') order by ndoc desc, nentry desc, word; +select * from stat('select a from test_tsvector','ad') order by ndoc desc, nentry desc, word; select reset_tsearch(); select to_tsquery('default', 'skies & books'); @@ -249,7 +257,6 @@ Upon a woman s face. E. J. 
Pratt (1882 1964) select * from ts_debug('Tsearch module for PostgreSQL 7.3.3'); --check ordering -drop trigger tsvectorupdate on test_tsvector; insert into test_tsvector values (null, null); select a is null, a from test_tsvector order by a; diff --git a/contrib/tsearch2/ts_stat.c b/contrib/tsearch2/ts_stat.c index a6518e3439..5552957123 100644 --- a/contrib/tsearch2/ts_stat.c +++ b/contrib/tsearch2/ts_stat.c @@ -15,9 +15,10 @@ Datum tsstat_in(PG_FUNCTION_ARGS) { tsstat *stat = palloc(STATHDRSIZE); - + stat->len = STATHDRSIZE; stat->size = 0; + stat->weight = 0; PG_RETURN_POINTER(stat); } @@ -32,6 +33,20 @@ tsstat_out(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } +static int +check_weight(tsvector *txt, WordEntry *wptr, int8 weight) { + int len = POSDATALEN(txt, wptr); + int num=0; + WordEntryPos *ptr = POSDATAPTR(txt, wptr); + + while (len--) { + if (weight & (1 << ptr->weight)) + num++; + ptr++; + } + return num; +} + static WordEntry ** SEI_realloc(WordEntry ** in, uint32 *len) { @@ -83,6 +98,7 @@ formstat(tsstat * stat, tsvector * txt, WordEntry ** entry, uint32 len) totallen = CALCSTATSIZE(nentry, slen); newstat = palloc(totallen); newstat->len = totallen; + newstat->weight = stat->weight; newstat->size = nentry; memcpy(STATSTRPTR(newstat), STATSTRPTR(stat), STATSTRSIZE(stat)); @@ -107,8 +123,9 @@ formstat(tsstat * stat, tsvector * txt, WordEntry ** entry, uint32 len) } nptr = STATPTR(newstat) + (StopLow - STATPTR(stat)); memcpy(STATPTR(newstat), STATPTR(stat), sizeof(StatEntry) * (StopLow - STATPTR(stat))); - nptr->nentry = POSDATALEN(txt, *ptr); - if (nptr->nentry == 0) + if ( (*ptr)->haspos ) { + nptr->nentry = ( stat->weight ) ? 
check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr); + } else nptr->nentry = 1; nptr->ndoc = 1; nptr->len = (*ptr)->len; @@ -127,8 +144,9 @@ formstat(tsstat * stat, tsvector * txt, WordEntry ** entry, uint32 len) } else { - nptr->nentry = POSDATALEN(txt, *ptr); - if (nptr->nentry == 0) + if ( (*ptr)->haspos ) { + nptr->nentry = ( stat->weight ) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr); + } else nptr->nentry = 1; nptr->ndoc = 1; nptr->len = (*ptr)->len; @@ -144,8 +162,9 @@ formstat(tsstat * stat, tsvector * txt, WordEntry ** entry, uint32 len) while (ptr - entry < len) { - nptr->nentry = POSDATALEN(txt, *ptr); - if (nptr->nentry == 0) + if ( (*ptr)->haspos ) { + nptr->nentry = ( stat->weight ) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr); + } else nptr->nentry = 1; nptr->ndoc = 1; nptr->len = (*ptr)->len; @@ -173,12 +192,14 @@ ts_accum(PG_FUNCTION_ARGS) cur = 0; StatEntry *sptr; WordEntry *wptr; + int n=0; if (stat == NULL || PG_ARGISNULL(0)) { /* Init in first */ stat = palloc(STATHDRSIZE); stat->len = STATHDRSIZE; stat->size = 0; + stat->weight = 0; } /* simple check of correctness */ @@ -201,32 +222,37 @@ ts_accum(PG_FUNCTION_ARGS) sptr++; else if (cmp == 0) { - int n = POSDATALEN(txt, wptr); - - if (n == 0) - n = 1; - sptr->ndoc++; - sptr->nentry += n; + if ( stat->weight == 0 ) { + sptr->ndoc++; + sptr->nentry += (wptr->haspos) ? 
POSDATALEN(txt, wptr) : 1; + } else if ( wptr->haspos && (n=check_weight(txt, wptr, stat->weight))!=0 ) { + sptr->ndoc++; + sptr->nentry += n; + } sptr++; wptr++; } else { - if (cur == len) - newentry = SEI_realloc(newentry, &len); - newentry[cur] = wptr; + if ( stat->weight == 0 || check_weight(txt, wptr, stat->weight)!=0 ) { + if (cur == len) + newentry = SEI_realloc(newentry, &len); + newentry[cur] = wptr; + cur++; + } wptr++; - cur++; } } while (wptr - ARRPTR(txt) < txt->size) { - if (cur == len) - newentry = SEI_realloc(newentry, &len); - newentry[cur] = wptr; + if ( stat->weight == 0 || check_weight(txt, wptr, stat->weight)!=0 ) { + if (cur == len) + newentry = SEI_realloc(newentry, &len); + newentry[cur] = wptr; + cur++; + } wptr++; - cur++; } } else @@ -243,12 +269,13 @@ ts_accum(PG_FUNCTION_ARGS) cmp = compareStatWord(sptr, wptr, stat, txt); if (cmp == 0) { - int n = POSDATALEN(txt, wptr); - - if (n == 0) - n = 1; - sptr->ndoc++; - sptr->nentry += n; + if ( stat->weight == 0 ) { + sptr->ndoc++; + sptr->nentry += (wptr->haspos) ? 
POSDATALEN(txt, wptr) : 1; + } else if ( wptr->haspos && (n=check_weight(txt, wptr, stat->weight))!=0 ) { + sptr->ndoc++; + sptr->nentry += n; + } break; } else if (cmp < 0) @@ -259,10 +286,12 @@ ts_accum(PG_FUNCTION_ARGS) if (StopLow >= StopHigh) { /* not found */ - if (cur == len) - newentry = SEI_realloc(newentry, &len); - newentry[cur] = wptr; - cur++; + if ( stat->weight == 0 || check_weight(txt, wptr, stat->weight)!=0 ) { + if (cur == len) + newentry = SEI_realloc(newentry, &len); + newentry[cur] = wptr; + cur++; + } } wptr++; } @@ -389,7 +418,7 @@ get_ti_Oid(void) } static tsstat * -ts_stat_sql(text *txt) +ts_stat_sql(text *txt, text *ws) { char *query = text2char(txt); int i; @@ -423,6 +452,31 @@ ts_stat_sql(text *txt) stat = palloc(STATHDRSIZE); stat->len = STATHDRSIZE; stat->size = 0; + stat->weight = 0; + + if ( ws ) { + char *buf; + buf = VARDATA(ws); + while( buf - VARDATA(ws) < VARSIZE(ws) - VARHDRSZ ) { + switch (tolower(*buf)) { + case 'a': + stat->weight |= 1 << 3; + break; + case 'b': + stat->weight |= 1 << 2; + break; + case 'c': + stat->weight |= 1 << 1; + break; + case 'd': + stat->weight |= 1; + break; + default: + stat->weight |= 0; + } + buf++; + } + } while (SPI_processed > 0) { @@ -467,11 +521,13 @@ ts_stat(PG_FUNCTION_ARGS) { tsstat *stat; text *txt = PG_GETARG_TEXT_P(0); + text *ws = (PG_NARGS() > 1) ? 
PG_GETARG_TEXT_P(1) : NULL; funcctx = SRF_FIRSTCALL_INIT(); SPI_connect(); - stat = ts_stat_sql(txt); + stat = ts_stat_sql(txt,ws); PG_FREE_IF_COPY(txt, 0); + if (PG_NARGS() > 1 ) PG_FREE_IF_COPY(ws, 1); ts_setup_firstcall(funcctx, stat); SPI_finish(); } diff --git a/contrib/tsearch2/ts_stat.h b/contrib/tsearch2/ts_stat.h index 37d1e7b660..de43c60603 100644 --- a/contrib/tsearch2/ts_stat.h +++ b/contrib/tsearch2/ts_stat.h @@ -20,10 +20,11 @@ typedef struct { int4 len; int4 size; + int4 weight; char data[1]; } tsstat; -#define STATHDRSIZE (sizeof(int4)*2) +#define STATHDRSIZE (sizeof(int4)*4) #define CALCSTATSIZE(x, lenstr) ( x * sizeof(StatEntry) + STATHDRSIZE + lenstr ) #define STATPTR(x) ( (StatEntry*) ( (char*)x + STATHDRSIZE ) ) #define STATSTRPTR(x) ( (char*)x + STATHDRSIZE + ( sizeof(StatEntry) * ((tsvector*)x)->size ) ) diff --git a/contrib/tsearch2/tsearch.sql.in b/contrib/tsearch2/tsearch.sql.in index 005a3dd67c..1f8ed4b1d8 100644 --- a/contrib/tsearch2/tsearch.sql.in +++ b/contrib/tsearch2/tsearch.sql.in @@ -652,6 +652,12 @@ CREATE FUNCTION stat(text) language 'C' with (isstrict); +CREATE FUNCTION stat(text,text) + returns setof statinfo + as 'MODULE_PATHNAME', 'ts_stat' + language 'C' + with (isstrict); + --reset - just for debuging CREATE FUNCTION reset_tsearch() returns void diff --git a/contrib/tsearch2/untsearch.sql.in b/contrib/tsearch2/untsearch.sql.in index f775ba4b9d..b55e3185ab 100644 --- a/contrib/tsearch2/untsearch.sql.in +++ b/contrib/tsearch2/untsearch.sql.in @@ -59,6 +59,8 @@ DROP FUNCTION gtsvector_penalty(internal,internal,internal); DROP FUNCTION gtsvector_picksplit(internal, internal); DROP FUNCTION gtsvector_union(internal, internal); DROP FUNCTION reset_tsearch(); +DROP FUNCTION stat(text); +DROP FUNCTION stat(text,text); DROP FUNCTION tsearch2() CASCADE; DROP FUNCTION _get_parser_from_curcfg();