-# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.14 2006/05/02 11:28:54 teodor Exp $
+# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.15 2006/05/31 14:05:31 teodor Exp $
MODULE_big = tsearch2
OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
- dict_snowball.o dict_ispell.o dict_syn.o \
+ dict_snowball.o dict_ispell.o dict_syn.o dict_thesaurus.o \
wparser.o wparser_def.o \
ts_cfg.o tsvector.o query_cleanup.o crc32.o query.o gistidx.o \
tsvector_op.o rank.o ts_stat.o \
query_util.o query_support.o query_rewrite.o query_gist.o \
- ts_locale.o ginidx.o
+ ts_locale.o ts_lexize.o ginidx.o
SUBDIRS := snowball ispell wordparser
SUBDIROBJS := $(SUBDIRS:%=%/SUBSYS.o)
PG_CPPFLAGS = -I$(srcdir)/snowball -I$(srcdir)/ispell -I$(srcdir)/wordparser
-DATA = stopword/english.stop stopword/russian.stop stopword/russian.stop.utf8
+DATA = stopword/english.stop stopword/russian.stop stopword/russian.stop.utf8 thesaurus
DATA_built = tsearch2.sql untsearch2.sql
DOCS = README.tsearch2
REGRESS = tsearch2
#include "catalog/pg_proc.h"
#include "catalog/pg_namespace.h"
#include "utils/syscache.h"
+#include "miscadmin.h"
#include "ts_cfg.h"
#include "dict.h"
return nspoid;
}
+
+/*
+ * Resolve a file name against the installation's share directory.
+ *
+ * An absolute name is returned as is (the very same pointer).  A relative
+ * name is returned as a freshly palloc'ed "<sharedir><delim><filename>"
+ * string; the caller's original buffer is left untouched.
+ */
+char *
+to_absfilename(char *filename)
+{
+#ifdef WIN32
+	char		sep = '\\';
+#else
+	char		sep = '/';
+#endif
+	char		sharedir[MAXPGPATH];
+	char	   *joined;
+
+	/* nothing to do for absolute names */
+	if (is_absolute_path(filename))
+		return filename;
+
+	get_share_path(my_exec_path, sharedir);
+
+	/* room for both parts, one delimiter and the terminating NUL */
+	joined = palloc(strlen(sharedir) + strlen(filename) + 2);
+	sprintf(joined, "%s%c%s", sharedir, sep, filename);
+
+	return joined;
+}
int text_cmp(text *a, text *b);
+char * to_absfilename(char *filename);
+
#define NEXTVAL(x) ( (text*)( (char*)(x) + INTALIGN( VARSIZE(x) ) ) )
#define ARRNELEMS(x) ArrayGetNItems( ARR_NDIM(x), ARR_DIMS(x))
-/* $PostgreSQL: pgsql/contrib/tsearch2/dict.c,v 1.11 2006/03/11 04:38:30 momjian Exp $ */
+/* $PostgreSQL: pgsql/contrib/tsearch2/dict.c,v 1.12 2006/05/31 14:05:31 teodor Exp $ */
/*
* interface functions to dictionary
Datum opt;
Oid oid = InvalidOid;
+ /* setup dictlexize method */
+ oid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 3, &isnull));
+ if (isnull || oid == InvalidOid)
+ ts_error(ERROR, "Null dict_lexize for dictonary %d", id);
+ fmgr_info_cxt(oid, &(dict->lexize_info), TopMemoryContext);
+
+ /* setup and call dictinit method, optionally */
oid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, &isnull));
if (!(isnull || oid == InvalidOid))
{
opt = SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 2, &isnull);
dict->dictionary = (void *) DatumGetPointer(OidFunctionCall1(oid, opt));
}
- oid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 3, &isnull));
- if (isnull || oid == InvalidOid)
- ts_error(ERROR, "Null dict_lexize for dictonary %d", id);
- fmgr_info_cxt(oid, &(dict->lexize_info), TopMemoryContext);
dict->dict_id = id;
}
else
return (((DictInfo *) a)->dict_id < ((DictInfo *) b)->dict_id) ? -1 : 1;
}
+/*
+ * Load dictionary "id" and add it to the sorted global DList.
+ *
+ * init_dict() runs the dictionary's init method, and an init method may
+ * itself look up another dictionary (thesaurus_init() calls finddict() for
+ * its subdictionary), which re-enters insertdict() and can append to -- or
+ * reallocate -- DList.list.  Therefore the new entry is built in a local
+ * variable first, and the capacity check is done only afterwards, when
+ * DList.len can no longer change underneath us.  Checking capacity before
+ * init_dict() could leave no free slot after a nested insertion and make
+ * the store below write past the end of the array.
+ *
+ * Raises an error via ts_error() if the list cannot be enlarged.
+ */
+static void
+insertdict(Oid id)
+{
+	DictInfo	newdict;
+
+	/* may recursively insert other dictionaries, see above */
+	init_dict(id, &newdict);
+
+	if (DList.len == DList.reallen)
+	{
+		DictInfo   *tmp;
+		int			reallen = (DList.reallen) ? 2 * DList.reallen : 16;
+
+		tmp = (DictInfo *) realloc(DList.list, sizeof(DictInfo) * reallen);
+		if (!tmp)
+			ts_error(ERROR, "No memory");
+		DList.reallen = reallen;
+		DList.list = tmp;
+	}
+
+	DList.list[DList.len] = newdict;
+	DList.len++;
+
+	/* keep the list ordered by dict_id */
+	qsort(DList.list, DList.len, sizeof(DictInfo), comparedict);
+}
+
+
DictInfo *
finddict(Oid id)
{
return DList.last_dict;
}
- /* last chance */
- if (DList.len == DList.reallen)
- {
- DictInfo *tmp;
- int reallen = (DList.reallen) ? 2 * DList.reallen : 16;
-
- tmp = (DictInfo *) realloc(DList.list, sizeof(DictInfo) * reallen);
- if (!tmp)
- ts_error(ERROR, "No memory");
- DList.reallen = reallen;
- DList.list = tmp;
- }
- DList.last_dict = &(DList.list[DList.len]);
- init_dict(id, DList.last_dict);
-
- DList.len++;
- qsort(DList.list, DList.len, sizeof(DictInfo), comparedict);
+ /* insert new dictionary */
+ insertdict(id);
return finddict(id); /* qsort changed order!! */ ;
}
*ptr;
Datum *da;
ArrayType *a;
+ DictSubState dstate = { false, false, NULL };
SET_FUNCOID();
dict = finddict(PG_GETARG_OID(0));
ptr = res = (TSLexeme *) DatumGetPointer(
- FunctionCall3(&(dict->lexize_info),
+ FunctionCall4(&(dict->lexize_info),
+ PointerGetDatum(dict->dictionary),
+ PointerGetDatum(VARDATA(in)),
+ Int32GetDatum(VARSIZE(in) - VARHDRSZ),
+ PointerGetDatum(&dstate)
+ )
+ );
+
+ if (dstate.getnext) {
+ dstate.isend = true;
+ ptr = res = (TSLexeme *) DatumGetPointer(
+ FunctionCall4(&(dict->lexize_info),
PointerGetDatum(dict->dictionary),
PointerGetDatum(VARDATA(in)),
- Int32GetDatum(VARSIZE(in) - VARHDRSZ)
+ Int32GetDatum(VARSIZE(in) - VARHDRSZ),
+ PointerGetDatum(&dstate)
)
);
+ }
+
PG_FREE_IF_COPY(in, 1);
if (!res)
{
-/* $PostgreSQL: pgsql/contrib/tsearch2/dict.h,v 1.6 2006/03/11 04:38:30 momjian Exp $ */
+/* $PostgreSQL: pgsql/contrib/tsearch2/dict.h,v 1.7 2006/05/31 14:05:31 teodor Exp $ */
#ifndef __DICT_H__
#define __DICT_H__
#include "postgres.h"
#include "fmgr.h"
+#include "ts_cfg.h"
typedef struct
{
Oid name2id_dict(text *name);
void reset_dict(void);
+/*
+ * State block handed to a dictionary's lexize method as its 4th argument.
+ * It lets a dictionary ask for following lexemes before it produces a
+ * result; callers that do not support this pass NULL instead.
+ */
+typedef struct {
+ bool isend; /* in: tells the lexize method that the end of text is reached */
+ bool getnext; /* out: dict wants next lexeme */
+ void *private; /* internal dict state between calls with getnext == true */
+} DictSubState;
/* simple parser of cfg string */
typedef struct
/*
* number of variant of split word , for example Word 'fotballklubber'
* (norwegian) has two varian to split: ( fotball, klubb ) and ( fot,
- * ball, klubb ). So, dictionary should return: nvariant lexeme 1
- * fotball 1 klubb 2 fot 2 ball 2 klubb
- *
+ * ball, klubb ). So, dictionary should return:
+ * nvariant lexeme
+ * 1 fotball
+ * 1 klubb
+ * 2 fot
+ * 2 ball
+ * 2 klubb
*/
uint16 nvariant;
- /* currently unused */
uint16 flags;
/* C-string */
char *lexeme;
} TSLexeme;
+#define TSL_ADDPOS 0x01
+
+
+/*
+ * Lexize subsystem
+ */
+
+/* One lexeme as delivered by the parser, kept as a singly linked list node */
+typedef struct ParsedLex {
+ int type; /* lexeme type as passed to LexizeAddLemm() */
+ char *lemm; /* lexeme text; length is given by lenlemm */
+ int lenlemm; /* length of lemm */
+ bool resfollow; /* NOTE(review): not used in the visible code -- confirm meaning in ts_lexize.c */
+ struct ParsedLex *next; /* next node, NULL at the end of the list */
+} ParsedLex;
+
+/* Singly linked list of ParsedLex nodes with head and tail pointers */
+typedef struct ListParsedLex {
+ ParsedLex *head; /* first node, NULL when the list is empty */
+ ParsedLex *tail; /* last node, NULL when the list is empty */
+} ListParsedLex;
+
+/*
+ * Working state of the lexize subsystem: set up by LexizeInit(), fed by
+ * LexizeAddLemm() and consumed by LexizeExec().
+ */
+typedef struct {
+ TSCfgInfo *cfg; /* configuration given to LexizeInit() */
+ Oid curDictId; /* InvalidOid after LexizeInit(); presumably the dictionary currently asking for more lexemes -- TODO confirm */
+ int posDict; /* 0 after LexizeInit(); presumably index into the dictionary list of the config map -- TODO confirm */
+ DictSubState dictState; /* state passed to the dictionary's lexize method */
+ ParsedLex *curSub; /* NULL after LexizeInit(); NOTE(review): usage not visible here */
+ ListParsedLex towork; /* current list to work */
+ ListParsedLex waste; /* list of lexemes that already lexized */
+
+ /* fields to store last variant to lexize (basically, thesaurus
+ or similar to, which wants several lexemes */
+
+ ParsedLex *lastRes;
+ TSLexeme *tmpRes;
+} LexizeData;
+
+
+void LexizeInit(LexizeData *ld, TSCfgInfo *cfg);
+void LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm);
+TSLexeme* LexizeExec(LexizeData *ld, ParsedLex **correspondLexem);
+
#endif
--- /dev/null
+/* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.1 2006/05/31 14:05:31 teodor Exp $ */
+
+/*
+ * thesaurus
+ * Teodor Sigaev <teodor@sigaev.ru>
+ */
+#include "postgres.h"
+#include "executor/spi.h"
+
+#include <ctype.h>
+
+#include "dict.h"
+#include "common.h"
+#include "ts_locale.h"
+
+/*
+ * One occurrence of a lexeme inside a thesaurus entry.  Records are linked
+ * in two independent ways: nextentry chains all occurrences of the same
+ * lexeme (built in compileTheLexeme()), nextvariant chains the candidate
+ * entries collected during one lexize call (built in findVariant()).
+ */
+typedef struct LexemeInfo {
+ uint16 idsubst; /* entry's number in DictThesaurus->subst */
+ uint16 posinsubst; /* pos info in entry */
+ uint16 tnvariant; /* total num lexemes in one variant */
+ struct LexemeInfo *nextentry; /* next occurrence of the same lexeme */
+ struct LexemeInfo *nextvariant; /* next candidate in a findVariant() result list */
+} LexemeInfo;
+
+/* Element of the lexeme lookup array DictThesaurus->wrds */
+typedef struct {
+ char *lexeme; /* malloc'ed C-string; NULL stands for a word the subdictionary did not recognize */
+ LexemeInfo *entries; /* occurrences of this lexeme, linked through nextentry */
+} TheLexeme;
+
+/* Right-hand side (substitution) of one thesaurus entry */
+typedef struct {
+ uint16 lastlexeme; /* position of the last word of the phrase to substitute (addWrd() stores posinsubst-1) */
+ uint16 reslen; /* number of elements in res, set by compileTheSubstitute() */
+ TSLexeme *res; /* prepared substituted result */
+} TheSubstitute;
+
+/* In-memory representation of a loaded thesaurus dictionary */
+typedef struct
+{
+ /* subdictionary to normalize lexemes */
+ DictInfo subdict;
+
+ /* Array to search lexeme by exact match */
+ TheLexeme *wrds;
+ int nwrds; /* number of used elements in wrds */
+ int ntwrds; /* number of allocated elements in wrds */
+
+ /* Storage of substituted result, n-th element is for
+ n-th expression */
+ TheSubstitute *subst;
+ int nsubst; /* allocated size while the file is read, number of entries afterwards */
+} DictThesaurus;
+
+PG_FUNCTION_INFO_V1(thesaurus_init);
+Datum thesaurus_init(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(thesaurus_lexize);
+Datum thesaurus_lexize(PG_FUNCTION_ARGS);
+
+/*
+ * Give back the dictionary descriptor itself.  Only the top-level struct is
+ * released here; the arrays hanging off it are not touched.
+ */
+static void
+freeDictThesaurus(DictThesaurus *d)
+{
+	free(d);
+}
+
+/*
+ * Append one word of a thesaurus entry's left-hand side to d->wrds.
+ *
+ * b, e       - begin and one-past-end of the word inside the line buffer
+ * idsubst    - number of the thesaurus entry the word belongs to
+ * posinsubst - position of the word inside that entry
+ *
+ * The word is copied into malloc'ed storage.  The array size and the
+ * element count are updated only after the respective allocation has
+ * succeeded, so an out-of-memory error never leaves d pointing at a lost
+ * array or counting a half-built element.
+ */
+static void
+newLexeme(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst)
+{
+	TheLexeme  *ptr;
+
+	if (d->nwrds >= d->ntwrds)
+	{
+		int			newsize = (d->ntwrds == 0) ? 16 : d->ntwrds * 2;
+		TheLexeme  *tmp;
+
+		/* d->wrds is NULL on first use, realloc() then acts like malloc() */
+		tmp = (TheLexeme *) realloc(d->wrds, sizeof(TheLexeme) * newsize);
+		if (!tmp)
+			elog(ERROR, "Out of memory");
+		d->wrds = tmp;
+		d->ntwrds = newsize;
+	}
+
+	ptr = d->wrds + d->nwrds;
+
+	if ((ptr->lexeme = malloc(e - b + 1)) == NULL)
+		elog(ERROR, "Out of memory");
+
+	memcpy(ptr->lexeme, b, e - b);
+	ptr->lexeme[e - b] = '\0';
+
+	if ((ptr->entries = (LexemeInfo *) malloc(sizeof(LexemeInfo))) == NULL)
+	{
+		free(ptr->lexeme);
+		elog(ERROR, "Out of memory");
+	}
+
+	ptr->entries->idsubst = idsubst;
+	ptr->entries->posinsubst = posinsubst;
+	/* not known yet; give the remaining fields defined values */
+	ptr->entries->tnvariant = 0;
+	ptr->entries->nextentry = NULL;
+	ptr->entries->nextvariant = NULL;
+
+	/* the element is complete, make it visible */
+	d->nwrds++;
+}
+
+/*
+ * Append one word of a thesaurus entry's right-hand side (the substitution)
+ * to d->subst[idsubst].res.
+ *
+ * b, e       - begin and one-past-end of the word inside the line buffer
+ * idsubst    - number of the thesaurus entry
+ * nwrd       - number of the word inside the substitution; 0 starts a new
+ *              entry and resets the per-entry counters below
+ * posinsubst - number of words in the entry's left-hand side
+ *
+ * While the file is being read d->nsubst holds the allocated size of
+ * d->subst, so the array has to grow only when idsubst has reached that
+ * size.  (Testing "idsubst <= d->nsubst" instead is true for every entry
+ * and doubles the array on each thesaurus line.)
+ */
+static void
+addWrd(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst)
+{
+	/* fill level and allocated size of the current entry's res array */
+	static int	nres = 0;
+	static int	ntres = 0;
+	TheSubstitute *ptr;
+
+	if (nwrd == 0)
+	{
+		nres = ntres = 0;
+
+		if (idsubst >= d->nsubst)
+		{
+			int			newsize = (d->nsubst == 0) ? 16 : d->nsubst * 2;
+			TheSubstitute *tmp;
+
+			while (newsize <= idsubst)
+				newsize *= 2;
+
+			/* d->subst is NULL on first use, realloc() then acts like malloc() */
+			tmp = (TheSubstitute *) realloc(d->subst, sizeof(TheSubstitute) * newsize);
+			if (!tmp)
+				elog(ERROR, "Out of memory");
+			d->subst = tmp;
+			d->nsubst = newsize;
+		}
+	}
+
+	ptr = d->subst + idsubst;
+
+	ptr->lastlexeme = posinsubst - 1;
+
+	/* keep room for the new word and the terminating element */
+	if (nres + 1 >= ntres)
+	{
+		int			newsize = (ntres == 0) ? 2 : ntres * 2;
+		TSLexeme   *tmp;
+
+		/*
+		 * A fresh entry's res pointer is uninitialized memory, so the first
+		 * allocation must not go through realloc().
+		 */
+		if (ntres == 0)
+			tmp = (TSLexeme *) malloc(sizeof(TSLexeme) * newsize);
+		else
+			tmp = (TSLexeme *) realloc(ptr->res, sizeof(TSLexeme) * newsize);
+
+		if (!tmp)
+			elog(ERROR, "Out of memory");
+		ptr->res = tmp;
+		ntres = newsize;
+	}
+
+	if ((ptr->res[nres].lexeme = malloc(e - b + 1)) == 0)
+		elog(ERROR, "Out of memory");
+	memcpy(ptr->res[nres].lexeme, b, e - b);
+	ptr->res[nres].lexeme[e - b] = '\0';
+
+	ptr->res[nres].nvariant = nwrd;
+	ptr->res[nres].flags = TSL_ADDPOS;
+
+	/* a NULL lexeme terminates the array */
+	ptr->res[++nres].lexeme = NULL;
+}
+
+#define TR_WAITLEX 1
+#define TR_INLEX 2
+#define TR_WAITSUBS 3
+#define TR_INSUBS 4
+
+/*
+ * Read a thesaurus file into d.
+ *
+ * Every non-comment line has the form
+ *     word [word ...] : word [word ...]
+ * The words left of ':' are stored with newLexeme(), the words right of it
+ * with addWrd(); the line's ordinal number among the accepted lines becomes
+ * its idsubst.  On return d->nsubst is the number of entries read.
+ *
+ * Lines whose first non-blank character is '#', and lines that are empty
+ * or consist of white space only, are skipped.  A relative file name is
+ * resolved by to_absfilename().  A missing file or a malformed line raises
+ * an error.
+ */
+static void
+thesaurusRead(char *filename, DictThesaurus *d)
+{
+	FILE	   *fh;
+	char		str[BUFSIZ];
+	int			lineno = 0;
+	uint16		idsubst = 0;
+
+	fh = fopen(to_absfilename(filename), "r");
+	if (!fh)
+		elog(ERROR, "Thesaurus: can't open '%s' file", filename);
+
+	while (fgets(str, sizeof(str), fh))
+	{
+		char	   *ptr = str;
+		int			state = TR_WAITLEX;
+		char	   *beginwrd = NULL;
+		uint16		posinsubst = 0;
+		uint16		nwrd = 0;
+
+		lineno++;
+
+		/*
+		 * is it comment ?  The test must look at the first non-blank
+		 * character (ptr), not at the start of the buffer, otherwise
+		 * indented comments and blank lines are parsed as entries.
+		 */
+		while (*ptr && t_isspace(ptr))
+			ptr += pg_mblen(ptr);
+		if (t_iseq(ptr, '#') || *ptr == '\0' || t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
+			continue;
+
+		pg_verifymbstr(ptr, strlen(ptr), false);
+		while (*ptr)
+		{
+			if (state == TR_WAITLEX)
+			{
+				/* between words of the left-hand side */
+				if (t_iseq(ptr, ':'))
+				{
+					if (posinsubst == 0)
+					{
+						fclose(fh);
+						elog(ERROR, "Thesaurus: Unexpected delimiter at %d line", lineno);
+					}
+					state = TR_WAITSUBS;
+				}
+				else if (!t_isspace(ptr))
+				{
+					beginwrd = ptr;
+					state = TR_INLEX;
+				}
+			}
+			else if (state == TR_INLEX)
+			{
+				/* inside a word of the left-hand side */
+				if (t_iseq(ptr, ':'))
+				{
+					newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
+					state = TR_WAITSUBS;
+				}
+				else if (t_isspace(ptr))
+				{
+					newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
+					state = TR_WAITLEX;
+				}
+			}
+			else if (state == TR_WAITSUBS)
+			{
+				/* between words of the substitution */
+				if (!t_isspace(ptr))
+				{
+					beginwrd = ptr;
+					state = TR_INSUBS;
+				}
+			}
+			else if (state == TR_INSUBS)
+			{
+				/* inside a word of the substitution */
+				if (t_isspace(ptr))
+				{
+					addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst);
+					state = TR_WAITSUBS;
+				}
+			}
+			else
+			{
+				fclose(fh);
+				elog(ERROR, "Thesaurus: Unknown state: %d", state);
+			}
+
+			ptr += pg_mblen(ptr);
+		}
+
+		/* the last word may end at the end of the buffer */
+		if (state == TR_INSUBS)
+			addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst);
+
+		idsubst++;
+
+		/* both sides of the entry must contain at least one word */
+		if (!(nwrd && posinsubst))
+		{
+			fclose(fh);
+			elog(ERROR, "Thesaurus: Unexpected end of line at %d line", lineno);
+		}
+
+	}
+
+	d->nsubst = idsubst;
+
+	fclose(fh);
+}
+
+/*
+ * Append one element to the array of compiled lexemes.
+ *
+ * newwrds     - the array, *nnw elements used, *tnm elements allocated
+ * lexeme      - normalized lexeme to store; NULL (or a NULL ->lexeme) stores
+ *               an element without text and with tnvariant forced to 1
+ * src         - LexemeInfo of the raw word, supplies idsubst and posinsubst
+ * tnvariant   - number of lexemes in the variant the lexeme belongs to
+ *
+ * Returns the (possibly moved) array; *nnw and *tnm are updated.
+ */
+static TheLexeme *
+addCompiledLexeme(TheLexeme *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, LexemeInfo *src, uint16 tnvariant)
+{
+	TheLexeme  *slot;
+	LexemeInfo *entry;
+
+	/* double the array when it is full */
+	if (*nnw >= *tnm)
+	{
+		*tnm *= 2;
+		newwrds = (TheLexeme *) realloc(newwrds, sizeof(TheLexeme) * *tnm);
+		if (!newwrds)
+			elog(ERROR, "Out of memory");
+	}
+
+	slot = &newwrds[*nnw];
+
+	entry = (LexemeInfo *) malloc(sizeof(LexemeInfo));
+	slot->entries = entry;
+	if (!entry)
+		elog(ERROR, "Out of memory");
+
+	if (lexeme == NULL || lexeme->lexeme == NULL)
+	{
+		slot->lexeme = NULL;
+		entry->tnvariant = 1;
+	}
+	else
+	{
+		slot->lexeme = strdup(lexeme->lexeme);
+		if (!slot->lexeme)
+			elog(ERROR, "Out of memory");
+
+		entry->tnvariant = tnvariant;
+	}
+
+	entry->idsubst = src->idsubst;
+	entry->posinsubst = src->posinsubst;
+	entry->nextentry = NULL;
+
+	(*nnw)++;
+	return newwrds;
+}
+
+/*
+ * Three-way comparison of two LexemeInfo records by idsubst, then
+ * posinsubst, then tnvariant.  If either argument is NULL the records are
+ * reported as equal.
+ */
+static int
+cmpLexemeInfo(LexemeInfo *a, LexemeInfo *b)
+{
+	if (a == NULL || b == NULL)
+		return 0;
+
+	if (a->idsubst != b->idsubst)
+		return (a->idsubst > b->idsubst) ? 1 : -1;
+
+	if (a->posinsubst != b->posinsubst)
+		return (a->posinsubst > b->posinsubst) ? 1 : -1;
+
+	if (a->tnvariant != b->tnvariant)
+		return (a->tnvariant > b->tnvariant) ? 1 : -1;
+
+	return 0;
+}
+
+/*
+ * Compare two array elements by lexeme text with strcmp().  An element
+ * without text (NULL lexeme) sorts after every element that has one; two
+ * such elements are equal.
+ */
+static int
+cmpLexeme(TheLexeme *a, TheLexeme *b)
+{
+	if (a->lexeme != NULL && b->lexeme != NULL)
+		return strcmp(a->lexeme, b->lexeme);
+
+	if (a->lexeme == NULL && b->lexeme == NULL)
+		return 0;
+
+	return (a->lexeme == NULL) ? 1 : -1;
+}
+
+/* bsearch()-compatible wrapper around cmpLexeme() */
+static int
+cmpLexemeQ(const void *a, const void *b)
+{
+	TheLexeme  *left = (TheLexeme *) a;
+	TheLexeme  *right = (TheLexeme *) b;
+
+	return cmpLexeme(left, right);
+}
+
+/*
+ * qsort() comparator for the compiled lexeme array: order by lexeme text
+ * first; elements with equal text are ordered by their LexemeInfo in
+ * descending order (note the negation).
+ */
+static int
+cmpTheLexeme(const void *a, const void *b)
+{
+	TheLexeme  *left = (TheLexeme *) a;
+	TheLexeme  *right = (TheLexeme *) b;
+	int			bytext = cmpLexeme(left, right);
+
+	if (bytext != 0)
+		return bytext;
+
+	return -cmpLexemeInfo(left->entries, right->entries);
+}
+
+/*
+ * Rebuild d->wrds from the raw words read from the thesaurus file.
+ *
+ * Every raw word is passed through the subdictionary's lexize method and
+ * replaced by the lexemes it returns (a word the subdictionary returns
+ * nothing for is kept as an element with a NULL lexeme).  The new array is
+ * then sorted with cmpTheLexeme() and elements with the same lexeme are
+ * merged into one element whose LexemeInfo records are chained through
+ * nextentry.  The raw words and the old array are freed.
+ */
+static void
+compileTheLexeme(DictThesaurus *d) {
+ int i,nnw=0, tnm=16;
+ TheLexeme *newwrds = (TheLexeme*)malloc(sizeof(TheLexeme)*tnm), *ptrwrds;
+
+ if (!newwrds)
+ elog(ERROR,"Out of memory");
+
+ for(i=0;i<d->nwrds;i++) {
+ /* normalize the raw word; NULL as 4th argument: no DictSubState is offered */
+ TSLexeme *ptr = (TSLexeme*) DatumGetPointer(
+ FunctionCall4(
+ &(d->subdict.lexize_info),
+ PointerGetDatum(d->subdict.dictionary),
+ PointerGetDatum(d->wrds[i].lexeme),
+ Int32GetDatum(strlen(d->wrds[i].lexeme)),
+ PointerGetDatum(NULL)
+ )
+ );
+
+ if ( !(ptr && ptr->lexeme) ) {
+ /* nothing returned: store an element with a NULL lexeme */
+ newwrds = addCompiledLexeme( newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
+ elog(NOTICE,"Thesaurus: word '%s' isn't recognized by subdictionary or it's a stop-word, assign any non-recognized word", d->wrds[i].lexeme);
+ } else {
+ /* walk the result one variant (run of equal nvariant) at a time */
+ while( ptr->lexeme ) {
+ TSLexeme *remptr = ptr+1;
+ int tnvar = 1;
+ int curvar = ptr->nvariant;
+
+ /* compute n words in one variant */
+ while( remptr->lexeme ) {
+ if ( remptr->nvariant != (remptr-1)->nvariant )
+ break;
+ tnvar++;
+ remptr++;
+ }
+
+ /* store every lexeme of the variant together with the variant's size */
+ remptr = ptr;
+ while( remptr->lexeme && remptr->nvariant == curvar ) {
+ newwrds = addCompiledLexeme( newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
+ remptr++;
+ }
+
+ ptr = remptr;
+ }
+ }
+
+ /* the raw word is not needed any more */
+ free( d->wrds[i].lexeme );
+ free( d->wrds[i].entries );
+ }
+
+ free( d->wrds );
+ d->wrds = newwrds;
+ d->nwrds = nnw;
+ d->ntwrds = tnm;
+
+ if ( d->nwrds > 1 ) {
+ qsort( d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme );
+
+ /* uniq: newwrds is the last element kept, ptrwrds the one examined */
+ newwrds = d->wrds;
+ ptrwrds = d->wrds + 1;
+ while( ptrwrds - d->wrds < d->nwrds ) {
+ if ( cmpLexeme( ptrwrds, newwrds ) == 0 ) {
+ /* same lexeme: push a differing record onto the kept chain, drop an exact duplicate */
+ if ( cmpLexemeInfo(ptrwrds->entries, newwrds->entries) ) {
+ ptrwrds->entries->nextentry = newwrds->entries;
+ newwrds->entries = ptrwrds->entries;
+ } else
+ free( ptrwrds->entries );
+
+ if ( ptrwrds->lexeme )
+ free( ptrwrds->lexeme );
+ } else {
+ /* new lexeme: move it right behind the last kept element */
+ newwrds++;
+ *newwrds = *ptrwrds;
+ }
+
+ ptrwrds++;
+ }
+
+ /* NOTE(review): the result of this shrinking realloc() is not checked and ntwrds is left unchanged */
+ d->nwrds = newwrds - d->wrds + 1;
+ d->wrds = (TheLexeme*)realloc( d->wrds, sizeof(TheLexeme) * d->nwrds );
+ }
+}
+
+/*
+ * Normalize the substitution (right-hand side) of every thesaurus entry.
+ *
+ * The raw words collected by addWrd() are passed one by one through the
+ * subdictionary's lexize method; the returned lexemes are copied (strings
+ * strdup'ed) into a new res array and their number is stored in reslen.
+ * Words for which the subdictionary returns NULL produce no output.  The
+ * raw words and the old res array are freed.
+ */
+static void
+compileTheSubstitute(DictThesaurus *d) {
+ int i;
+
+ for(i=0;i<d->nsubst;i++) {
+ /* rem: raw array to release at the end; outptr: next free output slot */
+ TSLexeme *rem = d->subst[i].res, *outptr, *inptr;
+ int n=2;
+
+ outptr = d->subst[i].res = (TSLexeme*)malloc( sizeof(TSLexeme) * n );
+ if ( d->subst[i].res == NULL )
+ elog(ERROR,"Out of Memory");
+ outptr->lexeme = NULL;
+ inptr = rem;
+
+ while( inptr && inptr->lexeme ) {
+ /* NULL as 4th argument: no DictSubState is offered to the subdictionary */
+ TSLexeme *reml, *lexized = (TSLexeme*) DatumGetPointer(
+ FunctionCall4(
+ &(d->subdict.lexize_info),
+ PointerGetDatum(d->subdict.dictionary),
+ PointerGetDatum(inptr->lexeme),
+ Int32GetDatum(strlen(inptr->lexeme)),
+ PointerGetDatum(NULL)
+ )
+ );
+
+ reml = lexized;
+ if ( lexized ) {
+ /*
+ * Index of the first output lexeme produced for this word, or -1
+ * when the word produces nothing or is the first one of the
+ * substitution; that lexeme is flagged TSL_ADDPOS below.
+ */
+ int toset = (lexized->lexeme && outptr != d->subst[i].res ) ? (outptr - d->subst[i].res) : -1;
+
+ while( lexized->lexeme ) {
+ /* grow the output array; outptr must be re-based after realloc() */
+ if ( outptr - d->subst[i].res + 1 >= n ) {
+ int diff = outptr - d->subst[i].res;
+ n *= 2;
+ d->subst[i].res = (TSLexeme*)realloc( d->subst[i].res, sizeof(TSLexeme) * n );
+ if ( d->subst[i].res == NULL )
+ elog(ERROR,"Out of Memory");
+ outptr = d->subst[i].res + diff;
+ }
+
+ /* copy nvariant and flags as returned, own the string */
+ *outptr = *lexized;
+ if ( (outptr->lexeme = strdup(lexized->lexeme)) == NULL )
+ elog(ERROR,"Out of Memory");
+
+ outptr++;
+ lexized++;
+ }
+
+ if ( toset > 0)
+ d->subst[i].res[toset].flags |= TSL_ADDPOS;
+ }
+
+ if ( inptr->lexeme )
+ free( inptr->lexeme );
+ inptr++;
+ }
+
+ /* the array is not NULL-terminated here; readers rely on reslen */
+ d->subst[i].reslen = outptr - d->subst[i].res;
+
+ free(rem);
+ }
+}
+
+/*
+ * Init method of the thesaurus dictionary.
+ *
+ * Argument 0 is the configuration text, parsed by parse_cfgdict() into
+ * key/value pairs.  Two options are recognized (case-insensitively) and
+ * both are required, each at most once:
+ *     DictFile   - thesaurus file, loaded with thesaurusRead()
+ *     Dictionary - name of the subdictionary used to normalize words
+ *
+ * Returns a pointer to a malloc'ed DictThesaurus.  Every configuration
+ * error is reported with ereport(ERROR); the descriptor is released with
+ * freeDictThesaurus() first on each of these paths, since malloc'ed memory
+ * is not reclaimed by error cleanup.
+ */
+Datum
+thesaurus_init(PG_FUNCTION_ARGS)
+{
+	DictThesaurus *d;
+	Map		   *cfg,
+			   *pcfg;
+	text	   *in,
+			   *subdictname = NULL;
+	bool		fileloaded = false;
+	DictInfo   *subdictptr;
+
+	if (PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_CONFIG_FILE_ERROR),
+				 errmsg("Thesaurus confguration error")));
+
+	d = (DictThesaurus *) malloc(sizeof(DictThesaurus));
+	if (!d)
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+	memset(d, 0, sizeof(DictThesaurus));
+
+	in = PG_GETARG_TEXT_P(0);
+	parse_cfgdict(in, &cfg);
+	PG_FREE_IF_COPY(in, 0);
+	pcfg = cfg;
+	while (pcfg->key)
+	{
+		if (pg_strcasecmp("DictFile", pcfg->key) == 0)
+		{
+			if (fileloaded)
+			{
+				freeDictThesaurus(d);
+				ereport(ERROR,
+						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+						 errmsg("Thesaurus file is already loaded")));
+			}
+			fileloaded = true;
+			thesaurusRead(pcfg->value, d);
+		}
+		else if (pg_strcasecmp("Dictionary", pcfg->key) == 0)
+		{
+			if (subdictname)
+			{
+				freeDictThesaurus(d);
+				ereport(ERROR,
+						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+						 errmsg("Thesaurus: SubDictionary is already defined")));
+			}
+			subdictname = char2text(pcfg->value);
+		}
+		else
+		{
+			freeDictThesaurus(d);
+			ereport(ERROR,
+					(errcode(ERRCODE_SYNTAX_ERROR),
+					 errmsg("unrecognized option: %s => %s",
+							pcfg->key, pcfg->value)));
+		}
+		pfree(pcfg->key);
+		pfree(pcfg->value);
+		pcfg++;
+	}
+	pfree(cfg);
+
+	if (!fileloaded)
+	{
+		freeDictThesaurus(d);
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("Thesaurus file isn't defined")));
+	}
+
+	if (!subdictname)
+	{
+		freeDictThesaurus(d);
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("Thesaurus: SubDictionary isn't defined")));
+	}
+
+	/*
+	 * we already in SPI, but name2id_dict()/finddict()
+	 * invoke SPI_connect()
+	 */
+	SPI_push();
+
+	subdictptr = finddict(name2id_dict(subdictname));
+
+	SPI_pop();
+
+	/* keep a private copy of the subdictionary's descriptor */
+	d->subdict = *subdictptr;
+
+	compileTheLexeme(d);
+	compileTheSubstitute(d);
+
+	PG_RETURN_POINTER(d);
+}
+
+/*
+ * Look a lexeme up in the sorted array d->wrds (binary search on the text;
+ * a NULL lexeme is a valid key).  Returns the head of the matching
+ * element's LexemeInfo chain, or NULL if the lexeme is not present.
+ */
+static LexemeInfo *
+findTheLexeme(DictThesaurus *d, char *lexeme)
+{
+	TheLexeme	probe;
+	TheLexeme  *hit;
+
+	if (d->nwrds == 0)
+		return NULL;
+
+	probe.lexeme = lexeme;
+	probe.entries = NULL;
+
+	hit = bsearch(&probe, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);
+
+	return (hit != NULL) ? hit->entries : NULL;
+}
+
+/*
+ * Does the nextvariant list starting at "stored" contain a record for
+ * entry idsubst?  An empty list (stored == NULL) matches everything.
+ */
+static bool
+matchIdSubst(LexemeInfo *stored, uint16 idsubst)
+{
+	LexemeInfo *cur;
+
+	if (stored == NULL)
+		return true;
+
+	for (cur = stored; cur != NULL; cur = cur->nextvariant)
+	{
+		if (cur->idsubst == idsubst)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Collect the thesaurus entries that the current word can belong to.
+ *
+ * newin[0..newn-1] are the nextentry chains of the newn lexemes of one
+ * variant of the word.  The function looks for idsubst values for which
+ * every chain holds a record with posinsubst == curpos and
+ * tnvariant == newn.  Each such record that is also accepted by "stored"
+ * (see matchIdSubst(); an empty stored list accepts everything) and is not
+ * yet present in "in" is pushed onto the front of the "in" list through
+ * its nextvariant link.
+ *
+ * The chains in newin[] are advanced in place.  The function returns the
+ * (possibly extended) "in" list as soon as one of the chains is exhausted.
+ * NOTE(review): the skipping logic presumably relies on each chain being
+ * ordered by ascending idsubst, as built by compileTheLexeme() -- confirm.
+ */
+static LexemeInfo*
+findVariant( LexemeInfo *in, LexemeInfo *stored, uint16 curpos, LexemeInfo **newin, int newn) {
+ for(;;) {
+ int i;
+ /* ptr: current candidate record, defines the idsubst looked for */
+ LexemeInfo *ptr = newin[0];
+
+ for(i=0; i<newn; i++) {
+ /* skip records of entries before the candidate */
+ while(newin[i] && newin[i]->idsubst < ptr->idsubst)
+ newin[i] = newin[i]->nextentry;
+
+ if ( newin[i] == NULL )
+ return in;
+
+ /* this chain is already past the candidate: restart with its entry */
+ if ( newin[i]->idsubst > ptr->idsubst ) {
+ ptr = newin[i];
+ i=-1;
+ continue;
+ }
+
+ /* within the candidate entry look for the right position and variant size */
+ while(newin[i]->idsubst == ptr->idsubst) {
+ if ( newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn ) {
+ ptr = newin[i];
+ break;
+ }
+
+ newin[i] = newin[i]->nextentry;
+ if ( newin[i] == NULL )
+ return in;
+ }
+
+ /* no suitable record in this entry: restart with the next entry */
+ if ( newin[i]->idsubst != ptr->idsubst ) {
+ ptr = newin[i];
+ i=-1;
+ continue;
+ }
+ }
+
+ if ( i==newn && matchIdSubst(stored, ptr->idsubst) && (in==NULL || !matchIdSubst(in, ptr->idsubst)) ) { /* found */
+
+ ptr->nextvariant = in;
+ in = ptr;
+ }
+
+ /* step forward */
+ for(i=0; i<newn; i++)
+ newin[i] = newin[i]->nextentry;
+ }
+
+ /* not reached: the loop above is left only through return */
+ return NULL;
+}
+
+/*
+ * Make a palloc'ed copy of a prepared substitution: reslen elements with
+ * pstrdup'ed lexeme strings, followed by a terminating element whose
+ * lexeme is NULL.
+ */
+static TSLexeme *
+copyTSLexeme(TheSubstitute *ts)
+{
+	TSLexeme   *copy = (TSLexeme *) palloc(sizeof(TSLexeme) * (ts->reslen + 1));
+	uint16		idx;
+
+	for (idx = 0; idx < ts->reslen; idx++)
+	{
+		TSLexeme   *dst = &copy[idx];
+
+		*dst = ts->res[idx];
+		dst->lexeme = pstrdup(ts->res[idx].lexeme);
+	}
+
+	copy[ts->reslen].lexeme = NULL;
+
+	return copy;
+}
+
+/*
+ * Walk the candidate list "info" (linked through nextvariant) and return a
+ * copy of the substitution of the first entry whose last lexeme is at
+ * position curpos, or NULL if there is none.
+ *
+ * *moreres is set to true when any candidate visited on the way (including
+ * the matching one) has a successor in the list, otherwise to false.
+ */
+static TSLexeme *
+checkMatch(DictThesaurus *d, LexemeInfo *info, uint16 curpos, bool *moreres)
+{
+	LexemeInfo *cand;
+
+	*moreres = false;
+
+	for (cand = info; cand != NULL; cand = cand->nextvariant)
+	{
+		Assert( cand->idsubst < d->nsubst );
+
+		if (cand->nextvariant != NULL)
+			*moreres = true;
+
+		if (d->subst[cand->idsubst].lastlexeme == curpos)
+			return copyTSLexeme(d->subst + cand->idsubst);
+	}
+
+	return NULL;
+}
+
+/*
+ * Lexize method of the thesaurus dictionary.
+ *
+ * Arguments: 0 - the DictThesaurus built by thesaurus_init(); 1, 2 - the
+ * word and its length, passed on unchanged to the subdictionary; 3 - the
+ * caller's DictSubState, which is mandatory here.
+ *
+ * dstate->private carries the list of candidate entries (linked through
+ * nextvariant) from one call to the next.  Possible outcomes:
+ *   - dstate->isend is set: NULL is returned at once;
+ *   - no entry can match: getnext = false, NULL is returned;
+ *   - an entry ends at the current word: a copy of its substitution is
+ *     returned, getnext tells whether other candidates remain;
+ *   - otherwise: getnext = true and NULL is returned to ask for the next
+ *     word.
+ */
+Datum
+thesaurus_lexize(PG_FUNCTION_ARGS)
+{
+ DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0);
+ DictSubState *dstate = (DictSubState*)PG_GETARG_POINTER(3);
+ TSLexeme *res=NULL;
+ LexemeInfo *stored, *info = NULL;
+ uint16 curpos = 0;
+ bool moreres = false;
+
+ if ( dstate == NULL || PG_NARGS() < 4 )
+ elog(ERROR,"Forbidden call of thesaurus or nested call");
+
+ if ( dstate->isend )
+ PG_RETURN_POINTER(NULL);
+ /* candidates left by the previous call, NULL for the first word of a phrase */
+ stored = (LexemeInfo*) dstate->private;
+
+ /* the current word follows the one matched by the previous call */
+ if (stored)
+ curpos = stored->posinsubst+1;
+
+ /* normalize the word; NULL as 4th argument: no DictSubState for the subdictionary */
+ res =(TSLexeme*) DatumGetPointer (
+ FunctionCall4(
+ &(d->subdict.lexize_info),
+ PointerGetDatum(d->subdict.dictionary),
+ PG_GETARG_DATUM(1),
+ PG_GETARG_INT32(2),
+ PointerGetDatum(NULL)
+ )
+ );
+
+ if ( res && res->lexeme ) {
+ TSLexeme *ptr = res , *basevar;
+
+ /* handle the result one variant (run of equal nvariant) at a time */
+ while( ptr->lexeme ) {
+ uint16 nv = ptr->nvariant;
+ uint16 i,nlex = 0;
+ LexemeInfo **infos;
+
+ basevar = ptr;
+ while( ptr->lexeme && nv == ptr->nvariant ) {
+ nlex++;
+ ptr++;
+ }
+
+ /* every lexeme of the variant must be known to the thesaurus */
+ infos = (LexemeInfo**)palloc(sizeof(LexemeInfo*)*nlex);
+ for(i=0;i<nlex;i++)
+ if ( (infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL )
+ break;
+
+ if ( i<nlex ) {
+ /* no chance to find */
+ pfree( infos );
+ continue;
+ }
+
+ /* add this variant's candidates to the ones found so far */
+ info = findVariant( info, stored, curpos, infos, nlex);
+ }
+
+ } else {
+ /* word not recognized by the subdictionary: look up the NULL lexeme */
+ LexemeInfo *infos = findTheLexeme(d, NULL);
+ info = findVariant( NULL, stored, curpos, &infos, 1);
+ }
+
+ /* remember the candidates for the next call */
+ dstate->private = (void*)info;
+
+ if ( !info ) {
+ dstate->getnext = false;
+ PG_RETURN_POINTER(NULL);
+ }
+
+ if ( (res=checkMatch(d, info, curpos,&moreres)) != NULL ) {
+ dstate->getnext = moreres;
+ PG_RETURN_POINTER(res);
+ }
+
+ /* candidates exist but none is complete yet: ask for the next word */
+ dstate->getnext = true;
+
+ PG_RETURN_POINTER(NULL);
+}
--
\set ECHO none
psql:tsearch2.sql:13: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_dict_pkey" for table "pg_ts_dict"
-psql:tsearch2.sql:158: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_parser_pkey" for table "pg_ts_parser"
-psql:tsearch2.sql:257: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfg_pkey" for table "pg_ts_cfg"
-psql:tsearch2.sql:264: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfgmap_pkey" for table "pg_ts_cfgmap"
-psql:tsearch2.sql:370: NOTICE: type "tsvector" is not yet defined
+psql:tsearch2.sql:177: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_parser_pkey" for table "pg_ts_parser"
+psql:tsearch2.sql:276: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfg_pkey" for table "pg_ts_cfg"
+psql:tsearch2.sql:283: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfgmap_pkey" for table "pg_ts_cfgmap"
+psql:tsearch2.sql:389: NOTICE: type "tsvector" is not yet defined
DETAIL: Creating a shell type definition.
-psql:tsearch2.sql:375: NOTICE: argument type tsvector is only a shell
-psql:tsearch2.sql:429: NOTICE: type "tsquery" is not yet defined
+psql:tsearch2.sql:394: NOTICE: argument type tsvector is only a shell
+psql:tsearch2.sql:448: NOTICE: type "tsquery" is not yet defined
DETAIL: Creating a shell type definition.
-psql:tsearch2.sql:434: NOTICE: argument type tsquery is only a shell
-psql:tsearch2.sql:592: NOTICE: type "gtsvector" is not yet defined
+psql:tsearch2.sql:453: NOTICE: argument type tsquery is only a shell
+psql:tsearch2.sql:611: NOTICE: type "gtsvector" is not yet defined
DETAIL: Creating a shell type definition.
-psql:tsearch2.sql:597: NOTICE: argument type gtsvector is only a shell
-psql:tsearch2.sql:1087: NOTICE: type "gtsq" is not yet defined
+psql:tsearch2.sql:616: NOTICE: argument type gtsvector is only a shell
+psql:tsearch2.sql:1106: NOTICE: type "gtsq" is not yet defined
DETAIL: Creating a shell type definition.
-psql:tsearch2.sql:1092: NOTICE: argument type gtsq is only a shell
+psql:tsearch2.sql:1111: NOTICE: argument type gtsq is only a shell
--tsvector
SELECT '1'::tsvector;
tsvector
*/
#include "postgres.h"
-#include "miscadmin.h"
-
#include "common.h"
#include "dict.h"
#include "ts_locale.h"
s->len = 0;
if (in && VARSIZE(in) - VARHDRSZ > 0)
{
- char *filename = text2char(in);
+ char *filename = to_absfilename(text2char(in));
FILE *hin;
char buf[STOPBUFLEN];
int reallen = 0;
- /* if path is relative, take it as relative to share dir */
- if (!is_absolute_path(filename))
- {
- char sharepath[MAXPGPATH];
- char *absfn;
-#ifdef WIN32
- char delim = '\\';
-#else
- char delim = '/';
-#endif
-
- get_share_path(my_exec_path, sharepath);
- absfn = palloc(strlen(sharepath) + strlen(filename) + 2);
- sprintf(absfn, "%s%c%s", sharepath, delim, filename);
-
- pfree(filename);
- filename = absfn;
- }
-
if ((hin = fopen(filename, "r")) == NULL)
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
--- /dev/null
+#
+# Thesaurus config file. The character ':' splits
+# each line into two parts:
+# the string to be substituted
+# the substituting string
+#
+
+#one two three : 123
+#one two : 12
+#one : 1
+#two : 2
+
+#foo bar : blah blah
+#f bar : fbar
+#e bar : ebar
+#g bar bar : gbarbar
+#asd:sdffff
+#qwerty:qwer wert erty
+
return id;
}
-
void
parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
{
int type,
- lenlemm,
- i;
+ lenlemm;
char *lemm = NULL;
WParserInfo *prsobj = findprs(cfg->prs_id);
+ LexizeData ldata;
+ TSLexeme *norms;
prsobj->prs = (void *) DatumGetPointer(
FunctionCall2(
)
);
- while ((type = DatumGetInt32(FunctionCall3(
+ LexizeInit(&ldata, cfg);
+
+ do {
+ type = DatumGetInt32(FunctionCall3(
&(prsobj->getlexeme_info),
PointerGetDatum(prsobj->prs),
PointerGetDatum(&lemm),
- PointerGetDatum(&lenlemm)))) != 0)
- {
+ PointerGetDatum(&lenlemm)));
- if (lenlemm >= MAXSTRLEN)
+ if (type>0 && lenlemm >= MAXSTRLEN)
{
#ifdef IGNORE_LONGLEXEME
ereport(NOTICE,
#endif
}
- if (type >= cfg->len) /* skip this type of lexeme */
- continue;
+ LexizeAddLemm(&ldata, type, lemm, lenlemm);
- for (i = 0; i < cfg->map[type].len; i++)
+ while( (norms = LexizeExec(&ldata, NULL)) != NULL )
{
- DictInfo *dict = finddict(DatumGetObjectId(cfg->map[type].dict_id[i]));
- TSLexeme *norms,
- *ptr;
-
- norms = ptr = (TSLexeme *) DatumGetPointer(
- FunctionCall3(
- &(dict->lexize_info),
- PointerGetDatum(dict->dictionary),
- PointerGetDatum(lemm),
- PointerGetDatum(lenlemm)
- )
- );
- if (!norms) /* dictionary doesn't know this lexeme */
- continue;
+ TSLexeme *ptr = norms;
prs->pos++; /* set pos */
prs->words = (TSWORD *) repalloc((void *) prs->words, prs->lenwords * sizeof(TSWORD));
}
+ if ( ptr->flags & TSL_ADDPOS )
+ prs->pos++;
prs->words[prs->curwords].len = strlen(ptr->lexeme);
prs->words[prs->curwords].word = ptr->lexeme;
prs->words[prs->curwords].nvariant = ptr->nvariant;
prs->curwords++;
}
pfree(norms);
- break; /* lexeme already normalized or is stop word */
- }
}
+ } while(type>0);
FunctionCall1(
&(prsobj->end_info),
}
}
+/*
+ * Feed a list of parsed lexemes and their normalized result into the
+ * headline text.
+ *
+ * For every node of "lexs": the lexeme is added with hladdword() if its
+ * type is positive, then each lexeme of "norms" (may be NULL) is looked up
+ * in the query with hlfinditem().  The nodes of "lexs", the lexeme strings
+ * of "norms" and the "norms" array itself are pfree'd.
+ */
+static void
+addHLParsedLex(HLPRSTEXT *prs, QUERYTYPE * query, ParsedLex *lexs, TSLexeme *norms)
+{
+	TSLexeme   *cur;
+
+	while (lexs != NULL)
+	{
+		ParsedLex  *following;
+
+		if (lexs->type > 0)
+			hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
+
+		for (cur = norms; cur && cur->lexeme; cur++)
+			hlfinditem(prs, query, cur->lexeme, strlen(cur->lexeme));
+
+		/* fetch the link before the node goes away */
+		following = lexs->next;
+		pfree( lexs );
+		lexs = following;
+	}
+
+	if (norms == NULL)
+		return;
+
+	for (cur = norms; cur->lexeme; cur++)
+		pfree( cur->lexeme );
+	pfree(norms);
+}
+
+
void
hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4 buflen)
{
int type,
- lenlemm,
- i;
+ lenlemm;
char *lemm = NULL;
WParserInfo *prsobj = findprs(cfg->prs_id);
+ LexizeData ldata;
+ TSLexeme *norms;
+ ParsedLex *lexs;
prsobj->prs = (void *) DatumGetPointer(
FunctionCall2(
)
);
- while ((type = DatumGetInt32(FunctionCall3(
+ LexizeInit(&ldata, cfg);
+
+ do {
+ type = DatumGetInt32(FunctionCall3(
&(prsobj->getlexeme_info),
PointerGetDatum(prsobj->prs),
PointerGetDatum(&lemm),
- PointerGetDatum(&lenlemm)))) != 0)
- {
+ PointerGetDatum(&lenlemm)));
- if (lenlemm >= MAXSTRLEN)
+ if (type>0 && lenlemm >= MAXSTRLEN)
{
#ifdef IGNORE_LONGLEXEME
ereport(NOTICE,
#endif
}
- hladdword(prs, lemm, lenlemm, type);
+ LexizeAddLemm(&ldata, type, lemm, lenlemm);
- if (type >= cfg->len)
- continue;
+ do {
+ if ( (norms = LexizeExec(&ldata,&lexs)) != NULL )
+ addHLParsedLex(prs, query, lexs, norms);
+ else
+ addHLParsedLex(prs, query, lexs, NULL);
+ } while( norms );
- for (i = 0; i < cfg->map[type].len; i++)
- {
- DictInfo *dict = finddict(DatumGetObjectId(cfg->map[type].dict_id[i]));
- TSLexeme *norms,
- *ptr;
-
- norms = ptr = (TSLexeme *) DatumGetPointer(
- FunctionCall3(
- &(dict->lexize_info),
- PointerGetDatum(dict->dictionary),
- PointerGetDatum(lemm),
- PointerGetDatum(lenlemm)
- )
- );
- if (!norms) /* dictionary doesn't know this lexeme */
- continue;
-
- while (ptr->lexeme)
- {
- hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
- pfree(ptr->lexeme);
- ptr++;
- }
- pfree(norms);
- break; /* lexeme already normalized or is stop word */
- }
- }
+ } while( type>0 );
FunctionCall1(
&(prsobj->end_info),
--- /dev/null
+/*
+ * lexize stream of lexemes
+ * Teodor Sigaev <teodor@sigaev.ru>
+ */
+#include "postgres.h"
+
+#include <ctype.h>
+#include <locale.h>
+
+#include "ts_cfg.h"
+#include "dict.h"
+
+/*
+ * Put "ld" into its initial state for configuration "cfg": no multiword
+ * dictionary selected, empty work and waste lists, no stored temporary
+ * result.
+ */
+void
+LexizeInit(LexizeData *ld, TSCfgInfo *cfg) {
+	ld->cfg = cfg;
+
+	/* start in single-word mode at the first dictionary */
+	ld->curDictId = InvalidOid;
+	ld->posDict = 0;
+
+	/* both lexeme lists start out empty */
+	ld->curSub = NULL;
+	ld->towork.tail = NULL;
+	ld->towork.head = NULL;
+	ld->waste.tail = NULL;
+	ld->waste.head = NULL;
+
+	/* nothing remembered from a previous multiword attempt */
+	ld->lastRes = NULL;
+	ld->tmpRes = NULL;
+}
+
+/*
+ * Append "newpl" to the end of "list".  It becomes the new tail (and the
+ * head too if the list was empty) and its next pointer is cleared.
+ */
+static void
+LPLAddTail(ListParsedLex *list, ParsedLex *newpl) {
+	if (list->tail == NULL)
+		list->head = newpl;		/* list was empty */
+	else
+		list->tail->next = newpl;
+
+	list->tail = newpl;
+	newpl->next = NULL;
+}
+
+/*
+ * Unlink and return the first element of "list", or NULL if the list is
+ * empty.  The tail pointer is cleared whenever the list ends up empty.
+ */
+static ParsedLex*
+LPLRemoveHead(ListParsedLex *list) {
+	ParsedLex  *first = list->head;
+
+	if (first != NULL)
+		list->head = first->next;
+
+	if (list->head == NULL)
+		list->tail = NULL;
+
+	return first;
+}
+
+
+/*
+ * Queue one parsed lexeme at the tail of ld->towork and make it the current
+ * sub-lexeme (ld->curSub).
+ *
+ * type		lexeme type as reported by the parser
+ * lemm		pointer to the lexeme text; only the pointer is stored, the text
+ *			is not copied
+ * lenlemm	length of the lexeme text
+ */
+void
+LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm) {
+	/* allocate exactly one node; a second palloc here would orphan the first */
+	ParsedLex *newpl = (ParsedLex*)palloc( sizeof(ParsedLex) );
+
+	newpl->type = type;
+	newpl->lemm = lemm;
+	newpl->lenlemm = lenlemm;
+	LPLAddTail(&ld->towork, newpl);		/* also clears newpl->next */
+	ld->curSub = ld->towork.tail;
+}
+
+/*
+ * Move the first lexeme of ld->towork to the end of ld->waste, and reset
+ * ld->posDict so dictionary scanning starts again from index 0.
+ */
+static void
+RemoveHead(LexizeData *ld) {
+	ParsedLex  *done = LPLRemoveHead(&ld->towork);
+
+	LPLAddTail(&ld->waste, done);
+	ld->posDict = 0;
+}
+
+/*
+ * Dispose of the waste list: hand its head to the caller through
+ * *correspondLexem when that pointer is given, otherwise pfree every node.
+ * Either way ld->waste is left empty.
+ */
+static void
+setCorrLex(LexizeData *ld, ParsedLex **correspondLexem) {
+	ParsedLex  *node = ld->waste.head;
+
+	if (correspondLexem != NULL)
+		*correspondLexem = node;	/* caller takes over the list */
+	else {
+		while (node != NULL) {
+			ParsedLex  *following = node->next;
+
+			pfree(node);
+			node = following;
+		}
+	}
+
+	ld->waste.tail = NULL;
+	ld->waste.head = NULL;
+}
+
+/*
+ * Move lexemes from the front of ld->towork to ld->waste, up to and
+ * including "stop".  When "stop" is reached, ld->curSub is set to the lexeme
+ * following it.  If "stop" is never met, the whole work list is moved.
+ */
+static void
+moveToWaste(LexizeData *ld, ParsedLex *stop) {
+	while (ld->towork.head != NULL) {
+		bool		reached = (ld->towork.head == stop);
+
+		/* must read stop->next before RemoveHead() relinks the node */
+		if (reached)
+			ld->curSub = stop->next;
+
+		RemoveHead(ld);
+
+		if (reached)
+			break;
+	}
+}
+
+/*
+ * Replace the stored temporary result with "res" and record "lex" as the
+ * lexeme it belongs to.  A previously stored result (array terminated by a
+ * NULL lexeme) is pfree'd together with its lexeme strings.
+ */
+static void
+setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res) {
+	TSLexeme   *old = ld->tmpRes;
+
+	if (old != NULL) {
+		TSLexeme   *item = old;
+
+		while (item->lexeme) {
+			pfree(item->lexeme);
+			item++;
+		}
+		pfree(old);
+	}
+
+	ld->tmpRes = res;
+	ld->lastRes = lex;
+}
+
+/*
+ * Run the dictionaries of ld->cfg over the lexemes queued in ld->towork and
+ * return a dictionary result, or NULL once the queue has been worked through
+ * without one.
+ *
+ * There are two modes, selected by ld->curDictId:
+ *  - InvalidOid: the head of ld->towork is offered to the dictionaries
+ *    mapped to its type, starting at index ld->posDict.
+ *  - a valid Oid: that dictionary set dictState.getnext earlier, so the
+ *    following lexemes (from ld->curSub on) are fed to the same dictionary.
+ *
+ * Before returning, setCorrLex() passes the lexemes collected in ld->waste
+ * to *correspondLexem, or frees them when correspondLexem is NULL.
+ */
+TSLexeme*
+LexizeExec(LexizeData *ld, ParsedLex **correspondLexem) {
+ int i;
+ ListDictionary *map;
+ DictInfo *dict;
+ TSLexeme *res;
+
+ if ( ld->curDictId == InvalidOid ) {
+ /*
+ * usual mode: dictionary wants only one word,
+ * but we should keep in mind that we should go through
+ * the whole stack
+ */
+
+ while( ld->towork.head ) {
+ ParsedLex *curVal = ld->towork.head;
+
+ map = ld->cfg->map + curVal->type;
+
+ if (curVal->type == 0 || curVal->type >= ld->cfg->len || map->len == 0 ) {
+ /* skip this type of lexeme */
+ RemoveHead(ld);
+ continue;
+ }
+
+ /* resume at posDict: earlier dictionaries were already tried for this lexeme */
+ for (i = ld->posDict; i < map->len; i++) {
+ dict = finddict(DatumGetObjectId(map->dict_id[i]));
+
+ ld->dictState.isend = ld->dictState.getnext = false;
+ ld->dictState.private = NULL;
+ res = (TSLexeme *) DatumGetPointer( FunctionCall4(
+ &(dict->lexize_info),
+ PointerGetDatum(dict->dictionary),
+ PointerGetDatum(curVal->lemm),
+ Int32GetDatum(curVal->lenlemm),
+ PointerGetDatum(&ld->dictState)
+ ));
+
+ if ( ld->dictState.getnext ) {
+ /*
+ * dictionary wants the next word, so set up and store
+ * the current position and go to multiword mode
+ */
+
+ ld->curDictId = DatumGetObjectId(map->dict_id[i]);
+ /* where single-word mode resumes if the multiword attempt fails */
+ ld->posDict = i+1;
+ ld->curSub = curVal->next;
+ if ( res )
+ setNewTmpRes(ld, curVal, res);
+ return LexizeExec(ld, correspondLexem);
+ }
+
+ if (!res) /* dictionary doesn't know this lexeme */
+ continue;
+
+ RemoveHead(ld);
+ setCorrLex(ld, correspondLexem);
+ return res;
+ }
+
+ /* no dictionary produced a result for this lexeme */
+ RemoveHead(ld);
+ }
+ } else { /* curDictId is valid */
+ dict = finddict(ld->curDictId);
+
+ /*
+ * Dictionary ld->curDictId asks us about the following words
+ */
+
+ while( ld->curSub ) {
+ ParsedLex *curVal = ld->curSub;
+
+ map = ld->cfg->map + curVal->type;
+
+ if (curVal->type != 0) {
+ bool dictExists = false;
+
+ if (curVal->type >= ld->cfg->len || map->len == 0 ) {
+ /* skip this type of lexeme */
+ ld->curSub = curVal->next;
+ continue;
+ }
+
+ /*
+ * We should be sure that the current type of lexeme is recognized
+ * by our dictionary: we just check whether it exists in
+ * the list of dictionaries for that type
+ */
+ for(i=0;i < map->len && !dictExists; i++)
+ if ( ld->curDictId == DatumGetObjectId(map->dict_id[i]) )
+ dictExists = true;
+
+ if ( !dictExists ) {
+ /*
+ * Dictionary can't work with the current type of lexeme,
+ * return to basic mode and redo all stored lexemes
+ */
+ ld->curDictId = InvalidOid;
+ return LexizeExec(ld, correspondLexem);
+ }
+ }
+
+ /* a lexeme of type 0 is passed to the dictionary with isend set */
+ ld->dictState.isend = (curVal->type==0) ? true : false;
+ ld->dictState.getnext = false;
+
+ res = (TSLexeme *) DatumGetPointer( FunctionCall4(
+ &(dict->lexize_info),
+ PointerGetDatum(dict->dictionary),
+ PointerGetDatum(curVal->lemm),
+ Int32GetDatum(curVal->lenlemm),
+ PointerGetDatum(&ld->dictState)
+ ));
+
+ if ( ld->dictState.getnext ) {
+ /* Dictionary wants one more */
+ ld->curSub = curVal->next;
+ if ( res )
+ setNewTmpRes(ld, curVal, res);
+ continue;
+ }
+
+ if ( res || ld->tmpRes ) {
+ /*
+ * Dictionary normalized the lexemes,
+ * so we remove all used lexemes from the stack,
+ * return to basic mode and redo the end of the stack (if it exists)
+ */
+ if ( res ) {
+ /* fresh result: consume everything up to and including curSub */
+ moveToWaste( ld, ld->curSub );
+ } else {
+ /* fall back to the stored result and consume only up to its lexeme */
+ res = ld->tmpRes;
+ moveToWaste( ld, ld->lastRes );
+ }
+
+ /* reset to initial state */
+ ld->curDictId = InvalidOid;
+ ld->posDict = 0;
+ ld->lastRes = NULL;
+ ld->tmpRes = NULL;
+ setCorrLex(ld, correspondLexem);
+ return res;
+ }
+
+ /* Dictionary doesn't want the next lexeme and didn't recognize anything,
+ redo from ld->towork.head */
+ ld->curDictId = InvalidOid;
+ return LexizeExec(ld, correspondLexem);
+ }
+ }
+
+ /* nothing (more) to return: release or hand over the consumed lexemes */
+ setCorrLex(ld, correspondLexem);
+ return NULL;
+}
+
'Example of synonym dictionary'
;
+-- C-language entry points of the thesaurus dictionary
+CREATE FUNCTION thesaurus_init(internal)
+	RETURNS internal
+	AS 'MODULE_PATHNAME'
+	LANGUAGE C;
+
+CREATE FUNCTION thesaurus_lexize(internal,internal,int4,internal)
+	RETURNS internal
+	AS 'MODULE_PATHNAME'
+	LANGUAGE C
+	STRICT;
+
+-- add the thesaurus template row to pg_ts_dict
+INSERT INTO pg_ts_dict SELECT
+	'thesaurus_template',
+	'thesaurus_init(internal)',
+	NULL,
+	'thesaurus_lexize(internal,internal,int4,internal)',
+	'Thesaurus template, must be pointed Dictionary and DictFile'
+;
+
--dict conf
CREATE TABLE pg_ts_parser (
prs_name text not null primary key,
--example of ISpell dictionary
--update pg_ts_dict set dict_initoption='DictFile="/usr/local/share/ispell/russian.dict" ,AffFile ="/usr/local/share/ispell/russian.aff", StopFile="/usr/local/share/ispell/russian.stop"' where dict_name='ispell_template';
+
--example of synonym dict
---update pg_ts_dict set dict_initoption='/usr/local/share/ispell/english.syn' where dict_id=5;
+--update pg_ts_dict set dict_initoption='/usr/local/share/ispell/english.syn' where dict_name='synonym';
+--example of thesaurus dict
+--update pg_ts_dict set dict_initoption='DictFile="contrib/thesaurus", Dictionary="en_stem"' where dict_name='thesaurus_template';
+--update pg_ts_cfgmap set dict_name = '{thesaurus_template,en_stem}' where dict_name = '{en_stem}';
END;
DROP FUNCTION snb_ru_init(internal);
DROP FUNCTION spell_init(internal);
DROP FUNCTION spell_lexize(internal,internal,int4);
+DROP FUNCTION thesaurus_init(internal);
+-- argument list must match the four-argument CREATE FUNCTION thesaurus_lexize
+DROP FUNCTION thesaurus_lexize(internal,internal,int4,internal);
DROP FUNCTION syn_init(internal);
DROP FUNCTION syn_lexize(internal,internal,int4);
DROP FUNCTION set_curprs(int);