From c52795d18a698d25b9cd7cd1ca9318a42b08fdb9 Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Mon, 21 Nov 2005 12:27:57 +0000 Subject: [PATCH] Text parser rewritten: - supports multibyte encodings - more strict rules for lexemes - flex isn't used Add: - tsquery plainto_tsquery(text) Function makes tsquery from plain text. - &&, ||, !! operation for tsquery for combining tsquery from it's parts: 'foo & bar' || 'asd' => 'foo & bar | asd' --- contrib/tsearch2/Makefile | 5 +- contrib/tsearch2/expected/tsearch2.out | 119 +-- contrib/tsearch2/query.c | 82 +- contrib/tsearch2/query_support.c | 111 +++ contrib/tsearch2/sql/tsearch2.sql | 7 + contrib/tsearch2/ts_locale.c | 61 ++ contrib/tsearch2/ts_locale.h | 38 + contrib/tsearch2/tsearch.sql.in | 56 ++ contrib/tsearch2/wordparser/Makefile | 11 +- contrib/tsearch2/wordparser/deflex.c | 4 +- contrib/tsearch2/wordparser/deflex.h | 2 +- contrib/tsearch2/wordparser/parser.c | 1028 ++++++++++++++++++++++++ contrib/tsearch2/wordparser/parser.h | 147 +++- contrib/tsearch2/wordparser/parser.l | 346 -------- contrib/tsearch2/wparser_def.c | 20 +- 15 files changed, 1613 insertions(+), 424 deletions(-) create mode 100644 contrib/tsearch2/ts_locale.c create mode 100644 contrib/tsearch2/ts_locale.h create mode 100644 contrib/tsearch2/wordparser/parser.c delete mode 100644 contrib/tsearch2/wordparser/parser.l diff --git a/contrib/tsearch2/Makefile b/contrib/tsearch2/Makefile index 4901b611ee..2ef904ddb4 100644 --- a/contrib/tsearch2/Makefile +++ b/contrib/tsearch2/Makefile @@ -1,4 +1,4 @@ -# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.11 2005/11/08 17:08:46 teodor Exp $ +# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.12 2005/11/21 12:27:57 teodor Exp $ MODULE_big = tsearch2 OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \ @@ -6,7 +6,8 @@ OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \ wparser.o wparser_def.o \ ts_cfg.o tsvector.o query_cleanup.o crc32.o query.o gistidx.o \ tsvector_op.o rank.o 
ts_stat.o \ - query_util.o query_support.o query_rewrite.o query_gist.o + query_util.o query_support.o query_rewrite.o query_gist.o \ + ts_locale.o SUBDIRS := snowball ispell wordparser SUBDIROBJS := $(SUBDIRS:%=%/SUBSYS.o) diff --git a/contrib/tsearch2/expected/tsearch2.out b/contrib/tsearch2/expected/tsearch2.out index 296c0ac676..a98c2216a8 100644 --- a/contrib/tsearch2/expected/tsearch2.out +++ b/contrib/tsearch2/expected/tsearch2.out @@ -13,12 +13,12 @@ psql:tsearch2.sql:342: NOTICE: argument type tsvector is only a shell psql:tsearch2.sql:396: NOTICE: type "tsquery" is not yet defined DETAIL: Creating a shell type definition. psql:tsearch2.sql:401: NOTICE: argument type tsquery is only a shell -psql:tsearch2.sql:544: NOTICE: type "gtsvector" is not yet defined +psql:tsearch2.sql:559: NOTICE: type "gtsvector" is not yet defined DETAIL: Creating a shell type definition. -psql:tsearch2.sql:549: NOTICE: argument type gtsvector is only a shell -psql:tsearch2.sql:998: NOTICE: type "gtsq" is not yet defined +psql:tsearch2.sql:564: NOTICE: argument type gtsvector is only a shell +psql:tsearch2.sql:1054: NOTICE: type "gtsq" is not yet defined DETAIL: Creating a shell type definition. 
-psql:tsearch2.sql:1003: NOTICE: argument type gtsq is only a shell +psql:tsearch2.sql:1059: NOTICE: argument type gtsq is only a shell --tsvector SELECT '1'::tsvector; tsvector @@ -653,7 +653,7 @@ select * from token_type('default'); 11 | lpart_hword | Latin part of hyphenated word 12 | blank | Space symbols 13 | tag | HTML Tag - 14 | http | HTTP head + 14 | protocol | Protocol head 15 | hword | Hyphenated word 16 | lhword | Latin hyphenated word 17 | nlhword | Non-latin hyphenated word @@ -672,14 +672,13 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc -------+-------------------------------------- 22 | 345 12 | - 4 | qwe@efd.r - 12 | - 12 | ' - 12 | + 1 | qwe + 12 | @ + 19 | efd.r + 12 | ' 14 | http:// 6 | www.com - 12 | / - 12 | + 12 | / 14 | http:// 5 | aew.werc.ewr/?ad=qwe&dw 6 | aew.werc.ewr @@ -700,10 +699,8 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc 6 | 4aew.werc.ewr 12 | 14 | http:// - 5 | 5aew.werc.ewr:8100/? - 6 | 5aew.werc.ewr - 18 | :8100/? - 12 | + 6 | 5aew.werc.ewr:8100 + 12 | /? 1 | ad 12 | = 1 | qwe @@ -711,12 +708,12 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc 1 | dw 12 | 5 | 6aew.werc.ewr:8100/?ad=qwe&dw - 6 | 6aew.werc.ewr - 18 | :8100/?ad=qwe&dw + 6 | 6aew.werc.ewr:8100 + 18 | /?ad=qwe&dw 12 | 5 | 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 - 6 | 7aew.werc.ewr - 18 | :8100/?ad=qwe&dw=%20%32 + 6 | 7aew.werc.ewr:8100 + 18 | /?ad=qwe&dw=%20%32 12 | 7 | +4.0e-10 12 | @@ -747,11 +744,15 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc 1 | jf 12 | 1 | sdjk - 13 | + 12 | < + 1 | we 12 | - 3 | ewr1 - 12 | > + 1 | hjwer + 12 | + 13 | 12 | + 3 | ewr1 + 12 | > 3 | ewri2 12 | 13 | @@ -767,57 +768,53 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc 12 | 19 | /wqe-324/ewr 12 | - 6 | gist.h - 12 | - 6 | gist.h.c + 19 | gist.h 12 | - 6 | gist.c - 12 | . 
+ 19 | gist.h.c 12 | + 19 | gist.c + 12 | . 1 | readline 12 | 20 | 4.2 12 | 20 | 4.2 - 12 | . - 12 | + 12 | . 20 | 4.2 - 12 | , - 12 | - 15 | readline-4 + 12 | , + 15 | readline-4.2 11 | readline 12 | - 20 | 4.2 12 | - 15 | readline-4 + 15 | readline-4.2 11 | readline 12 | - 20 | 4.2 - 12 | . - 12 | + 12 | . 22 | 234 12 | - 13 | + 12 | < + 1 | i + 12 | + 13 | 12 | 1 | wow 12 | - 12 | < - 12 | + 12 | < 1 | jqw 12 | - 12 | < - 12 | > - 12 | + 12 | <> 1 | qwerty -(138 rows) +(135 rows) SELECT to_tsvector('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf qwer jf sdjk ewr1> ewri2 /usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 
234 wow < jqw <> qwerty'); - to_tsvectorad':18 'dw':20 'jf':40 '234':62 '345':1 '4.2':53,54,55,58,61 '455':32 'jqw':64 'qwe':19,28,29,36 'wer':37 'wow':63 'asdf':38 'ewr1':42 'qwer':39 'sdjk':41 '5.005':33 'ewri2':43 'qwqwe':30 'wefjn':47 'gist.c':51 'gist.h':49 'qwerti':65 '234.435':31 ':8100/?':17 'qwe-wer':35 'readlin':52,57,60 'www.com':3 '+4.0e-10':27 'gist.h.c':50 'rewt/ewr':46 'qwe@efd.r':2 'readline-4':56,59 '/?ad=qwe&dw':6,9,13 '/wqe-324/ewr':48 'aew.werc.ewr':5 '1aew.werc.ewr':8 '2aew.werc.ewr':10 '3aew.werc.ewr':12 '4aew.werc.ewr':14 '5aew.werc.ewr':16 '6aew.werc.ewr':22 '7aew.werc.ewr':25 '/usr/local/fff':44 '/awdf/dwqe/4325':45 ':8100/?ad=qwe&dw':23 'teodor@stack.net':34 '5aew.werc.ewr:8100/?':15 ':8100/?ad=qwe&dw=%20%32':26 'aew.werc.ewr/?ad=qwe&dw':4 '1aew.werc.ewr/?ad=qwe&dw':7 '3aew.werc.ewr/?ad=qwe&dw':11 '6aew.werc.ewr:8100/?ad=qwe&dw':21 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':24 + to_tsvectorad':17 'dw':19 'jf':39 '234':63 '345':1 '4.2':54,55,56,59,62 '455':31 'jqw':66 'qwe':2,18,27,28,35 'wer':36 'wow':65 'asdf':37 'ewr1':43 'qwer':38 'sdjk':40 '5.005':32 'efd.r':3 'ewri2':44 'hjwer':42 'qwqwe':29 'wefjn':48 'gist.c':52 'gist.h':50 'qwerti':67 '234.435':30 'qwe-wer':34 'readlin':53,58,61 'www.com':4 '+4.0e-10':26 'gist.h.c':51 'rewt/ewr':47 '/?ad=qwe&dw':7,10,14,22 '/wqe-324/ewr':49 'aew.werc.ewr':6 'readline-4.2':57,60 '1aew.werc.ewr':9 '2aew.werc.ewr':11 '3aew.werc.ewr':13 '4aew.werc.ewr':15 '/usr/local/fff':45 '/awdf/dwqe/4325':46 'teodor@stack.net':33 '/?ad=qwe&dw=%20%32':25 '5aew.werc.ewr:8100':16 '6aew.werc.ewr:8100':21 '7aew.werc.ewr:8100':24 'aew.werc.ewr/?ad=qwe&dw':5 '1aew.werc.ewr/?ad=qwe&dw':8 '3aew.werc.ewr/?ad=qwe&dw':12 '6aew.werc.ewr:8100/?ad=qwe&dw':20 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':23 (1 row) SELECT length(to_tsvector('default', '345 qw')); @@ -831,7 +828,7 @@ SELECT length(to_tsvector('default', '345 qwe@efd.r '' http://www.com/ http://ae wow < jqw <> qwerty')); length -------- - 53 + 51 (1 row) select 
to_tsquery('default', 'qwe & sKies '); @@ -876,6 +873,36 @@ select to_tsquery('default', '(the|and&(i&1))&fghj'); '1' & 'fghj' (1 row) +select plainto_tsquery('default', 'the and z 1))& fghj'); + plainto_tsquery +-------------------- + 'z' & '1' & 'fghj' +(1 row) + +select plainto_tsquery('default', 'foo bar') && plainto_tsquery('default', 'asd'); + ?column? +----------------------- + 'foo' & 'bar' & 'asd' +(1 row) + +select plainto_tsquery('default', 'foo bar') || plainto_tsquery('default', 'asd fg'); + ?column? +------------------------------ + 'foo' & 'bar' | 'asd' & 'fg' +(1 row) + +select plainto_tsquery('default', 'foo bar') || !!plainto_tsquery('default', 'asd fg'); + ?column? +----------------------------------- + 'foo' & 'bar' | !( 'asd' & 'fg' ) +(1 row) + +select plainto_tsquery('default', 'foo bar') && 'asd | fg'; + ?column? +---------------------------------- + 'foo' & 'bar' & ( 'asd' | 'fg' ) +(1 row) + select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca'; ?column? ---------- diff --git a/contrib/tsearch2/query.c b/contrib/tsearch2/query.c index e6f1ae3a89..e312cf6af7 100644 --- a/contrib/tsearch2/query.c +++ b/contrib/tsearch2/query.c @@ -51,10 +51,20 @@ Datum to_tsquery_name(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(to_tsquery_current); Datum to_tsquery_current(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(plainto_tsquery); +Datum plainto_tsquery(PG_FUNCTION_ARGS); + +PG_FUNCTION_INFO_V1(plainto_tsquery_name); +Datum plainto_tsquery_name(PG_FUNCTION_ARGS); + +PG_FUNCTION_INFO_V1(plainto_tsquery_current); +Datum plainto_tsquery_current(PG_FUNCTION_ARGS); + /* parser's states */ #define WAITOPERAND 1 #define WAITOPERATOR 2 #define WAITFIRSTOPERAND 3 +#define WAITSINGLEOPERAND 4 /* * node of query tree, also used @@ -195,6 +205,14 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2 else if (*(state->buf) != ' ') return ERR; break; + case WAITSINGLEOPERAND: + if ( *(state->buf) == '\0' ) + return END; + *strval = state->buf; + 
*lenval = strlen( state->buf ); + state->buf += strlen( state->buf ); + state->count++; + return VAL; default: return ERR; break; @@ -582,7 +600,7 @@ findoprnd(ITEM * ptr, int4 *pos) * input */ static QUERYTYPE * - queryin(char *buf, void (*pushval) (QPRS_STATE *, int, char *, int, int2), int cfg_id) +queryin(char *buf, void (*pushval) (QPRS_STATE *, int, char *, int, int2), int cfg_id, bool isplain) { QPRS_STATE state; int4 i; @@ -599,7 +617,7 @@ static QUERYTYPE * /* init state */ state.buf = buf; - state.state = WAITFIRSTOPERAND; + state.state = (isplain) ? WAITSINGLEOPERAND : WAITFIRSTOPERAND; state.count = 0; state.num = 0; state.str = NULL; @@ -679,7 +697,7 @@ Datum tsquery_in(PG_FUNCTION_ARGS) { SET_FUNCOID(); - PG_RETURN_POINTER(queryin((char *) PG_GETARG_POINTER(0), pushval_asis, 0)); + PG_RETURN_POINTER(queryin((char *) PG_GETARG_POINTER(0), pushval_asis, 0, false)); } /* @@ -910,7 +928,7 @@ to_tsquery(PG_FUNCTION_ARGS) str = text2char(in); PG_FREE_IF_COPY(in, 1); - query = queryin(str, pushval_morph, PG_GETARG_INT32(0)); + query = queryin(str, pushval_morph, PG_GETARG_INT32(0),false); if ( query->size == 0 ) PG_RETURN_POINTER(query); @@ -950,3 +968,59 @@ to_tsquery_current(PG_FUNCTION_ARGS) Int32GetDatum(get_currcfg()), PG_GETARG_DATUM(0))); } + +Datum +plainto_tsquery(PG_FUNCTION_ARGS) +{ + text *in = PG_GETARG_TEXT_P(1); + char *str; + QUERYTYPE *query; + ITEM *res; + int4 len; + + SET_FUNCOID(); + + str = text2char(in); + PG_FREE_IF_COPY(in, 1); + + query = queryin(str, pushval_morph, PG_GETARG_INT32(0), true); + + if ( query->size == 0 ) + PG_RETURN_POINTER(query); + + res = clean_fakeval_v2(GETQUERY(query), &len); + if (!res) + { + query->len = HDRSIZEQT; + query->size = 0; + PG_RETURN_POINTER(query); + } + memcpy((void *) GETQUERY(query), (void *) res, len * sizeof(ITEM)); + pfree(res); + PG_RETURN_POINTER(query); +} + +Datum +plainto_tsquery_name(PG_FUNCTION_ARGS) +{ + text *name = PG_GETARG_TEXT_P(0); + Datum res; + + SET_FUNCOID(); + res = 
DirectFunctionCall2(plainto_tsquery, + Int32GetDatum(name2id_cfg(name)), + PG_GETARG_DATUM(1)); + + PG_FREE_IF_COPY(name, 0); + PG_RETURN_DATUM(res); +} + +Datum +plainto_tsquery_current(PG_FUNCTION_ARGS) +{ + SET_FUNCOID(); + PG_RETURN_DATUM(DirectFunctionCall2(plainto_tsquery, + Int32GetDatum(get_currcfg()), + PG_GETARG_DATUM(0))); +} + diff --git a/contrib/tsearch2/query_support.c b/contrib/tsearch2/query_support.c index c973def7d4..edc2d48fcf 100644 --- a/contrib/tsearch2/query_support.c +++ b/contrib/tsearch2/query_support.c @@ -14,6 +14,117 @@ tsquery_numnode(PG_FUNCTION_ARGS) { PG_RETURN_INT32(nnode); } +static QTNode* +join_tsqueries(QUERYTYPE *a, QUERYTYPE *b) { + QTNode *res=(QTNode*)palloc0( sizeof(QTNode) ); + + res->flags |= QTN_NEEDFREE; + + res->valnode = (ITEM*)palloc0( sizeof(ITEM) ); + res->valnode->type = OPR; + + res->child = (QTNode**)palloc0( sizeof(QTNode*)*2 ); + res->child[0] = QT2QTN( GETQUERY(b), GETOPERAND(b) ); + res->child[1] = QT2QTN( GETQUERY(a), GETOPERAND(a) ); + res->nchild = 2; + + return res; +} + +PG_FUNCTION_INFO_V1(tsquery_and); +Datum tsquery_and(PG_FUNCTION_ARGS); + +Datum +tsquery_and(PG_FUNCTION_ARGS) { + QUERYTYPE *a = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(0))); + QUERYTYPE *b = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1))); + QTNode *res; + QUERYTYPE *query; + + if ( a->size == 0 ) { + PG_FREE_IF_COPY(a,1); + PG_RETURN_POINTER(b); + } else if ( b->size == 0 ) { + PG_FREE_IF_COPY(b,1); + PG_RETURN_POINTER(a); + } + + res = join_tsqueries(a, b); + + res->valnode->val = '&'; + + query = QTN2QT( res, PlainMemory ); + + QTNFree(res); + PG_FREE_IF_COPY(a,0); + PG_FREE_IF_COPY(b,1); + + PG_RETURN_POINTER(query); +} + +PG_FUNCTION_INFO_V1(tsquery_or); +Datum tsquery_or(PG_FUNCTION_ARGS); + +Datum +tsquery_or(PG_FUNCTION_ARGS) { + QUERYTYPE *a = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(0))); + QUERYTYPE *b = (QUERYTYPE *) 
DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1))); + QTNode *res; + QUERYTYPE *query; + + if ( a->size == 0 ) { + PG_FREE_IF_COPY(a,1); + PG_RETURN_POINTER(b); + } else if ( b->size == 0 ) { + PG_FREE_IF_COPY(b,1); + PG_RETURN_POINTER(a); + } + + res = join_tsqueries(a, b); + + res->valnode->val = '|'; + + query = QTN2QT( res, PlainMemory ); + + QTNFree(res); + PG_FREE_IF_COPY(a,0); + PG_FREE_IF_COPY(b,1); + + PG_RETURN_POINTER(query); +} + +PG_FUNCTION_INFO_V1(tsquery_not); +Datum tsquery_not(PG_FUNCTION_ARGS); + +Datum +tsquery_not(PG_FUNCTION_ARGS) { + QUERYTYPE *a = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(0))); + QTNode *res; + QUERYTYPE *query; + + if ( a->size == 0 ) + PG_RETURN_POINTER(a); + + res=(QTNode*)palloc0( sizeof(QTNode) ); + + res->flags |= QTN_NEEDFREE; + + res->valnode = (ITEM*)palloc0( sizeof(ITEM) ); + res->valnode->type = OPR; + res->valnode->val = '!'; + + res->child = (QTNode**)palloc0( sizeof(QTNode*) ); + res->child[0] = QT2QTN( GETQUERY(a), GETOPERAND(a) ); + res->nchild = 1; + + query = QTN2QT( res, PlainMemory ); + + QTNFree(res); + PG_FREE_IF_COPY(a,0); + + PG_RETURN_POINTER(query); +} + static int CompareTSQ( QUERYTYPE *a, QUERYTYPE *b ) { if ( a->size != b->size ) { diff --git a/contrib/tsearch2/sql/tsearch2.sql b/contrib/tsearch2/sql/tsearch2.sql index 0923ce7a19..bd0baa3b41 100644 --- a/contrib/tsearch2/sql/tsearch2.sql +++ b/contrib/tsearch2/sql/tsearch2.sql @@ -173,6 +173,13 @@ select to_tsquery('default', 'asd&(and|fghj)'); select to_tsquery('default', '(asd&and)|fghj'); select to_tsquery('default', '(asd&!and)|fghj'); select to_tsquery('default', '(the|and&(i&1))&fghj'); + +select plainto_tsquery('default', 'the and z 1))& fghj'); +select plainto_tsquery('default', 'foo bar') && plainto_tsquery('default', 'asd'); +select plainto_tsquery('default', 'foo bar') || plainto_tsquery('default', 'asd fg'); +select plainto_tsquery('default', 'foo bar') || !!plainto_tsquery('default', 'asd fg'); 
+select plainto_tsquery('default', 'foo bar') && 'asd | fg'; + select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca'; select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca:B'; select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca:A'; diff --git a/contrib/tsearch2/ts_locale.c b/contrib/tsearch2/ts_locale.c new file mode 100644 index 0000000000..b84681f1b0 --- /dev/null +++ b/contrib/tsearch2/ts_locale.c @@ -0,0 +1,61 @@ +#include "ts_locale.h" + +#include "utils/builtins.h" +#include "utils/pg_locale.h" +#include "mb/pg_wchar.h" + + +#if defined(TS_USE_WIDE) && defined(WIN32) + +size_t +wchar2char( const char *to, const wchar_t *from, size_t len ) { + if (GetDatabaseEncoding() == PG_UTF8) { + int r, nbytes; + + if (len==0) + return 0; + + /* ask for the needed size first; assumes 'to' is big enough -- TODO confirm at callers */ + nbytes = WideCharToMultiByte(CP_UTF8, 0, from, len, NULL, 0, NULL, NULL); + r = WideCharToMultiByte(CP_UTF8, 0, from, len, to, nbytes, NULL, NULL); + + if ( r==0 ) + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + errmsg("UTF-16 to UTF-8 translation failed: %lu", + GetLastError()))); + + return r; + } + + return wcstombs(to, from, len); +} + +size_t +char2wchar( const wchar_t *to, const char *from, size_t len ) { + if (GetDatabaseEncoding() == PG_UTF8) { + int r; + + if (len==0) + return 0; + + r = MultiByteToWideChar(CP_UTF8, 0, from, len, + to, len); + + if (!r) { + pg_verifymbstr(from, len, false); + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + errmsg("invalid multibyte character for locale"), + errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding."))); + } + + Assert(r <= len); + + return r; + } + + return mbstowcs(to, from, len); +} + +#endif diff --git a/contrib/tsearch2/ts_locale.h b/contrib/tsearch2/ts_locale.h new file mode 100644 index 0000000000..a7ce6f1bbc --- /dev/null +++ b/contrib/tsearch2/ts_locale.h @@ -0,0 +1,38 @@ +#ifndef __TSLOCALE_H__ +#define __TSLOCALE_H__ + +#include "postgres.h" + +#include <ctype.h> +#include <limits.h> + +/* + * towlower() and friends should be in <wctype.h>, but some pre-C99 systems + * declare them in <wchar.h>. 
+ */ +#ifdef HAVE_WCHAR_H +#include <wchar.h> +#endif +#ifdef HAVE_WCTYPE_H +#include <wctype.h> +#endif + +#if defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER) +#define TS_USE_WIDE + +#ifdef WIN32 + +size_t wchar2char( const char *to, const wchar_t *from, size_t len ); +size_t char2wchar( const wchar_t *to, const char *from, size_t len ); + +#else /* WIN32 */ + +/* correct mbstowcs */ +#define char2wchar mbstowcs +#define wchar2char wcstombs + +#endif /* WIN32 */ + +#endif /* defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER) */ + +#endif /* __TSLOCALE_H__ */ diff --git a/contrib/tsearch2/tsearch.sql.in b/contrib/tsearch2/tsearch.sql.in index 9bdf641e12..4fdf974d0d 100644 --- a/contrib/tsearch2/tsearch.sql.in +++ b/contrib/tsearch2/tsearch.sql.in @@ -427,6 +427,21 @@ RETURNS tsquery AS 'MODULE_PATHNAME','to_tsquery_current' LANGUAGE 'c' with (isstrict,iscachable); +CREATE FUNCTION plainto_tsquery(oid, text) +RETURNS tsquery +AS 'MODULE_PATHNAME' +LANGUAGE 'c' with (isstrict,iscachable); + +CREATE FUNCTION plainto_tsquery(text, text) +RETURNS tsquery +AS 'MODULE_PATHNAME','plainto_tsquery_name' +LANGUAGE 'c' with (isstrict,iscachable); + +CREATE FUNCTION plainto_tsquery(text) +RETURNS tsquery +AS 'MODULE_PATHNAME','plainto_tsquery_current' +LANGUAGE 'c' with (isstrict,iscachable); + --operations CREATE FUNCTION exectsq(tsvector, tsquery) RETURNS bool @@ -929,6 +944,47 @@ CREATE OR REPLACE FUNCTION numnode(tsquery) language 'C' with (isstrict,iscachable); +CREATE OR REPLACE FUNCTION tsquery_and(tsquery,tsquery) + returns tsquery + as 'MODULE_PATHNAME', 'tsquery_and' + language 'C' + with (isstrict,iscachable); + +CREATE OPERATOR && ( + LEFTARG = tsquery, + RIGHTARG = tsquery, + PROCEDURE = tsquery_and, + COMMUTATOR = '&&', + RESTRICT = contsel, + JOIN = contjoinsel +); + +CREATE OR REPLACE FUNCTION tsquery_or(tsquery,tsquery) + returns tsquery + as 'MODULE_PATHNAME', 'tsquery_or' + language 'C' + with (isstrict,iscachable); + +CREATE OPERATOR || ( + LEFTARG = tsquery, + RIGHTARG = 
tsquery, + PROCEDURE = tsquery_or, + COMMUTATOR = '||', + RESTRICT = contsel, + JOIN = contjoinsel +); + +CREATE OR REPLACE FUNCTION tsquery_not(tsquery) + returns tsquery + as 'MODULE_PATHNAME', 'tsquery_not' + language 'C' + with (isstrict,iscachable); + +CREATE OPERATOR !! ( + RIGHTARG = tsquery, + PROCEDURE = tsquery_not +); + --------------rewrite subsystem CREATE OR REPLACE FUNCTION rewrite(tsquery, text) diff --git a/contrib/tsearch2/wordparser/Makefile b/contrib/tsearch2/wordparser/Makefile index 0070970e21..c4eceba60b 100644 --- a/contrib/tsearch2/wordparser/Makefile +++ b/contrib/tsearch2/wordparser/Makefile @@ -1,8 +1,8 @@ -# $PostgreSQL: pgsql/contrib/tsearch2/wordparser/Makefile,v 1.8 2005/10/18 01:30:49 tgl Exp $ +# $PostgreSQL: pgsql/contrib/tsearch2/wordparser/Makefile,v 1.9 2005/11/21 12:27:57 teodor Exp $ SUBOBJS = parser.o deflex.o -EXTRA_CLEAN = SUBSYS.o $(SUBOBJS) parser.c +EXTRA_CLEAN = SUBSYS.o $(SUBOBJS) PG_CPPFLAGS = -I$(srcdir)/.. @@ -20,13 +20,6 @@ override CFLAGS += $(CFLAGS_SL) all: SUBSYS.o -parser.c: parser.l -ifdef FLEX - $(FLEX) $(FLEXFLAGS) -8 -Ptsearch2_yy -o'$@' $< -else - @$(missing) flex $< $@ -endif - SUBSYS.o: $(SUBOBJS) $(LD) $(LDREL) $(LDOUT) $@ $^ diff --git a/contrib/tsearch2/wordparser/deflex.c b/contrib/tsearch2/wordparser/deflex.c index bbf3271b66..8f93d277a1 100644 --- a/contrib/tsearch2/wordparser/deflex.c +++ b/contrib/tsearch2/wordparser/deflex.c @@ -15,7 +15,7 @@ const char *lex_descr[] = { "Latin part of hyphenated word", "Space symbols", "HTML Tag", - "HTTP head", + "Protocol head", "Hyphenated word", "Latin hyphenated word", "Non-latin hyphenated word", @@ -42,7 +42,7 @@ const char *tok_alias[] = { "lpart_hword", "blank", "tag", - "http", + "protocol", "hword", "lhword", "nlhword", diff --git a/contrib/tsearch2/wordparser/deflex.h b/contrib/tsearch2/wordparser/deflex.h index 651d1f9e77..893f843051 100644 --- a/contrib/tsearch2/wordparser/deflex.h +++ b/contrib/tsearch2/wordparser/deflex.h @@ -17,7 +17,7 @@ 
#define LATPARTHYPHENWORD 11 #define SPACE 12 #define TAG 13 -#define HTTP 14 +#define PROTOCOL 14 #define HYPHENWORD 15 #define LATHYPHENWORD 16 #define CYRHYPHENWORD 17 diff --git a/contrib/tsearch2/wordparser/parser.c b/contrib/tsearch2/wordparser/parser.c new file mode 100644 index 0000000000..e414a865ff --- /dev/null +++ b/contrib/tsearch2/wordparser/parser.c @@ -0,0 +1,1028 @@ +#include "postgres.h" + +#include "utils/builtins.h" +#include "utils/pg_locale.h" +#include "mb/pg_wchar.h" + +#include "deflex.h" +#include "parser.h" +#include "ts_locale.h" + + +static TParserPosition* +newTParserPosition(TParserPosition *prev) { + TParserPosition *res = (TParserPosition*)palloc(sizeof(TParserPosition)); + + if ( prev ) + memcpy(res, prev, sizeof(TParserPosition)); + else + memset(res, 0, sizeof(TParserPosition)); + + res->prev = prev; + + res->pushedAtAction = NULL; + + return res; +} + +TParser* +TParserInit( char *str, int len ) { + TParser *prs = (TParser*)palloc0( sizeof(TParser) ); + + prs->charmaxlen = pg_database_encoding_max_length(); + prs->str = str; + prs->lenstr = len; + +#ifdef TS_USE_WIDE + /* + * Use wide char code only when max encoding length > 1 and ctype != C. + * Some operating systems fail with multi-byte encodings and a C locale. + * Also, for a C locale there is no need to process as multibyte. 
+ * From backend/utils/adt/oracle_compat.c Teodor + */ + + if ( prs->charmaxlen > 1 && !lc_ctype_is_c() ) { + prs->usewide=true; + prs->wstr = (wchar_t*)palloc( sizeof(wchar_t) * prs->lenstr ); + prs->lenwstr = char2wchar( prs->wstr, prs->str, prs->lenstr ); + } else +#endif + prs->usewide=false; + + prs->state = newTParserPosition(NULL); + prs->state->state = TPS_Base; + + return prs; +} + +void +TParserClose( TParser* prs ) { + while( prs->state ) { + TParserPosition *ptr = prs->state->prev; + pfree( prs->state ); + prs->state = ptr; + } + + if ( prs->wstr ) + pfree( prs->wstr ); + pfree( prs ); +} + +/* + * Define support functions equivalent to the is* macros, but + * working with any possible encoding and locale + */ + +#ifdef TS_USE_WIDE + +#define p_iswhat(type) \ +static int \ +p_is##type(TParser *prs) { \ + Assert( prs->state ); \ + return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \ + is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \ +} \ + \ +static int \ +p_isnot##type(TParser *prs) { \ + return !p_is##type(prs); \ +} + + + +/* p_iseq should be used only for ascii symbols */ + +static int +p_iseq(TParser *prs, char c) { + Assert( prs->state ); + return ( ( prs->state->charlen==1 && *( prs->str + prs->state->posbyte ) == c ) ) ? 1 : 0; +} + +#else /* TS_USE_WIDE */ + +#define p_iswhat(type) \ +static int \ +p_is##type(TParser *prs) { \ + Assert( prs->state ); \ + return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \ +} \ + \ +static int \ +p_isnot##type(TParser *prs) { \ + return !p_is##type(prs); \ +} + + +static int +p_iseq(TParser *prs, char c) { + Assert( prs->state ); + return ( ( *( prs->str + prs->state->posbyte ) == c ) ) ? 
1 : 0; +} + +#endif /* TS_USE_WIDE */ + +p_iswhat(alnum) +p_iswhat(alpha) +p_iswhat(digit) +p_iswhat(lower) +p_iswhat(print) +p_iswhat(punct) +p_iswhat(space) +p_iswhat(upper) +p_iswhat(xdigit) + +static int +p_isEOF(TParser *prs) { + Assert( prs->state ); + return (prs->state->posbyte == prs->lenstr || prs->state->charlen==0) ? 1 : 0; +} + +static int +p_iseqC(TParser *prs) { + return p_iseq(prs, prs->c); +} + +static int +p_isneC(TParser *prs) { + return !p_iseq(prs, prs->c); +} + +static int +p_isascii(TParser *prs) { + return ( prs->state->charlen==1 && isascii( (unsigned char) *( prs->str + prs->state->posbyte ) ) ) ? 1 : 0; +} + +static int +p_islatin(TParser *prs) { + return ( p_isalpha(prs) && p_isascii(prs) ) ? 1 : 0; +} + +static int +p_isnonlatin(TParser *prs) { + return ( p_isalpha(prs) && !p_isascii(prs) ) ? 1 : 0; +} + +void _make_compiler_happy(void); +void +_make_compiler_happy(void) { + p_isalnum(NULL); p_isnotalnum(NULL); + p_isalpha(NULL); p_isnotalpha(NULL); + p_isdigit(NULL); p_isnotdigit(NULL); + p_islower(NULL); p_isnotlower(NULL); + p_isprint(NULL); p_isnotprint(NULL); + p_ispunct(NULL); p_isnotpunct(NULL); + p_isspace(NULL); p_isnotspace(NULL); + p_isupper(NULL); p_isnotupper(NULL); + p_isxdigit(NULL); p_isnotxdigit(NULL); + p_isEOF(NULL); + p_iseqC(NULL); p_isneC(NULL); +} + + +static void +SpecialTags(TParser *prs) { + switch( prs->state->lencharlexeme ) { + case 8: /* lexeme, "ignore = false; + break; + case 7: /*