]> granicus.if.org Git - postgresql/blob - src/backend/snowball/dict_snowball.c
Adjust blank lines around PG_MODULE_MAGIC defines, for consistency
[postgresql] / src / backend / snowball / dict_snowball.c
1 /*-------------------------------------------------------------------------
2  *
3  * dict_snowball.c
4  *              Snowball dictionary
5  *
6  * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  *        src/backend/snowball/dict_snowball.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres.h"
14
15 #include "commands/defrem.h"
16 #include "tsearch/ts_locale.h"
17 #include "tsearch/ts_utils.h"
18
19 /* Some platforms define MAXINT and/or MININT, causing conflicts */
20 #ifdef MAXINT
21 #undef MAXINT
22 #endif
23 #ifdef MININT
24 #undef MININT
25 #endif
26
27 /* Now we can include the original Snowball header.h */
28 #include "snowball/libstemmer/header.h"
29 #include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
30 #include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
31 #include "snowball/libstemmer/stem_ISO_8859_1_english.h"
32 #include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
33 #include "snowball/libstemmer/stem_ISO_8859_1_french.h"
34 #include "snowball/libstemmer/stem_ISO_8859_1_german.h"
35 #include "snowball/libstemmer/stem_ISO_8859_1_hungarian.h"
36 #include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
37 #include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
38 #include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
39 #include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
40 #include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
41 #include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
42 #include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
43 #include "snowball/libstemmer/stem_KOI8_R_russian.h"
44 #include "snowball/libstemmer/stem_UTF_8_danish.h"
45 #include "snowball/libstemmer/stem_UTF_8_dutch.h"
46 #include "snowball/libstemmer/stem_UTF_8_english.h"
47 #include "snowball/libstemmer/stem_UTF_8_finnish.h"
48 #include "snowball/libstemmer/stem_UTF_8_french.h"
49 #include "snowball/libstemmer/stem_UTF_8_german.h"
50 #include "snowball/libstemmer/stem_UTF_8_hungarian.h"
51 #include "snowball/libstemmer/stem_UTF_8_italian.h"
52 #include "snowball/libstemmer/stem_UTF_8_norwegian.h"
53 #include "snowball/libstemmer/stem_UTF_8_porter.h"
54 #include "snowball/libstemmer/stem_UTF_8_portuguese.h"
55 #include "snowball/libstemmer/stem_UTF_8_romanian.h"
56 #include "snowball/libstemmer/stem_UTF_8_russian.h"
57 #include "snowball/libstemmer/stem_UTF_8_spanish.h"
58 #include "snowball/libstemmer/stem_UTF_8_swedish.h"
59 #include "snowball/libstemmer/stem_UTF_8_turkish.h"
60
61 PG_MODULE_MAGIC;
62
63 PG_FUNCTION_INFO_V1(dsnowball_init);
64 Datum           dsnowball_init(PG_FUNCTION_ARGS);
65
66 PG_FUNCTION_INFO_V1(dsnowball_lexize);
67 Datum           dsnowball_lexize(PG_FUNCTION_ARGS);
68
69 /* List of supported modules */
70 typedef struct stemmer_module
71 {
72         const char *name;
73         pg_enc          enc;
74         struct SN_env *(*create) (void);
75         void            (*close) (struct SN_env *);
76         int                     (*stem) (struct SN_env *);
77 } stemmer_module;
78
79 static const stemmer_module stemmer_modules[] =
80 {
81         /*
82          * Stemmers list from Snowball distribution
83          */
84         {"danish", PG_LATIN1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
85         {"dutch", PG_LATIN1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
86         {"english", PG_LATIN1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
87         {"finnish", PG_LATIN1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
88         {"french", PG_LATIN1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
89         {"german", PG_LATIN1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
90         {"hungarian", PG_LATIN1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
91         {"italian", PG_LATIN1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
92         {"norwegian", PG_LATIN1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
93         {"porter", PG_LATIN1, porter_ISO_8859_1_create_env, porter_ISO_8859_1_close_env, porter_ISO_8859_1_stem},
94         {"portuguese", PG_LATIN1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
95         {"spanish", PG_LATIN1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
96         {"swedish", PG_LATIN1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
97         {"romanian", PG_LATIN2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
98         {"russian", PG_KOI8R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
99         {"danish", PG_UTF8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
100         {"dutch", PG_UTF8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
101         {"english", PG_UTF8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
102         {"finnish", PG_UTF8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
103         {"french", PG_UTF8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
104         {"german", PG_UTF8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
105         {"hungarian", PG_UTF8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
106         {"italian", PG_UTF8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
107         {"norwegian", PG_UTF8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
108         {"porter", PG_UTF8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
109         {"portuguese", PG_UTF8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
110         {"romanian", PG_UTF8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
111         {"russian", PG_UTF8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
112         {"spanish", PG_UTF8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
113         {"swedish", PG_UTF8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
114         {"turkish", PG_UTF8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
115
116         /*
117          * Stemmer with PG_SQL_ASCII encoding should be valid for any server
118          * encoding
119          */
120         {"english", PG_SQL_ASCII, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
121
122         {NULL, 0, NULL, NULL, NULL} /* list end marker */
123 };
124
125
126 typedef struct DictSnowball
127 {
128         struct SN_env *z;
129         StopList        stoplist;
130         bool            needrecode;             /* needs recoding before/after call stem */
131         int                     (*stem) (struct SN_env * z);
132
133         /*
134          * snowball saves alloced memory between calls, so we should run it in our
135          * private memory context. Note, init function is executed in long lived
136          * context, so we just remember CurrentMemoryContext
137          */
138         MemoryContext dictCtx;
139 } DictSnowball;
140
141
142 static void
143 locate_stem_module(DictSnowball *d, char *lang)
144 {
145         const stemmer_module *m;
146
147         /*
148          * First, try to find exact match of stemmer module. Stemmer with
149          * PG_SQL_ASCII encoding is treated as working with any server encoding
150          */
151         for (m = stemmer_modules; m->name; m++)
152         {
153                 if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
154                         pg_strcasecmp(m->name, lang) == 0)
155                 {
156                         d->stem = m->stem;
157                         d->z = m->create();
158                         d->needrecode = false;
159                         return;
160                 }
161         }
162
163         /*
164          * Second, try to find stemmer for needed language for UTF8 encoding.
165          */
166         for (m = stemmer_modules; m->name; m++)
167         {
168                 if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
169                 {
170                         d->stem = m->stem;
171                         d->z = m->create();
172                         d->needrecode = true;
173                         return;
174                 }
175         }
176
177         ereport(ERROR,
178                         (errcode(ERRCODE_UNDEFINED_OBJECT),
179                          errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
180                                         lang, GetDatabaseEncodingName())));
181 }
182
183 Datum
184 dsnowball_init(PG_FUNCTION_ARGS)
185 {
186         List       *dictoptions = (List *) PG_GETARG_POINTER(0);
187         DictSnowball *d;
188         bool            stoploaded = false;
189         ListCell   *l;
190
191         d = (DictSnowball *) palloc0(sizeof(DictSnowball));
192
193         foreach(l, dictoptions)
194         {
195                 DefElem    *defel = (DefElem *) lfirst(l);
196
197                 if (pg_strcasecmp("StopWords", defel->defname) == 0)
198                 {
199                         if (stoploaded)
200                                 ereport(ERROR,
201                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
202                                                  errmsg("multiple StopWords parameters")));
203                         readstoplist(defGetString(defel), &d->stoplist, lowerstr);
204                         stoploaded = true;
205                 }
206                 else if (pg_strcasecmp("Language", defel->defname) == 0)
207                 {
208                         if (d->stem)
209                                 ereport(ERROR,
210                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
211                                                  errmsg("multiple Language parameters")));
212                         locate_stem_module(d, defGetString(defel));
213                 }
214                 else
215                 {
216                         ereport(ERROR,
217                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
218                                          errmsg("unrecognized Snowball parameter: \"%s\"",
219                                                         defel->defname)));
220                 }
221         }
222
223         if (!d->stem)
224                 ereport(ERROR,
225                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
226                                  errmsg("missing Language parameter")));
227
228         d->dictCtx = CurrentMemoryContext;
229
230         PG_RETURN_POINTER(d);
231 }
232
233 Datum
234 dsnowball_lexize(PG_FUNCTION_ARGS)
235 {
236         DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
237         char       *in = (char *) PG_GETARG_POINTER(1);
238         int32           len = PG_GETARG_INT32(2);
239         char       *txt = lowerstr_with_len(in, len);
240         TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);
241
242         if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
243         {
244                 pfree(txt);
245         }
246         else
247         {
248                 MemoryContext saveCtx;
249
250                 /*
251                  * recode to utf8 if stemmer is utf8 and doesn't match server encoding
252                  */
253                 if (d->needrecode)
254                 {
255                         char       *recoded;
256
257                         recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
258                         if (recoded != txt)
259                         {
260                                 pfree(txt);
261                                 txt = recoded;
262                         }
263                 }
264
265                 /* see comment about d->dictCtx */
266                 saveCtx = MemoryContextSwitchTo(d->dictCtx);
267                 SN_set_current(d->z, strlen(txt), (symbol *) txt);
268                 d->stem(d->z);
269                 MemoryContextSwitchTo(saveCtx);
270
271                 if (d->z->p && d->z->l)
272                 {
273                         txt = repalloc(txt, d->z->l + 1);
274                         memcpy(txt, d->z->p, d->z->l);
275                         txt[d->z->l] = '\0';
276                 }
277
278                 /* back recode if needed */
279                 if (d->needrecode)
280                 {
281                         char       *recoded;
282
283                         recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
284                         if (recoded != txt)
285                         {
286                                 pfree(txt);
287                                 txt = recoded;
288                         }
289                 }
290
291                 res->lexeme = txt;
292         }
293
294         PG_RETURN_POINTER(res);
295 }