]> granicus.if.org Git - postgresql/blob - contrib/tsearch/morph.c
Thank you very much, you catch it :). This bug had a long life, because it
[postgresql] / contrib / tsearch / morph.c
/*
 * morphology module
 * New dictionaries are included via dict.h. For languages which
 * use the Latin charset it may be necessary to modify the mapdict table.
 * Teodor Sigaev <teodor@stack.net>
 */
7 #include "postgres.h"
8
9 #include <locale.h>
10
11 #include "utils/builtins.h"
12
13 #include "morph.h"
14 #include "deflex.h"
15
16 /*
17  * Struct for calling dictionaries
18  * All of this methods are optional, but
19  * if all methods are NULL, then dictionary does nothing :)
20  * Return value of lemmatize must be palloced or the same.
21  * Return value of init must be malloced in other case
22  * it will be free in end of transaction!
23  */
24 typedef struct
25 {
26         char            localename[NAMEDATALEN];
27         /* init dictionary */
28         void       *(*init) (void);
29         /* close dictionary */
30         void            (*close) (void *);
31         /* find in dictionary */
32         char       *(*lemmatize) (void *, char *, int *);
33         int                     (*is_stoplemm) (void *, char *, int);
34         int                     (*is_stemstoplemm) (void *, char *, int);
35 }       DICT;
36
37 /* insert all dictionaries */
38 #define DICT_BODY
39 #include "dict.h"
40 #undef  DICT_BODY
41
42 /* fill dictionary's structure */
43 #define DICT_TABLE
44 DICT            dicts[] = {
45         {
46                 "C", NULL, NULL, NULL, NULL, NULL               /* fake dictionary */
47         }
48 #include "dict.h"
49 };
50
51 #undef DICT_TABLE
52
53 /* array for storing dictionary's objects (if needed) */
54 void       *dictobjs[lengthof(dicts)];
55
56 #define STOPLEXEM       -2
57 #define BYLOCALE        -1
58 #define NODICT          0
59 #define DEFAULTDICT 1
60
61 #define MAXNDICT        2
62 typedef int2 MAPDICT[MAXNDICT];
63
64 #define GETDICT(x,i)    *( ((int2*)(x)) + (i) )
65
66 /* map dictionaries for lexem type */
67 static MAPDICT mapdict[] = {
68         {NODICT, NODICT},                       /* not used                     */
69         {DEFAULTDICT, NODICT},          /* LATWORD              */
70         {BYLOCALE, NODICT},                     /* NONLATINWORD         */
71         {BYLOCALE, DEFAULTDICT},        /* UWORD                */
72         {NODICT, NODICT},                       /* EMAIL                */
73         {NODICT, NODICT},                       /* FURL                 */
74         {NODICT, NODICT},                       /* HOST                 */
75         {NODICT, NODICT},                       /* SCIENTIFIC           */
76         {NODICT, NODICT},                       /* VERSIONNUMBER                */
77         {BYLOCALE, DEFAULTDICT},        /* PARTHYPHENWORD               */
78         {BYLOCALE, NODICT},                     /* CYRPARTHYPHENWORD */
79         {DEFAULTDICT, NODICT},          /* LATPARTHYPHENWORD            */
80         {STOPLEXEM, NODICT},            /* SPACE                */
81         {STOPLEXEM, NODICT},            /* TAG          */
82         {STOPLEXEM, NODICT},            /* HTTP                 */
83         {BYLOCALE, DEFAULTDICT},        /* HYPHENWORD           */
84         {DEFAULTDICT, NODICT},          /* LATHYPHENWORD                */
85         {BYLOCALE, NODICT},                     /* CYRHYPHENWORD        */
86         {NODICT, NODICT},                       /* URI                  */
87         {NODICT, NODICT},                       /* FILEPATH             */
88         {NODICT, NODICT},                       /* DECIMAL              */
89         {NODICT, NODICT},                       /* SIGNEDINT            */
90         {NODICT, NODICT},                       /* UNSIGNEDINT          */
91         {STOPLEXEM, NODICT}                     /* HTMLENTITY           */
92 };
93
94 static bool inited = false;
95
96 void
97 initmorph(void)
98 {
99         int                     i,
100                                 j,
101                                 k;
102         MAPDICT    *md;
103         bool            needinit[lengthof(dicts)];
104         const char *curlocale;
105         int                     bylocaledict = NODICT;
106
107         if (inited)
108                 return;
109         for (i = 1; i < lengthof(dicts); i++)
110                 needinit[i] = false;
111
112         curlocale = setlocale(LC_CTYPE, NULL);
113         if (curlocale)
114         {
115                 for (i = 1; i < lengthof(dicts); i++)
116                         if (strcmp(dicts[i].localename, curlocale) == 0)
117                         {
118                                 bylocaledict = i;
119                                 break;
120                         }
121         }
122
123         for (i = 1; i < lengthof(mapdict); i++)
124         {
125                 k = 0;
126                 md = &mapdict[i];
127                 for (j = 0; j < MAXNDICT; j++)
128                 {
129                         GETDICT(md, k) = GETDICT(md, j);
130                         if (GETDICT(md, k) == NODICT)
131                                 break;
132                         else if (GETDICT(md, k) == BYLOCALE)
133                         {
134                                 if (bylocaledict == NODICT)
135                                         continue;
136                                 GETDICT(md, k) = bylocaledict;
137                         }
138                         if (GETDICT(md, k) >= (int2) lengthof(dicts))
139                                 continue;
140                         needinit[GETDICT(md, k)] = true;
141                         k++;
142                 }
143                 for (; k < MAXNDICT; k++)
144                         if (GETDICT(md, k) != STOPLEXEM)
145                                 GETDICT(md, k) = NODICT;
146         }
147
148         for (i = 1; i < lengthof(dicts); i++)
149                 if (needinit[i] && dicts[i].init)
150                         dictobjs[i] = (*(dicts[i].init)) ();
151
152         inited = true;
153         return;
154 }
155
156 char *
157 lemmatize(char *word, int *len, int type)
158 {
159         int2            nd;
160         int                     i;
161         DICT       *dict;
162
163         for (i = 0; i < MAXNDICT; i++)
164         {
165                 nd = GETDICT(&mapdict[type], i);
166                 if (nd == NODICT)
167                 {
168                         /* there is no dictionary */
169                         return word;
170                 }
171                 else if (nd == STOPLEXEM)
172                 {
173                         /* word is stopword */
174                         return NULL;
175                 }
176                 else if (nd == BYLOCALE)
177                 {
178                         continue; /* no dict for current locale */
179                 }
180                 else
181                 {
182                         dict = &dicts[nd];
183                         if (dict->is_stoplemm && (*(dict->is_stoplemm)) (dictobjs[nd], word, *len))
184                                 return NULL;
185                         if (dict->lemmatize)
186                         {
187                                 int                     oldlen = *len;
188                                 char       *newword = (*(dict->lemmatize)) (dictobjs[nd], word, len);
189
190                                 /* word is recognized by distionary */
191                                 if (newword != word || *len != oldlen)
192                                 {
193                                         if (dict->is_stemstoplemm &&
194                                         (*(dict->is_stemstoplemm)) (dictobjs[nd], word, *len))
195                                         {
196                                                 if (newword != word && newword)
197                                                         pfree(newword);
198                                                 return NULL;
199                                         }
200                                         return newword;
201                                 }
202                         }
203                 }
204         }
205
206         return word;
207 }
208
209 bool
210 is_stoptype(int type)
211 {
212         return (GETDICT(&mapdict[type], 0) == STOPLEXEM) ? true : false;
213 }