3 * New dictionaries are included in dict.h. For languages which
4 * use the Latin charset it may be necessary to modify the mapdict table.
5 * Teodor Sigaev <teodor@stack.net>
11 #include "utils/builtins.h"
17 * Struct for calling dictionaries
18 * All of these methods are optional, but
19 * if all methods are NULL, then the dictionary does nothing :)
20 * The return value of lemmatize must be palloc'ed or be the same pointer as the input word.
21 * The return value of init must be malloc'ed; otherwise
22 * it will be freed at the end of the transaction!
26 char localename[NAMEDATALEN];
29 /* close dictionary */
30 void (*close) (void *);
31 /* find in dictionary */
32 char *(*lemmatize) (void *, char *, int *);
33 int (*is_stoplemm) (void *, char *, int);
34 int (*is_stemstoplemm) (void *, char *, int);
37 /* insert all dictionaries */
42 /* fill dictionary's structure */
46 "C", NULL, NULL, NULL, NULL, NULL /* fake dictionary */
53 /*
 * Array for storing the dictionaries' objects (if needed), one slot per
 * dicts[] entry.  Slot i receives the value returned by dicts[i].init()
 * and is handed back as the first argument of that dictionary's
 * is_stoplemm / lemmatize / is_stemstoplemm callbacks.
 */
54 void *dictobjs[lengthof(dicts)];
/*
 * One row of the mapping table: MAXNDICT dictionary slots (int2 each)
 * belonging to a single lexeme type.
 */
62 typedef int2 MAPDICT[MAXNDICT];
/*
 * GETDICT(x, i) - the i-th int2 slot of the row that x points to.
 *
 * x is cast to int2 *, so a pointer to a MAPDICT row and a plain int2 *
 * are both accepted.  The result is an lvalue: it is used for reading
 * and as the target of assignments.
 *
 * The whole expansion is parenthesized so that it remains a single
 * operand when placed next to higher-precedence operators (postfix ++,
 * [], ...); without the outer parentheses those operators would bind to
 * the inner pointer expression instead of the dereferenced slot.
 */
#define GETDICT(x,i) (*( ((int2*)(x)) + (i) ))
66 /*
 * Map dictionaries to lexeme types.
 *
 * The row index is the lexeme type (the trailing comment on each row names
 * it); row 0 is not used.  Each row holds MAXNDICT slots that lemmatize()
 * consults in order.  A slot is either a dictionary identifier or one of
 * the special markers NODICT, BYLOCALE, DEFAULTDICT and STOPLEXEM (defined
 * elsewhere).  During initialization the rows are rewritten in place:
 * BYLOCALE slots are replaced by the dictionary selected for the current
 * locale.  is_stoptype() looks only at the first slot of a row.
 */
67 static MAPDICT mapdict[] = {
68 {NODICT, NODICT}, /* not used */
69 {DEFAULTDICT, NODICT}, /* LATWORD */
70 {BYLOCALE, NODICT}, /* NONLATINWORD */
71 {BYLOCALE, DEFAULTDICT}, /* UWORD */
72 {NODICT, NODICT}, /* EMAIL */
73 {NODICT, NODICT}, /* FURL */
74 {NODICT, NODICT}, /* HOST */
75 {NODICT, NODICT}, /* SCIENTIFIC */
76 {NODICT, NODICT}, /* VERSIONNUMBER */
77 {BYLOCALE, DEFAULTDICT}, /* PARTHYPHENWORD */
78 {BYLOCALE, NODICT}, /* CYRPARTHYPHENWORD */
79 {DEFAULTDICT, NODICT}, /* LATPARTHYPHENWORD */
80 {STOPLEXEM, NODICT}, /* SPACE */
81 {STOPLEXEM, NODICT}, /* TAG */
82 {STOPLEXEM, NODICT}, /* HTTP */
83 {BYLOCALE, DEFAULTDICT}, /* HYPHENWORD */
84 {DEFAULTDICT, NODICT}, /* LATHYPHENWORD */
85 {BYLOCALE, NODICT}, /* CYRHYPHENWORD */
86 {NODICT, NODICT}, /* URI */
87 {NODICT, NODICT}, /* FILEPATH */
88 {NODICT, NODICT}, /* DECIMAL */
89 {NODICT, NODICT}, /* SIGNEDINT */
90 {NODICT, NODICT}, /* UNSIGNEDINT */
91 {STOPLEXEM, NODICT} /* HTMLENTITY */
/*
 * NOTE(review): presumably a run-once guard for dictionary initialization;
 * the code that tests and sets it is not visible in this excerpt -- confirm.
 */
94 static bool inited = false;
103 bool needinit[lengthof(dicts)];
104 const char *curlocale;
105 int bylocaledict = NODICT;
109 for (i = 1; i < lengthof(dicts); i++)
112 curlocale = setlocale(LC_CTYPE, NULL);
115 for (i = 1; i < lengthof(dicts); i++)
116 if (strcmp(dicts[i].localename, curlocale) == 0)
123 for (i = 1; i < lengthof(mapdict); i++)
127 for (j = 0; j < MAXNDICT; j++)
129 GETDICT(md, k) = GETDICT(md, j);
130 if (GETDICT(md, k) == NODICT)
132 else if (GETDICT(md, k) == BYLOCALE)
134 if (bylocaledict == NODICT)
136 GETDICT(md, k) = bylocaledict;
138 if (GETDICT(md, k) >= (int2) lengthof(dicts))
140 needinit[GETDICT(md, k)] = true;
143 for (; k < MAXNDICT; k++)
144 if (GETDICT(md, k) != STOPLEXEM)
145 GETDICT(md, k) = NODICT;
148 for (i = 1; i < lengthof(dicts); i++)
149 if (needinit[i] && dicts[i].init)
150 dictobjs[i] = (*(dicts[i].init)) ();
/*
 * lemmatize - run a word through the dictionaries mapped to its lexeme type.
 *
 *	word	the lexeme text
 *	len		in/out: length of the word; passed by pointer to the
 *			dictionary's lemmatize callback, which may change it
 *	type	lexeme type, used as the row index into mapdict
 *
 * The slots of mapdict[type] are visited in order.  A BYLOCALE slot is
 * skipped.  For a real dictionary slot, the optional is_stoplemm callback
 * is consulted first, then the lemmatize callback; the word counts as
 * recognized when the returned pointer differs from the input or the
 * length changed, after which the optional is_stemstoplemm callback is
 * consulted.
 *
 * NOTE(review): the return statements are not visible in this excerpt, so
 * the exact result for the "no dictionary" and "stop word" cases should be
 * confirmed against the full source.
 */
157 lemmatize(char *word, int *len, int type)
163 for (i = 0; i < MAXNDICT; i++)
165 nd = GETDICT(&mapdict[type], i);
168 /* there is no dictionary */
171 else if (nd == STOPLEXEM)
173 /* word is stopword */
176 else if (nd == BYLOCALE)
178 continue; /* no dict for current locale */
183 if (dict->is_stoplemm && (*(dict->is_stoplemm)) (dictobjs[nd], word, *len))
188 char *newword = (*(dict->lemmatize)) (dictobjs[nd], word, len);
190 /* word is recognized by dictionary */
191 if (newword != word || *len != oldlen)
/*
 * NOTE(review): is_stemstoplemm is handed the original `word` pointer
 * together with the post-lemmatize length -- confirm that `newword`
 * is not what was intended here.
 */
193 if (dict->is_stemstoplemm &&
194 (*(dict->is_stemstoplemm)) (dictobjs[nd], word, *len))
/* only a buffer distinct from the input, and non-NULL, is handled here */
196 if (newword != word && newword)
/*
 * is_stoptype - report whether a lexeme type is a stop type.
 *
 * Returns true when the first mapdict slot for `type` is STOPLEXEM,
 * false otherwise.  `type` indexes mapdict directly; no range check is
 * visible here.
 */
210 is_stoptype(int type)
212 return (GETDICT(&mapdict[type], 0) == STOPLEXEM) ? true : false;