1 /*-------------------------------------------------------------------------
4 * Normalizing word with ISpell
6 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
10 * $PostgreSQL: pgsql/src/backend/tsearch/spell.c,v 1.10 2008/01/16 13:01:03 teodor Exp $
12 *-------------------------------------------------------------------------
17 #include "storage/fd.h"
18 #include "tsearch/dicts/spell.h"
19 #include "tsearch/ts_locale.h"
20 #include "utils/memutils.h"
24 * Initialization requires a lot of memory that's not needed
25 * after the initialization is done. In init function,
26 * CurrentMemoryContext is a long lived memory context associated
27 * with the dictionary cache entry, so we use a temporary context
28 * for the short-lived stuff.
30 static MemoryContext tmpCtx = NULL;
32 #define tmpalloc(sz) MemoryContextAlloc(tmpCtx, (sz))
33 #define tmpalloc0(sz) MemoryContextAllocZero(tmpCtx, (sz))
39 * XXX: This assumes that CurrentMemoryContext doesn't have any children
40 * other than the one we create here.
42 if (CurrentMemoryContext->firstchild == NULL)
44 tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
45 "Ispell dictionary init context",
46 ALLOCSET_DEFAULT_MINSIZE,
47 ALLOCSET_DEFAULT_INITSIZE,
48 ALLOCSET_DEFAULT_MAXSIZE);
51 tmpCtx = CurrentMemoryContext->firstchild;
55 lowerstr_ctx(char *src)
57 MemoryContext saveCtx;
60 saveCtx = MemoryContextSwitchTo(tmpCtx);
62 MemoryContextSwitchTo(saveCtx);
68 #define MAXNORMLEN 256
70 #define STRNCMP(s,p) strncmp( (s), (p), strlen(p) )
71 #define GETWCHAR(W,L,N,T) ( ((uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] )
72 #define GETCHAR(A,N,T) GETWCHAR( (A)->repl, (A)->replen, N, T )
74 static char *VoidString = "";
77 cmpspell(const void *s1, const void *s2)
79 return (strcmp((*(const SPELL **) s1)->word, (*(const SPELL **) s2)->word));
82 cmpspellaffix(const void *s1, const void *s2)
84 return (strncmp((*(const SPELL **) s1)->p.flag, (*(const SPELL **) s2)->p.flag, MAXFLAGLEN));
88 findchar(char *str, int c)
101 /* backward string compare for suffix tree operations */
103 strbcmp(const unsigned char *s1, const unsigned char *s2)
105 int l1 = strlen((const char *) s1) - 1,
106 l2 = strlen((const char *) s2) - 1;
108 while (l1 >= 0 && l2 >= 0)
125 strbncmp(const unsigned char *s1, const unsigned char *s2, size_t count)
127 int l1 = strlen((const char *) s1) - 1,
128 l2 = strlen((const char *) s2) - 1,
131 while (l1 >= 0 && l2 >= 0 && l > 0)
151 cmpaffix(const void *s1, const void *s2)
153 const AFFIX *a1 = (const AFFIX *) s1;
154 const AFFIX *a2 = (const AFFIX *) s2;
156 if (a1->type < a2->type)
158 if (a1->type > a2->type)
160 if (a1->type == FF_PREFIX)
161 return strcmp(a1->repl, a2->repl);
163 return strbcmp((const unsigned char *) a1->repl,
164 (const unsigned char *) a2->repl);
168 NIAddSpell(IspellDict *Conf, const char *word, const char *flag)
170 if (Conf->nspell >= Conf->mspell)
174 Conf->mspell += 1024 * 20;
175 Conf->Spell = (SPELL **) repalloc(Conf->Spell, Conf->mspell * sizeof(SPELL *));
179 Conf->mspell = 1024 * 20;
180 Conf->Spell = (SPELL **) tmpalloc(Conf->mspell * sizeof(SPELL *));
183 Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
184 strcpy(Conf->Spell[Conf->nspell]->word, word);
185 strncpy(Conf->Spell[Conf->nspell]->p.flag, flag, MAXFLAGLEN);
192 * Note caller must already have applied get_tsearch_config_filename
195 NIImportDictionary(IspellDict *Conf, const char *filename)
202 if (!(dict = AllocateFile(filename, "r")))
204 (errcode(ERRCODE_CONFIG_FILE_ERROR),
205 errmsg("could not open dictionary file \"%s\": %m",
208 while ((line = t_readline(dict)) != NULL)
214 /* Extract flag from the line */
216 if ((s = findchar(line, '/')))
222 /* we allow only single-byte encoded flags, for faster processing */
223 if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s))
235 /* Remove trailing spaces */
246 pstr = lowerstr_ctx(line);
248 NIAddSpell(Conf, pstr, flag);
258 FindWord(IspellDict *Conf, const char *word, int affixflag, int flag)
260 SPNode *node = Conf->Dictionary;
264 uint8 *ptr = (uint8 *) word;
266 flag &= FF_DICTFLAGMASK;
270 StopLow = node->data;
271 StopHigh = node->data + node->length;
272 while (StopLow < StopHigh)
274 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
275 if (StopMiddle->val == *ptr)
277 if (*(ptr + 1) == '\0' && StopMiddle->isword)
281 if (StopMiddle->compoundflag & FF_COMPOUNDONLY)
284 else if ((flag & StopMiddle->compoundflag) == 0)
287 if ((affixflag == 0) || (strchr(Conf->AffixData[StopMiddle->affix], affixflag) != NULL))
290 node = StopMiddle->node;
294 else if (StopMiddle->val < *ptr)
295 StopLow = StopMiddle + 1;
297 StopHigh = StopMiddle;
299 if (StopLow >= StopHigh)
306 NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const char *find, const char *repl, int type)
310 if (Conf->naffixes >= Conf->maffixes)
314 Conf->maffixes += 16;
315 Conf->Affix = (AFFIX *) repalloc((void *) Conf->Affix, Conf->maffixes * sizeof(AFFIX));
320 Conf->Affix = (AFFIX *) palloc(Conf->maffixes * sizeof(AFFIX));
324 Affix = Conf->Affix + Conf->naffixes;
326 if (strcmp(mask, ".") == 0)
331 else if (RS_isRegis(mask))
335 RS_compile(&(Affix->reg.regis), (type == FF_SUFFIX) ? true : false,
336 (char *) ((mask && *mask) ? mask : VoidString));
348 tmask = (char *) tmpalloc(strlen(mask) + 3);
349 if (type == FF_SUFFIX)
350 sprintf(tmask, "%s$", mask);
352 sprintf(tmask, "^%s", mask);
354 masklen = strlen(tmask);
355 wmask = (pg_wchar *) tmpalloc((masklen + 1) * sizeof(pg_wchar));
356 wmasklen = pg_mb2wchar_with_len(tmask, wmask, masklen);
358 err = pg_regcomp(&(Affix->reg.regex), wmask, wmasklen, REG_ADVANCED | REG_NOSUB);
363 pg_regerror(err, &(Affix->reg.regex), errstr, sizeof(errstr));
365 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
366 errmsg("invalid regular expression: %s", errstr)));
370 Affix->flagflags = flagflags;
371 if ((Affix->flagflags & FF_COMPOUNDONLY) || (Affix->flagflags & FF_COMPOUNDPERMITFLAG))
373 if ((Affix->flagflags & FF_COMPOUNDFLAG) == 0)
374 Affix->flagflags |= FF_COMPOUNDFLAG;
379 Affix->find = (find && *find) ? pstrdup(find) : VoidString;
380 if ((Affix->replen = strlen(repl)) > 0)
381 Affix->repl = pstrdup(repl);
383 Affix->repl = VoidString;
387 #define PAE_WAIT_MASK 0
389 #define PAE_WAIT_FIND 2
391 #define PAE_WAIT_REPL 4
395 parse_affentry(char *str, char *mask, char *find, char *repl,
396 const char *filename, int lineno)
398 int state = PAE_WAIT_MASK;
403 *mask = *find = *repl = '\0';
407 if (state == PAE_WAIT_MASK)
409 if (t_iseq(str, '#'))
411 else if (!t_isspace(str))
413 COPYCHAR(pmask, str);
414 pmask += pg_mblen(str);
418 else if (state == PAE_INMASK)
420 if (t_iseq(str, '>'))
423 state = PAE_WAIT_FIND;
425 else if (!t_isspace(str))
427 COPYCHAR(pmask, str);
428 pmask += pg_mblen(str);
431 else if (state == PAE_WAIT_FIND)
433 if (t_iseq(str, '-'))
437 else if (t_isalpha(str) || t_iseq(str, '\'') /* english 's */ )
439 COPYCHAR(prepl, str);
440 prepl += pg_mblen(str);
443 else if (!t_isspace(str))
445 (errcode(ERRCODE_CONFIG_FILE_ERROR),
446 errmsg("syntax error at line %d of affix file \"%s\"",
449 else if (state == PAE_INFIND)
451 if (t_iseq(str, ','))
454 state = PAE_WAIT_REPL;
456 else if (t_isalpha(str))
458 COPYCHAR(pfind, str);
459 pfind += pg_mblen(str);
461 else if (!t_isspace(str))
463 (errcode(ERRCODE_CONFIG_FILE_ERROR),
464 errmsg("syntax error at line %d of affix file \"%s\"",
467 else if (state == PAE_WAIT_REPL)
469 if (t_iseq(str, '-'))
471 break; /* void repl */
473 else if (t_isalpha(str))
475 COPYCHAR(prepl, str);
476 prepl += pg_mblen(str);
479 else if (!t_isspace(str))
481 (errcode(ERRCODE_CONFIG_FILE_ERROR),
482 errmsg("syntax error at line %d of affix file \"%s\"",
485 else if (state == PAE_INREPL)
487 if (t_iseq(str, '#'))
492 else if (t_isalpha(str))
494 COPYCHAR(prepl, str);
495 prepl += pg_mblen(str);
497 else if (!t_isspace(str))
499 (errcode(ERRCODE_CONFIG_FILE_ERROR),
500 errmsg("syntax error at line %d of affix file \"%s\"",
504 elog(ERROR, "unrecognized state in parse_affentry: %d", state);
506 str += pg_mblen(str);
509 *pmask = *pfind = *prepl = '\0';
511 return (*mask && (*find || *repl)) ? true : false;
515 addFlagValue(IspellDict *Conf, char *s, uint32 val,
516 const char *filename, int lineno)
518 while (*s && t_isspace(s))
523 (errcode(ERRCODE_CONFIG_FILE_ERROR),
524 errmsg("syntax error at line %d of affix file \"%s\"",
527 if (pg_mblen(s) != 1)
529 (errcode(ERRCODE_CONFIG_FILE_ERROR),
530 errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
533 Conf->flagval[(unsigned int) *s] = (unsigned char) val;
534 Conf->usecompound = true;
538 NIImportOOAffixes(IspellDict *Conf, const char *filename)
549 bool isSuffix = false;
555 char scanbuf[BUFSIZ];
560 /* read file to find any flag */
561 memset(Conf->flagval, 0, sizeof(Conf->flagval));
562 Conf->usecompound = false;
564 if (!(affix = AllocateFile(filename, "r")))
566 (errcode(ERRCODE_CONFIG_FILE_ERROR),
567 errmsg("could not open affix file \"%s\": %m",
570 while ((recoded = t_readline(affix)) != NULL)
574 if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
580 if (STRNCMP(recoded, "COMPOUNDFLAG") == 0)
581 addFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"),
582 FF_COMPOUNDFLAG, filename, lineno);
583 else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0)
584 addFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"),
585 FF_COMPOUNDBEGIN, filename, lineno);
586 else if (STRNCMP(recoded, "COMPOUNDLAST") == 0)
587 addFlagValue(Conf, recoded + strlen("COMPOUNDLAST"),
588 FF_COMPOUNDLAST, filename, lineno);
589 /* COMPOUNDLAST and COMPOUNDEND are synonyms */
590 else if (STRNCMP(recoded, "COMPOUNDEND") == 0)
591 addFlagValue(Conf, recoded + strlen("COMPOUNDEND"),
592 FF_COMPOUNDLAST, filename, lineno);
593 else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0)
594 addFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"),
595 FF_COMPOUNDMIDDLE, filename, lineno);
596 else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0)
597 addFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"),
598 FF_COMPOUNDONLY, filename, lineno);
599 else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0)
600 addFlagValue(Conf, recoded + strlen("COMPOUNDPERMITFLAG"),
601 FF_COMPOUNDPERMITFLAG, filename, lineno);
602 else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0)
603 addFlagValue(Conf, recoded + strlen("COMPOUNDFORBIDFLAG"),
604 FF_COMPOUNDFORBIDFLAG, filename, lineno);
605 else if (STRNCMP(recoded, "FLAG") == 0)
607 char *s = recoded + strlen("FLAG");
609 while (*s && t_isspace(s))
612 if (*s && STRNCMP(s, "default") != 0)
614 (errcode(ERRCODE_CONFIG_FILE_ERROR),
615 errmsg("Ispell dictionary supports only default flag value at line %d of affix file \"%s\"",
624 sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5);
626 if (!(affix = AllocateFile(filename, "r")))
628 (errcode(ERRCODE_CONFIG_FILE_ERROR),
629 errmsg("could not open affix file \"%s\": %m",
632 while ((recoded = t_readline(affix)) != NULL)
635 if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
638 scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask);
642 ptype = lowerstr_ctx(type);
643 if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
648 if (strlen(sflag) != 1)
651 isSuffix = (STRNCMP(ptype, "sfx") == 0) ? true : false;
652 if (t_iseq(find, 'y') || t_iseq(find, 'Y'))
653 flagflags = FF_CROSSPRODUCT;
662 if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
664 prepl = lowerstr_ctx(repl);
666 if ((ptr = strchr(prepl, '/')) != NULL)
669 ptr = repl + (ptr - prepl) + 1;
672 aflg |= Conf->flagval[(unsigned int) *ptr];
676 pfind = lowerstr_ctx(find);
677 pmask = lowerstr_ctx(mask);
678 if (t_iseq(find, '0'))
680 if (t_iseq(repl, '0'))
683 NIAddAffix(Conf, flag, flagflags | aflg, pmask, pfind, prepl,
684 isSuffix ? FF_SUFFIX : FF_PREFIX);
702 * Note caller must already have applied get_tsearch_config_filename
705 NIImportAffixes(IspellDict *Conf, const char *filename)
712 bool suffixes = false;
713 bool prefixes = false;
718 bool oldformat = false;
719 char *recoded = NULL;
723 if (!(affix = AllocateFile(filename, "r")))
725 (errcode(ERRCODE_CONFIG_FILE_ERROR),
726 errmsg("could not open affix file \"%s\": %m",
729 memset(Conf->flagval, 0, sizeof(Conf->flagval));
730 Conf->usecompound = false;
732 while ((recoded = t_readline(affix)) != NULL)
734 pstr = lowerstr(recoded);
738 /* Skip comments and empty lines */
739 if (*pstr == '#' || *pstr == '\n')
742 if (STRNCMP(pstr, "compoundwords") == 0)
744 s = findchar(pstr, 'l');
747 s = recoded + (s - pstr); /* we need non-lowercased
749 while (*s && !t_isspace(s))
751 while (*s && t_isspace(s))
754 if (*s && pg_mblen(s) == 1)
756 Conf->flagval[(unsigned int) *s] = FF_COMPOUNDFLAG;
757 Conf->usecompound = true;
763 if (STRNCMP(pstr, "suffixes") == 0)
770 if (STRNCMP(pstr, "prefixes") == 0)
777 if (STRNCMP(pstr, "flag") == 0)
779 s = recoded + 4; /* we need non-lowercased string */
782 while (*s && t_isspace(s))
786 /* allow only single-encoded flags */
787 if (pg_mblen(s) != 1)
789 (errcode(ERRCODE_CONFIG_FILE_ERROR),
790 errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
795 flagflags |= FF_CROSSPRODUCT;
800 flagflags |= FF_COMPOUNDONLY;
807 /* allow only single-encoded flags */
808 if (pg_mblen(s) != 1)
810 (errcode(ERRCODE_CONFIG_FILE_ERROR),
811 errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
814 flag = (unsigned char) *s;
817 if (STRNCMP(recoded, "COMPOUNDFLAG") == 0 || STRNCMP(recoded, "COMPOUNDMIN") == 0 ||
818 STRNCMP(recoded, "PFX") == 0 || STRNCMP(recoded, "SFX") == 0)
822 (errcode(ERRCODE_CONFIG_FILE_ERROR),
823 errmsg("wrong affix file format for flag at line %d of affix file \"%s\"",
826 NIImportOOAffixes(Conf, filename);
829 if ((!suffixes) && (!prefixes))
832 if (!parse_affentry(pstr, mask, find, repl, filename, lineno))
835 NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
845 MergeAffix(IspellDict *Conf, int a1, int a2)
849 while (Conf->nAffixData + 1 >= Conf->lenAffixData)
851 Conf->lenAffixData *= 2;
852 Conf->AffixData = (char **) repalloc(Conf->AffixData,
853 sizeof(char *) * Conf->lenAffixData);
856 ptr = Conf->AffixData + Conf->nAffixData;
857 *ptr = palloc(strlen(Conf->AffixData[a1]) + strlen(Conf->AffixData[a2]) +
858 1 /* space */ + 1 /* \0 */ );
859 sprintf(*ptr, "%s %s", Conf->AffixData[a1], Conf->AffixData[a2]);
864 return Conf->nAffixData - 1;
868 makeCompoundFlags(IspellDict *Conf, int affix)
871 char *str = Conf->AffixData[affix];
875 flag |= Conf->flagval[(unsigned int) *str];
879 return (flag & FF_DICTFLAGMASK);
883 mkSPNode(IspellDict *Conf, int low, int high, int level)
887 char lastchar = '\0';
892 for (i = low; i < high; i++)
893 if (Conf->Spell[i]->p.d.len > level && lastchar != Conf->Spell[i]->word[level])
896 lastchar = Conf->Spell[i]->word[level];
902 rs = (SPNode *) palloc0(SPNHDRSZ + nchar * sizeof(SPNodeData));
907 for (i = low; i < high; i++)
908 if (Conf->Spell[i]->p.d.len > level)
910 if (lastchar != Conf->Spell[i]->word[level])
914 data->node = mkSPNode(Conf, lownew, i, level + 1);
918 lastchar = Conf->Spell[i]->word[level];
920 data->val = ((uint8 *) (Conf->Spell[i]->word))[level];
921 if (Conf->Spell[i]->p.d.len == level + 1)
923 bool clearCompoundOnly = false;
925 if (data->isword && data->affix != Conf->Spell[i]->p.d.affix)
928 * MergeAffix may be called several times. If one of the words is
929 * allowed to be in a compound word and another isn't, then
930 * clear the FF_COMPOUNDONLY flag.
933 clearCompoundOnly = (FF_COMPOUNDONLY & data->compoundflag
934 & makeCompoundFlags(Conf, Conf->Spell[i]->p.d.affix))
936 data->affix = MergeAffix(Conf, data->affix, Conf->Spell[i]->p.d.affix);
939 data->affix = Conf->Spell[i]->p.d.affix;
942 data->compoundflag = makeCompoundFlags(Conf, data->affix);
944 if ((data->compoundflag & FF_COMPOUNDONLY) &&
945 (data->compoundflag & FF_COMPOUNDFLAG) == 0)
946 data->compoundflag |= FF_COMPOUNDFLAG;
948 if (clearCompoundOnly)
949 data->compoundflag &= ~FF_COMPOUNDONLY;
953 data->node = mkSPNode(Conf, lownew, high, level + 1);
959 * Builds the Conf->Dictionary tree and AffixData from the imported dictionary
963 NISortDictionary(IspellDict *Conf)
971 /* compress affixes */
973 /* Count the number of different flags used in the dictionary */
975 qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspellaffix);
978 for (i = 0; i < Conf->nspell; i++)
980 if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag, MAXFLAGLEN))
985 * Fill in Conf->AffixData with the affixes that were used in the
986 * dictionary. Replace textual flag-field of Conf->Spell entries with
987 * indexes into Conf->AffixData array.
989 Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
992 for (i = 0; i < Conf->nspell; i++)
994 if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix], MAXFLAGLEN))
997 Assert(curaffix < naffix);
998 Conf->AffixData[curaffix] = pstrdup(Conf->Spell[i]->p.flag);
1001 Conf->Spell[i]->p.d.affix = curaffix;
1002 Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
1005 Conf->lenAffixData = Conf->nAffixData = naffix;
1007 qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
1008 Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
1014 mkANode(IspellDict *Conf, int low, int high, int level, int type)
1018 uint8 lastchar = '\0';
1020 AffixNodeData *data;
1025 for (i = low; i < high; i++)
1026 if (Conf->Affix[i].replen > level && lastchar != GETCHAR(Conf->Affix + i, level, type))
1029 lastchar = GETCHAR(Conf->Affix + i, level, type);
1035 aff = (AFFIX **) tmpalloc(sizeof(AFFIX *) * (high - low + 1));
1038 rs = (AffixNode *) palloc0(ANHRDSZ + nchar * sizeof(AffixNodeData));
1043 for (i = low; i < high; i++)
1044 if (Conf->Affix[i].replen > level)
1046 if (lastchar != GETCHAR(Conf->Affix + i, level, type))
1050 data->node = mkANode(Conf, lownew, i, level + 1, type);
1054 data->aff = (AFFIX **) palloc(sizeof(AFFIX *) * naff);
1055 memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
1061 lastchar = GETCHAR(Conf->Affix + i, level, type);
1063 data->val = GETCHAR(Conf->Affix + i, level, type);
1064 if (Conf->Affix[i].replen == level + 1)
1065 { /* affix stopped */
1066 aff[naff++] = Conf->Affix + i;
1070 data->node = mkANode(Conf, lownew, high, level + 1, type);
1074 data->aff = (AFFIX **) palloc(sizeof(AFFIX *) * naff);
1075 memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
1085 mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix)
1089 int start = (issuffix) ? startsuffix : 0;
1090 int end = (issuffix) ? Conf->naffixes : startsuffix;
1091 AffixNode *Affix = (AffixNode *) palloc0(ANHRDSZ + sizeof(AffixNodeData));
1098 Affix->data->node = Conf->Suffix;
1099 Conf->Suffix = Affix;
1103 Affix->data->node = Conf->Prefix;
1104 Conf->Prefix = Affix;
1108 for (i = start; i < end; i++)
1109 if (Conf->Affix[i].replen == 0)
1115 Affix->data->aff = (AFFIX **) palloc(sizeof(AFFIX *) * cnt);
1116 Affix->data->naff = (uint32) cnt;
1119 for (i = start; i < end; i++)
1120 if (Conf->Affix[i].replen == 0)
1122 Affix->data->aff[cnt] = Conf->Affix + i;
1128 isAffixInUse(IspellDict *Conf, char flag)
1132 for (i = 0; i < Conf->nAffixData; i++)
1133 if (strchr(Conf->AffixData[i], flag) != NULL)
1140 NISortAffixes(IspellDict *Conf)
1145 int firstsuffix = Conf->naffixes;
1149 if (Conf->naffixes == 0)
1152 if (Conf->naffixes > 1)
1153 qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix);
1154 Conf->CompoundAffix = ptr = (CMPDAffix *) palloc(sizeof(CMPDAffix) * Conf->naffixes);
1157 for (i = 0; i < Conf->naffixes; i++)
1159 Affix = &(((AFFIX *) Conf->Affix)[i]);
1160 if (Affix->type == FF_SUFFIX && i < firstsuffix)
1163 if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
1164 isAffixInUse(Conf, (char) Affix->flag))
1166 if (ptr == Conf->CompoundAffix ||
1167 ptr->issuffix != (ptr - 1)->issuffix ||
1168 strbncmp((const unsigned char *) (ptr - 1)->affix,
1169 (const unsigned char *) Affix->repl,
1172 /* keep only unique and minimal suffixes */
1173 ptr->affix = Affix->repl;
1174 ptr->len = Affix->replen;
1175 ptr->issuffix = (Affix->type == FF_SUFFIX) ? true : false;
1181 Conf->CompoundAffix = (CMPDAffix *) repalloc(Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr - Conf->CompoundAffix + 1));
1183 Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
1184 Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
1185 mkVoidAffix(Conf, true, firstsuffix);
1186 mkVoidAffix(Conf, false, firstsuffix);
1189 static AffixNodeData *
1190 FindAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
1192 AffixNodeData *StopLow,
1198 { /* search void affixes */
1199 if (node->data->naff)
1201 node = node->data->node;
1204 while (node && *level < wrdlen)
1206 StopLow = node->data;
1207 StopHigh = node->data + node->length;
1208 while (StopLow < StopHigh)
1210 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
1211 symbol = GETWCHAR(word, wrdlen, *level, type);
1213 if (StopMiddle->val == symbol)
1216 if (StopMiddle->naff)
1218 node = StopMiddle->node;
1221 else if (StopMiddle->val < symbol)
1222 StopLow = StopMiddle + 1;
1224 StopHigh = StopMiddle;
1226 if (StopLow >= StopHigh)
1233 CheckAffix(const char *word, size_t len, AFFIX *Affix, int flagflags, char *newword, int *baselen)
1236 * Check compound allow flags
1241 if (Affix->flagflags & FF_COMPOUNDONLY)
1244 else if (flagflags & FF_COMPOUNDBEGIN)
1246 if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
1248 if ((Affix->flagflags & FF_COMPOUNDBEGIN) == 0)
1249 if (Affix->type == FF_SUFFIX)
1252 else if (flagflags & FF_COMPOUNDMIDDLE)
1254 if ((Affix->flagflags & FF_COMPOUNDMIDDLE) == 0 ||
1255 (Affix->flagflags & FF_COMPOUNDFORBIDFLAG))
1258 else if (flagflags & FF_COMPOUNDLAST)
1260 if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
1262 if ((Affix->flagflags & FF_COMPOUNDLAST) == 0)
1263 if (Affix->type == FF_PREFIX)
1268 * make replace pattern of affix
1270 if (Affix->type == FF_SUFFIX)
1272 strcpy(newword, word);
1273 strcpy(newword + len - Affix->replen, Affix->find);
1274 if (baselen) /* store length of non-changed part of word */
1275 *baselen = len - Affix->replen;
1280 * if the prefix is as long as the whole unchanged part, then the word
1281 * consists only of a prefix and a suffix, so give up
1283 if (baselen && *baselen + strlen(Affix->find) <= Affix->replen)
1285 strcpy(newword, Affix->find);
1286 strcat(newword, word + Affix->replen);
1290 * check resulting word
1292 if (Affix->issimple)
1294 else if (Affix->isregis)
1296 if (RS_execute(&(Affix->reg.regis), newword))
1306 /* Convert data string to wide characters */
1307 newword_len = strlen(newword);
1308 data = (pg_wchar *) palloc((newword_len + 1) * sizeof(pg_wchar));
1309 data_len = pg_mb2wchar_with_len(newword, data, newword_len);
1311 if (!(err = pg_regexec(&(Affix->reg.regex), data, data_len, 0, NULL, 0, NULL, 0)))
1323 addToResult(char **forms, char **cur, char *word)
1325 if (cur - forms >= MAX_NORM - 1)
1327 if (forms == cur || strcmp(word, *(cur - 1)) != 0)
1329 *cur = pstrdup(word);
1338 NormalizeSubWord(IspellDict *Conf, char *word, int flag)
1340 AffixNodeData *suffix = NULL,
1344 int wrdlen = strlen(word),
1348 char newword[2 * MAXNORMLEN] = "";
1349 char pnewword[2 * MAXNORMLEN] = "";
1350 AffixNode *snode = Conf->Suffix,
1355 if (wrdlen > MAXNORMLEN)
1357 cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
1361 /* Check that the word itself is normal form */
1362 if (FindWord(Conf, word, 0, flag))
1364 *cur = pstrdup(word);
1369 /* Find all other NORMAL forms of the 'word' (check only prefix) */
1370 pnode = Conf->Prefix;
1374 prefix = FindAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
1377 for (j = 0; j < prefix->naff; j++)
1379 if (CheckAffix(word, wrdlen, prefix->aff[j], flag, newword, NULL))
1381 /* prefix success */
1382 if (FindWord(Conf, newword, prefix->aff[j]->flag, flag))
1383 cur += addToResult(forms, cur, newword);
1386 pnode = prefix->node;
1390 * Find all other NORMAL forms of the 'word' (check suffix and then
1397 /* find possible suffix */
1398 suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
1401 /* foreach suffix check affix */
1402 for (i = 0; i < suffix->naff; i++)
1404 if (CheckAffix(word, wrdlen, suffix->aff[i], flag, newword, &baselen))
1406 /* suffix success */
1407 if (FindWord(Conf, newword, suffix->aff[i]->flag, flag))
1408 cur += addToResult(forms, cur, newword);
1410 /* now look up the changed word with prefixes */
1411 pnode = Conf->Prefix;
1413 swrdlen = strlen(newword);
1416 prefix = FindAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
1419 for (j = 0; j < prefix->naff; j++)
1421 if (CheckAffix(newword, swrdlen, prefix->aff[j], flag, pnewword, &baselen))
1423 /* prefix success */
1424 int ff = (prefix->aff[j]->flagflags & suffix->aff[i]->flagflags & FF_CROSSPRODUCT) ?
1425 0 : prefix->aff[j]->flag;
1427 if (FindWord(Conf, pnewword, ff, flag))
1428 cur += addToResult(forms, cur, pnewword);
1431 pnode = prefix->node;
1436 snode = suffix->node;
1447 typedef struct SplitVar
1452 struct SplitVar *next;
1456 CheckCompoundAffixes(CMPDAffix **ptr, char *word, int len, bool CheckInPlace)
1462 while ((*ptr)->affix)
1464 if (len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len) == 0)
1467 issuffix = (*ptr)->issuffix;
1469 return (issuffix) ? len : 0;
1478 while ((*ptr)->affix)
1480 if (len > (*ptr)->len && (affbegin = strstr(word, (*ptr)->affix)) != NULL)
1482 len = (*ptr)->len + (affbegin - word);
1483 issuffix = (*ptr)->issuffix;
1485 return (issuffix) ? len : 0;
1494 CopyVar(SplitVar *s, int makedup)
1496 SplitVar *v = (SplitVar *) palloc(sizeof(SplitVar));
1503 v->lenstem = s->lenstem;
1504 v->stem = (char **) palloc(sizeof(char *) * v->lenstem);
1505 v->nstem = s->nstem;
1506 for (i = 0; i < s->nstem; i++)
1507 v->stem[i] = (makedup) ? pstrdup(s->stem[i]) : s->stem[i];
1512 v->stem = (char **) palloc(sizeof(char *) * v->lenstem);
1519 AddStem(SplitVar *v, char *word)
1521 if ( v->nstem >= v->lenstem )
1524 v->stem = (char **) repalloc(v->stem, sizeof(char *) * v->lenstem);
1527 v->stem[v->nstem] = word;
1532 SplitToVariants(IspellDict *Conf, SPNode *snode, SplitVar *orig, char *word, int wordlen, int startpos, int minpos)
1534 SplitVar *var = NULL;
1535 SPNodeData *StopLow,
1538 SPNode *node = (snode) ? snode : Conf->Dictionary;
1539 int level = (snode) ? minpos : startpos; /* recursive
1544 int compoundflag = 0;
1546 notprobed = (char *) palloc(wordlen);
1547 memset(notprobed, 1, wordlen);
1548 var = CopyVar(orig, 1);
1550 while (level < wordlen)
1552 /* find word with epenthetic and/or compound affix */
1553 caff = Conf->CompoundAffix;
1554 while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level, (node) ? true : false)) >= 0)
1557 * there is one of compound affixes, so check word for existings
1559 char buf[MAXNORMLEN];
1562 lenaff = level - startpos + lenaff;
1564 if (!notprobed[startpos + lenaff - 1])
1567 if (level + lenaff - 1 <= minpos)
1570 if ( lenaff >= MAXNORMLEN )
1571 continue; /* skip too big value */
1573 memcpy(buf, word + startpos, lenaff);
1577 compoundflag = FF_COMPOUNDBEGIN;
1578 else if (level == wordlen - 1)
1579 compoundflag = FF_COMPOUNDLAST;
1581 compoundflag = FF_COMPOUNDMIDDLE;
1582 subres = NormalizeSubWord(Conf, buf, compoundflag);
1585 /* Yes, it was a word from dictionary */
1586 SplitVar *new = CopyVar(var, 0);
1587 SplitVar *ptr = var;
1588 char **sptr = subres;
1590 notprobed[startpos + lenaff - 1] = 0;
1594 AddStem( new, *sptr );
1601 ptr->next = SplitToVariants(Conf, NULL, new, word, wordlen, startpos + lenaff, startpos + lenaff);
1611 StopLow = node->data;
1612 StopHigh = node->data + node->length;
1613 while (StopLow < StopHigh)
1615 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
1616 if (StopMiddle->val == ((uint8 *) (word))[level])
1618 else if (StopMiddle->val < ((uint8 *) (word))[level])
1619 StopLow = StopMiddle + 1;
1621 StopHigh = StopMiddle;
1624 if (StopLow < StopHigh)
1626 if (level == FF_COMPOUNDBEGIN)
1627 compoundflag = FF_COMPOUNDBEGIN;
1628 else if (level == wordlen - 1)
1629 compoundflag = FF_COMPOUNDLAST;
1631 compoundflag = FF_COMPOUNDMIDDLE;
1633 /* find infinitive */
1634 if (StopMiddle->isword &&
1635 (StopMiddle->compoundflag & compoundflag) &&
1638 /* ok, we found full compoundallowed word */
1641 /* and its length more than minimal */
1642 if (wordlen == level + 1)
1644 /* well, it was last word */
1645 AddStem( var, pnstrdup(word + startpos, wordlen - startpos) );
1651 /* then search for a longer word starting at the same point */
1652 SplitVar *ptr = var;
1656 ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
1657 /* we can find next word */
1659 AddStem( var, pnstrdup(word + startpos, level - startpos) );
1660 node = Conf->Dictionary;
1666 node = StopMiddle->node;
1673 AddStem( var, pnstrdup(word + startpos, wordlen - startpos) );
1679 addNorm( TSLexeme **lres, TSLexeme **lcur, char *word, int flags, uint16 NVariant)
1681 if ( *lres == NULL )
1682 *lcur = *lres = (TSLexeme *) palloc(MAX_NORM * sizeof(TSLexeme));
1684 if ( *lcur - *lres < MAX_NORM-1 ) {
1685 (*lcur)->lexeme = word;
1686 (*lcur)->flags = flags;
1687 (*lcur)->nvariant = NVariant;
1689 (*lcur)->lexeme = NULL;
1694 NINormalizeWord(IspellDict *Conf, char *word)
1697 TSLexeme *lcur = NULL,
1699 uint16 NVariant = 1;
1701 res = NormalizeSubWord(Conf, word, 0);
1707 while (*ptr && (lcur-lres) < MAX_NORM)
1709 addNorm( &lres, &lcur, *ptr, 0, NVariant++);
1715 if (Conf->usecompound)
1717 int wordlen = strlen(word);
1719 *var = SplitToVariants(Conf, NULL, NULL, word, wordlen, 0, -1);
1726 char **subres = NormalizeSubWord(Conf, var->stem[var->nstem - 1], FF_COMPOUNDLAST);
1730 char **subptr = subres;
1734 for (i = 0; i < var->nstem - 1; i++)
1736 addNorm( &lres, &lcur, (subptr == subres) ? var->stem[i] : pstrdup(var->stem[i]), 0, NVariant);
1739 addNorm( &lres, &lcur, *subptr, 0, NVariant);
1745 var->stem[0] = NULL;
1746 pfree(var->stem[var->nstem - 1]);
1750 for (i = 0; i < var->nstem && var->stem[i]; i++)
1751 pfree(var->stem[i]);