From d78a7d9c7fa3e9cd494b906f065fe7b7fe9fb9a5 Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Fri, 4 Mar 2016 20:08:10 +0300 Subject: [PATCH] Improve support of Hunspell in ispell dictionary. Now it's possible to load recent version of Hunspell for several languages. To handle these dictionaries Hunspell patch adds support for: * FLAG long - sets the double extended ASCII character flag type * FLAG num - sets the decimal number flag type (from 1 to 65535) * AF parameter - alias for flag's set Also it moves test dictionaries into separate directory. Author: Artur Zakirov with editorization by me --- doc/src/sgml/textsearch.sgml | 148 ++++- src/backend/tsearch/Makefile | 7 +- .../tsearch/{ => dicts}/hunspell_sample.affix | 0 .../tsearch/dicts/hunspell_sample_long.affix | 35 + .../tsearch/dicts/hunspell_sample_long.dict | 8 + .../tsearch/dicts/hunspell_sample_num.affix | 26 + .../tsearch/dicts/hunspell_sample_num.dict | 8 + .../tsearch/{ => dicts}/ispell_sample.affix | 0 .../tsearch/{ => dicts}/ispell_sample.dict | 0 .../tsearch/{ => dicts}/synonym_sample.syn | 0 .../tsearch/{ => dicts}/thesaurus_sample.ths | 0 src/backend/tsearch/spell.c | 611 ++++++++++++++++-- src/include/tsearch/dicts/spell.h | 51 +- src/test/regress/expected/tsdicts.out | 234 +++++++ src/test/regress/sql/tsdicts.sql | 64 ++ 15 files changed, 1103 insertions(+), 89 deletions(-) rename src/backend/tsearch/{ => dicts}/hunspell_sample.affix (100%) create mode 100644 src/backend/tsearch/dicts/hunspell_sample_long.affix create mode 100644 src/backend/tsearch/dicts/hunspell_sample_long.dict create mode 100644 src/backend/tsearch/dicts/hunspell_sample_num.affix create mode 100644 src/backend/tsearch/dicts/hunspell_sample_num.dict rename src/backend/tsearch/{ => dicts}/ispell_sample.affix (100%) rename src/backend/tsearch/{ => dicts}/ispell_sample.dict (100%) rename src/backend/tsearch/{ => dicts}/synonym_sample.syn (100%) rename src/backend/tsearch/{ => dicts}/thesaurus_sample.ths (100%) diff --git 
a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index d66b4d5d5f..ff99976068 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -2615,18 +2615,41 @@ SELECT plainto_tsquery('supernova star'); - To create an Ispell dictionary, use the built-in - ispell template and specify several parameters: + To create an Ispell dictionary perform these steps: - + + + + download dictionary configuration files. OpenOffice + extension files have the .oxt extension. It is necessary + to extract .aff and .dic files, change + extensions to .affix and .dict. For some + dictionary files it is also needed to convert characters to the UTF-8 + encoding with commands (for example, for norwegian language dictionary): -CREATE TEXT SEARCH DICTIONARY english_ispell ( +iconv -f ISO_8859-1 -t UTF-8 -o nn_no.affix nn_NO.aff +iconv -f ISO_8859-1 -t UTF-8 -o nn_no.dict nn_NO.dic + + + + + + copy files to the $SHAREDIR/tsearch_data directory + + + + + load files into PostgreSQL with the following command: + +CREATE TEXT SEARCH DICTIONARY english_hunspell ( TEMPLATE = ispell, - DictFile = english, - AffFile = english, - StopWords = english -); + DictFile = en_us, + AffFile = en_us, + Stopwords = english); + + + Here, DictFile, AffFile, and StopWords @@ -2642,6 +2665,56 @@ CREATE TEXT SEARCH DICTIONARY english_ispell ( example, a Snowball dictionary, which recognizes everything. + + The .affix file of Ispell has the following + structure: + +prefixes +flag *A: + . 
> RE # As in enter > reenter +suffixes +flag T: + E > ST # As in late > latest + [^AEIOU]Y > -Y,IEST # As in dirty > dirtiest + [AEIOU]Y > EST # As in gray > grayest + [^EY] > EST # As in small > smallest + + + + And the .dict file has the following structure: + +lapse/ADGRS +lard/DGRS +large/PRTY +lark/MRS + + + + + Format of the .dict file is: + +basic_form/affix_class_name + + + + + In the .affix file every affix flag is described in the + following format: + +condition > [-stripping_letters,] adding_affix + + + + + Here, condition has a format similar to the format of regular expressions. + It can use groupings [...] and [^...]. + For example, [AEIOU]Y means that the last letter of the word + is "y" and the penultimate letter is "a", + "e", "i", "o" or "u". + [^EY] means that the last letter is neither "e" + nor "y". + + Ispell dictionaries support splitting compound words; a useful feature. @@ -2663,6 +2736,65 @@ SELECT ts_lexize('norwegian_ispell', 'sjokoladefabrikk'); + + MySpell format is a subset of Hunspell. + The .affix file of Hunspell has the following + structure: + +PFX A Y 1 +PFX A 0 re . +SFX T N 4 +SFX T 0 st e +SFX T y iest [^aeiou]y +SFX T 0 est [aeiou]y +SFX T 0 est [^ey] + + + + + The first line of an affix class is the header. Fields of an affix rules are + listed after the header: + + + + + parameter name (PFX or SFX) + + + + + flag (name of the affix class) + + + + + stripping characters from beginning (at prefix) or end (at suffix) of the + word + + + + + adding affix + + + + + condition that has a format similar to the format of regular expressions. + + + + + + The .dict file looks like the .dict file of + Ispell: + +larder/M +lardy/RT +large/RSPMYT +largehearted + + + MySpell does not support compound words. 
diff --git a/src/backend/tsearch/Makefile b/src/backend/tsearch/Makefile index cdccf525d0..21d209ae14 100644 --- a/src/backend/tsearch/Makefile +++ b/src/backend/tsearch/Makefile @@ -13,8 +13,11 @@ include $(top_builddir)/src/Makefile.global DICTDIR=tsearch_data -DICTFILES=synonym_sample.syn thesaurus_sample.ths hunspell_sample.affix \ - ispell_sample.affix ispell_sample.dict +DICTFILES=dicts/synonym_sample.syn dicts/thesaurus_sample.ths \ + dicts/hunspell_sample.affix \ + dicts/ispell_sample.affix dicts/ispell_sample.dict \ + dicts/hunspell_sample_long.affix dicts/hunspell_sample_long.dict \ + dicts/hunspell_sample_num.affix dicts/hunspell_sample_num.dict OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \ dict_simple.o dict_synonym.o dict_thesaurus.o \ diff --git a/src/backend/tsearch/hunspell_sample.affix b/src/backend/tsearch/dicts/hunspell_sample.affix similarity index 100% rename from src/backend/tsearch/hunspell_sample.affix rename to src/backend/tsearch/dicts/hunspell_sample.affix diff --git a/src/backend/tsearch/dicts/hunspell_sample_long.affix b/src/backend/tsearch/dicts/hunspell_sample_long.affix new file mode 100644 index 0000000000..d8e60493ad --- /dev/null +++ b/src/backend/tsearch/dicts/hunspell_sample_long.affix @@ -0,0 +1,35 @@ +FLAG long + +AF 7 +AF cZ #1 +AF cL #2 +AF sGsJpUsS #3 +AF sSpB #4 +AF cZsS #5 +AF sScZs\ #6 +AF sA #7 + +COMPOUNDFLAG cZ +ONLYINCOMPOUND cL + +PFX pB Y 1 +PFX pB 0 re . + +PFX pU N 1 +PFX pU 0 un . 
+ +SFX sJ Y 1 +SFX sJ 0 INGS [^E] + +SFX sG Y 1 +SFX sG 0 ING [^E] + +SFX sS Y 1 +SFX sS 0 S [^SXZHY] + +SFX sA Y 1 +SFX sA Y IES [^AEIOU]Y + +SFX s\ N 1 +SFX s\ 0 Y/2 [^Y] + diff --git a/src/backend/tsearch/dicts/hunspell_sample_long.dict b/src/backend/tsearch/dicts/hunspell_sample_long.dict new file mode 100644 index 0000000000..96ecbf007a --- /dev/null +++ b/src/backend/tsearch/dicts/hunspell_sample_long.dict @@ -0,0 +1,8 @@ +book/3 +booking/4 +footballklubber +foot/5 +football/1 +ball/6 +klubber/1 +sky/7 diff --git a/src/backend/tsearch/dicts/hunspell_sample_num.affix b/src/backend/tsearch/dicts/hunspell_sample_num.affix new file mode 100644 index 0000000000..ba1e8f8002 --- /dev/null +++ b/src/backend/tsearch/dicts/hunspell_sample_num.affix @@ -0,0 +1,26 @@ +FLAG num + +COMPOUNDFLAG 101 +ONLYINCOMPOUND 102 + +PFX 201 Y 1 +PFX 201 0 re . + +PFX 202 N 1 +PFX 202 0 un . + +SFX 301 Y 1 +SFX 301 0 INGS [^E] + +SFX 302 Y 1 +SFX 302 0 ING [^E] + +SFX 303 Y 1 +SFX 303 0 S [^SXZHY] + +SFX 304 Y 1 +SFX 304 Y IES [^AEIOU]Y + +SFX 305 N 1 +SFX 305 0 Y/102 [^Y] + diff --git a/src/backend/tsearch/dicts/hunspell_sample_num.dict b/src/backend/tsearch/dicts/hunspell_sample_num.dict new file mode 100644 index 0000000000..9db29dc780 --- /dev/null +++ b/src/backend/tsearch/dicts/hunspell_sample_num.dict @@ -0,0 +1,8 @@ +book/302,301,202,303 +booking/303,201 +footballklubber +foot/101,303 +football/101 +ball/303,101,305 +klubber/101 +sky/304 diff --git a/src/backend/tsearch/ispell_sample.affix b/src/backend/tsearch/dicts/ispell_sample.affix similarity index 100% rename from src/backend/tsearch/ispell_sample.affix rename to src/backend/tsearch/dicts/ispell_sample.affix diff --git a/src/backend/tsearch/ispell_sample.dict b/src/backend/tsearch/dicts/ispell_sample.dict similarity index 100% rename from src/backend/tsearch/ispell_sample.dict rename to src/backend/tsearch/dicts/ispell_sample.dict diff --git a/src/backend/tsearch/synonym_sample.syn 
b/src/backend/tsearch/dicts/synonym_sample.syn similarity index 100% rename from src/backend/tsearch/synonym_sample.syn rename to src/backend/tsearch/dicts/synonym_sample.syn diff --git a/src/backend/tsearch/thesaurus_sample.ths b/src/backend/tsearch/dicts/thesaurus_sample.ths similarity index 100% rename from src/backend/tsearch/thesaurus_sample.ths rename to src/backend/tsearch/dicts/thesaurus_sample.ths diff --git a/src/backend/tsearch/spell.c b/src/backend/tsearch/spell.c index 49280b831a..be3432693d 100644 --- a/src/backend/tsearch/spell.c +++ b/src/backend/tsearch/spell.c @@ -5,6 +5,54 @@ * * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group * + * Ispell dictionary + * ----------------- + * + * Rules of dictionaries are defined in two files with .affix and .dict + * extensions. They are used by spell checker programs Ispell and Hunspell. + * + * An .affix file declares morphological rules to get a basic form of words. + * The format of an .affix file has different structure for Ispell and Hunspell + * dictionaries. The Hunspell format is more complicated. But when an .affix + * file is imported and compiled, it is stored in the same structure AffixNode. + * + * A .dict file stores a list of basic forms of words with references to + * affix rules. The format of a .dict file has the same structure for Ispell + * and Hunspell dictionaries. + * + * Compilation of a dictionary + * --------------------------- + * + * A compiled dictionary is stored in the IspellDict structure. Compilation of + * a dictionary is divided into the several steps: + * - NIImportDictionary() - stores each word of a .dict file in the + * temporary Spell field. + * - NIImportAffixes() - stores affix rules of an .affix file in the + * Affix field (not temporary) if an .affix file has the Ispell format. + * -> NIImportOOAffixes() - stores affix rules if an .affix file has the + * Hunspell format. The AffixData field is initialized if AF parameter + * is defined. 
+ * - NISortDictionary() - builds a prefix tree (Trie) from the words list + * and stores it in the Dictionary field. The words list is taken from the + * Spell field. The AffixData field is initialized if AF parameter is not + * defined. + * - NISortAffixes(): + * - builds a list of compound affixes from the affix list and stores it + * in the CompoundAffix field. + * - builds prefix trees (Trie) from the affix list for prefixes and suffixes + * and stores them in Suffix and Prefix fields. + * The affix list is taken from the Affix field. + * + * Memory management + * ----------------- + * + * The IspellDict structure has the Spell field which is used only at compile + * time. The Spell field stores a words list. It can take a lot of memory. + * Therefore when a dictionary is compiled this field is cleared by + * NIFinishBuild(). + * + * All resources which should be cleared by NIFinishBuild() are initialized using + * tmpalloc() and tmpalloc0(). * * IDENTIFICATION * src/backend/tsearch/spell.c @@ -150,10 +198,12 @@ cmpspell(const void *s1, const void *s2) { return (strcmp((*(SPELL *const *) s1)->word, (*(SPELL *const *) s2)->word)); } + static int cmpspellaffix(const void *s1, const void *s2) { - return (strncmp((*(SPELL *const *) s1)->p.flag, (*(SPELL *const *) s2)->p.flag, MAXFLAGLEN)); + return (strcmp((*(SPELL *const *) s1)->p.flag, + (*(SPELL *const *) s2)->p.flag)); } static char * @@ -220,6 +270,11 @@ strbncmp(const unsigned char *s1, const unsigned char *s2, size_t count) return 0; } +/* + * Compares affixes. + * First compares the type of an affix. Prefixes should go before suffixes. + * If types are equal then compares replaceable string. + */ static int cmpaffix(const void *s1, const void *s2) { @@ -237,6 +292,176 @@ cmpaffix(const void *s1, const void *s2) (const unsigned char *) a2->repl); } +/* + * Gets an affix flag from string representation (a set of affixes). + * + * Several flags can be stored in a single string. 
Flags can be represented by: + * - 1 character (FM_CHAR). + * - 2 characters (FM_LONG). + * - numbers from 1 to 65000 (FM_NUM). + * + * Depending on the flagMode an affix string can have the following format: + * - FM_CHAR: ABCD + * Here we have 4 flags: A, B, C and D + * - FM_LONG: ABCDE* + * Here we have 3 flags: AB, CD and E* + * - FM_NUM: 200,205,50 + * Here we have 3 flags: 200, 205 and 50 + * + * Conf: current dictionary. + * sflag: string representation (a set of affixes) of an affix flag. + * sflagnext: returns reference to the start of a next affix flag in the sflag. + * + * Returns an integer representation of the affix flag. + */ +static uint16 +DecodeFlag(IspellDict *Conf, char *sflag, char **sflagnext) +{ + int32 s; + char *next; + unsigned char *usflag; + + switch (Conf->flagMode) + { + case FM_LONG: + /* + * Hunspell docs say a flag can contain only + * ASCII characters + */ + if (!(pg_mblen(sflag) == 1 && isascii(sflag[0]) && + pg_mblen(sflag + 1) == 1 && isascii(sflag[1]))) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("non-ASCII affix flag \"%s\"", sflag))); + + usflag = (unsigned char *)sflag; + s = ((int)usflag[0]) << 8 | ((int)usflag[1]); + if (sflagnext) + /* Go to start of the next flag */ + *sflagnext = sflag + 2; + break; + case FM_NUM: + s = strtol(sflag, &next, 10); + if (s <= 0 || s > FLAGNUM_MAXSIZE) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("invalid affix flag \"%s\"", sflag))); + + if (sflagnext) + { + /* Go to start of the next flag: skip spaces and exactly one comma */ + if (next) + { + bool met_comma = false; + + while (*next) + { + if (!(pg_mblen(next) == 1 && isascii(*next))) + { + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("non-ASCII affix flag \"%s\"", + sflag))); + } + else if (isdigit(*next)) + { + if (!met_comma) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("invalid affix flag \"%s\"", + sflag))); + break; + } + else if (*next == ',') + { + if (met_comma) + ereport(ERROR, + 
(errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("invalid affix flag \"%s\"", + sflag))); + met_comma = true; + } + else if (!isspace(*next)) + { + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("invalid character in affix flag \"%s\"", sflag))); + } + + next++; + } + + if (*next == '\0') + next = NULL; + } + + *sflagnext = next; + } + break; + default: + if (!(pg_mblen(sflag) == 1 && isascii(*sflag))) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("non-ASCII affix flag \"%s\"", sflag))); + + s = *sflag; + if (sflagnext) + /* Go to start of the next flag */ + *sflagnext = sflag + pg_mblen(sflag); + } + + return (uint16)s; +} + +/* + * Checks if the affix set Conf->AffixData[affix] contains affixflag. + * Conf->AffixData[affix] is the string representation of a set of affix flags. + * Conf->AffixData[affix] does not contain affixflag if this flag is not + * actually used by the .dict file. + * + * Conf: current dictionary. + * affix: index of the Conf->AffixData array. + * affixflag: integer representation of the affix flag. + * + * Returns true if the string Conf->AffixData[affix] contains affixflag or if + * affixflag is 0, otherwise returns false. + */ +static bool +IsAffixFlagInUse(IspellDict *Conf, int affix, uint16 affixflag) +{ + char *flagcur; + char *flagnext = NULL; + + if (affixflag == 0) + return true; + + flagcur = Conf->AffixData[affix]; + + while (*flagcur) + { + /* Compare first affix flag in flagcur with affixflag */ + if (DecodeFlag(Conf, flagcur, &flagnext) == affixflag) + return true; + /* Otherwise go to next flag */ + if (flagnext) + flagcur = flagnext; + /* If there are no more flags then exit */ + else + break; + } + + /* Could not find affixflag */ + return false; +} + +/* + * Adds the new word into the temporary array Spell. + * + * Conf: current dictionary. + * word: new word. + * flag: set of affix flags. Integer representation of flag can be got by + * DecodeFlag(). 
+ */ static void NIAddSpell(IspellDict *Conf, const char *word, const char *flag) { @@ -255,14 +480,18 @@ NIAddSpell(IspellDict *Conf, const char *word, const char *flag) } Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1); strcpy(Conf->Spell[Conf->nspell]->word, word); - strlcpy(Conf->Spell[Conf->nspell]->p.flag, flag, MAXFLAGLEN); + Conf->Spell[Conf->nspell]->p.flag = (*flag != '\0') + ? cpstrdup(Conf, flag) : VoidString; Conf->nspell++; } /* - * import dictionary + * Imports dictionary into the temporary array Spell. * - * Note caller must already have applied get_tsearch_config_filename + * Note caller must already have applied get_tsearch_config_filename. + * + * Conf: current dictionary. + * filename: path to the .dict file. */ void NIImportDictionary(IspellDict *Conf, const char *filename) @@ -280,6 +509,7 @@ NIImportDictionary(IspellDict *Conf, const char *filename) { char *s, *pstr; + /* Set of affix flags */ const char *flag; /* Extract flag from the line */ @@ -324,7 +554,30 @@ NIImportDictionary(IspellDict *Conf, const char *filename) tsearch_readline_end(&trst); } - +/* + * Searches a basic form of word in the prefix tree. This word was generated + * using an affix rule. This rule may not be presented in an affix set of + * a basic form of word. + * + * For example, we have the entry in the .dict file: + * meter/GMD + * + * The affix rule with the flag S: + * SFX S y ies [^aeiou]y + * is not presented here. + * + * The affix rule with the flag M: + * SFX M 0 's . + * is presented here. + * + * Conf: current dictionary. + * word: basic form of word. + * affixflag: integer representation of the affix flag, by which a basic form of + * word was generated. + * flag: compound flag used to compare with StopMiddle->compoundflag. + * + * Returns 1 if the word was found in the prefix tree, else returns 0. 
+ */ static int FindWord(IspellDict *Conf, const char *word, int affixflag, int flag) { @@ -349,13 +602,22 @@ FindWord(IspellDict *Conf, const char *word, int affixflag, int flag) { if (flag == 0) { + /* + * The word can be formed only with another word. + * And in the flag parameter there is not a sign + * that we search compound words. + */ if (StopMiddle->compoundflag & FF_COMPOUNDONLY) return 0; } else if ((flag & StopMiddle->compoundflag) == 0) return 0; - if ((affixflag == 0) || (strchr(Conf->AffixData[StopMiddle->affix], affixflag) != NULL)) + /* + * Check if this affix rule is presented in the affix set + * with index StopMiddle->affix. + */ + if (IsAffixFlagInUse(Conf, StopMiddle->affix, affixflag)) return 1; } node = StopMiddle->node; @@ -373,6 +635,24 @@ FindWord(IspellDict *Conf, const char *word, int affixflag, int flag) return 0; } +/* + * Adds a new affix rule to the Affix field. + * + * Conf: current dictionary. + * flag: integer representation of the affix flag ('\' in the below example). + * flagflags: set of flags from the flagval field for this affix rule. This set + * is listed after '/' character in the added string (repl). + * + * For example L flag in the hunspell_sample.affix: + * SFX \ 0 Y/L [^Y] + * + * mask: condition for search ('[^Y]' in the above example). + * find: stripping characters from beginning (at prefix) or end (at suffix) + * of the word ('0' in the above example, 0 means that there is not + * stripping character). + * repl: adding string after stripping ('Y' in the above example). + * type: FF_SUFFIX or FF_PREFIX. 
+ */ static void NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const char *find, const char *repl, int type) { @@ -394,18 +674,21 @@ NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const c Affix = Conf->Affix + Conf->naffixes; - if (strcmp(mask, ".") == 0) + /* This affix rule can be applied for words with any ending */ + if (strcmp(mask, ".") == 0 || *mask == '\0') { Affix->issimple = 1; Affix->isregis = 0; } + /* This affix rule will use regis to search word ending */ else if (RS_isRegis(mask)) { Affix->issimple = 0; Affix->isregis = 1; - RS_compile(&(Affix->reg.regis), (type == FF_SUFFIX) ? true : false, + RS_compile(&(Affix->reg.regis), (type == FF_SUFFIX), *mask ? mask : VoidString); } + /* This affix rule will use regex_t to search word ending */ else { int masklen; @@ -457,7 +740,6 @@ NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const c Conf->naffixes++; } - /* Parsing states for parse_affentry() and friends */ #define PAE_WAIT_MASK 0 #define PAE_INMASK 1 @@ -712,9 +994,16 @@ parse_affentry(char *str, char *mask, char *find, char *repl) *pmask = *pfind = *prepl = '\0'; - return (*mask && (*find || *repl)) ? true : false; + return (*mask && (*find || *repl)); } +/* + * Sets up a correspondence for the affix parameter with the affix flag. + * + * Conf: current dictionary. + * s: affix flag in string. + * val: affix parameter. + */ static void addFlagValue(IspellDict *Conf, char *s, uint32 val) { @@ -731,12 +1020,66 @@ addFlagValue(IspellDict *Conf, char *s, uint32 val) (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("multibyte flag character is not allowed"))); - Conf->flagval[*(unsigned char *) s] = (unsigned char) val; + Conf->flagval[DecodeFlag(Conf, s, (char **)NULL)] = (unsigned char) val; Conf->usecompound = true; } /* - * Import an affix file that follows MySpell or Hunspell format + * Returns a set of affix parameters which correspondence to the set of affix + * flags s. 
+ */ +static int +getFlagValues(IspellDict *Conf, char *s) +{ + uint32 flag = 0; + char *flagcur; + char *flagnext = NULL; + + flagcur = s; + while (*flagcur) + { + flag |= Conf->flagval[DecodeFlag(Conf, flagcur, &flagnext)]; + if (flagnext) + flagcur = flagnext; + else + break; + } + + return flag; +} + +/* + * Returns a flag set using the s parameter. + * + * If Conf->useFlagAliases is true then the s parameter is an index into the + * Conf->AffixData array and the function returns that entry (VoidString if the + * index is out of range). Else the function returns the s parameter. + */ +static char * +getFlags(IspellDict *Conf, char *s) +{ + if (Conf->useFlagAliases) + { + int curaffix = strtol(s, (char **)NULL, 10); + + if (curaffix > 0 && curaffix < Conf->nAffixData) + /* + * Do not subtract 1 from curaffix: slot 0 holds the empty flag set added in + * NIImportOOAffixes, so valid aliases are 1 .. nAffixData - 1 + */ + return Conf->AffixData[curaffix]; + else + return VoidString; + } + else + return s; +} + +/* + * Import an affix file that follows MySpell or Hunspell format. + * + * Conf: current dictionary. + * filename: path to the .affix file. 
*/ static void NIImportOOAffixes(IspellDict *Conf, const char *filename) @@ -751,7 +1094,10 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename) char repl[BUFSIZ], *prepl; bool isSuffix = false; - int flag = 0; + int naffix = 0, + curaffix = 0; + int flag = 0, + sflaglen = 0; char flagflags = 0; tsearch_readline_state trst; char *recoded; @@ -759,6 +1105,8 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename) /* read file to find any flag */ memset(Conf->flagval, 0, sizeof(Conf->flagval)); Conf->usecompound = false; + Conf->useFlagAliases = false; + Conf->flagMode = FM_CHAR; if (!tsearch_readline_begin(&trst, filename)) ereport(ERROR, @@ -806,10 +1154,18 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename) while (*s && t_isspace(s)) s += pg_mblen(s); - if (*s && STRNCMP(s, "default") != 0) - ereport(ERROR, + if (*s) + { + if (STRNCMP(s, "long") == 0) + Conf->flagMode = FM_LONG; + else if (STRNCMP(s, "num") == 0) + Conf->flagMode = FM_NUM; + else if (STRNCMP(s, "default") != 0) + ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), - errmsg("Ispell dictionary supports only default flag value"))); + errmsg("Ispell dictionary supports only default, " + "long and num flag value"))); + } } pfree(recoded); @@ -834,27 +1190,77 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename) if (ptype) pfree(ptype); ptype = lowerstr_ctx(Conf, type); + + /* First try to parse AF parameter (alias compression) */ + if (STRNCMP(ptype, "af") == 0) + { + /* First line is the number of aliases */ + if (!Conf->useFlagAliases) + { + Conf->useFlagAliases = true; + naffix = atoi(sflag); + if (naffix == 0) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("invalid number of flag vector aliases"))); + + /* Also reserve place for empty flag set */ + naffix++; + + Conf->AffixData = (char **) palloc0(naffix * sizeof(char *)); + Conf->lenAffixData = Conf->nAffixData = naffix; + + /* Add empty flag set into AffixData */ + Conf->AffixData[curaffix] = 
VoidString; + curaffix++; + } + /* Other lines is aliases */ + else + { + if (curaffix < naffix) + { + Conf->AffixData[curaffix] = cpstrdup(Conf, sflag); + curaffix++; + } + } + goto nextline; + } + /* Else try to parse prefixes and suffixes */ if (fields_read < 4 || (STRNCMP(ptype, "sfx") != 0 && STRNCMP(ptype, "pfx") != 0)) goto nextline; + sflaglen = strlen(sflag); + if (sflaglen == 0 + || (sflaglen > 1 && Conf->flagMode == FM_CHAR) + || (sflaglen > 2 && Conf->flagMode == FM_LONG)) + goto nextline; + + /* + * Affix header. For example: + * SFX \ N 1 + */ if (fields_read == 4) { - if (strlen(sflag) != 1) - goto nextline; - flag = *sflag; - isSuffix = (STRNCMP(ptype, "sfx") == 0) ? true : false; + /* Convert the affix flag to int */ + flag = DecodeFlag(Conf, sflag, (char **)NULL); + + isSuffix = (STRNCMP(ptype, "sfx") == 0); if (t_iseq(find, 'y') || t_iseq(find, 'Y')) flagflags = FF_CROSSPRODUCT; else flagflags = 0; } + /* + * Affix fields. For example: + * SFX \ 0 Y/L [^Y] + */ else { char *ptr; int aflg = 0; - if (strlen(sflag) != 1 || flag != *sflag || flag == 0) + if (flag == 0) goto nextline; prepl = lowerstr_ctx(Conf, repl); /* Find position of '/' in lowercased string "prepl" */ @@ -866,11 +1272,7 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename) */ *ptr = '\0'; ptr = repl + (ptr - prepl) + 1; - while (*ptr) - { - aflg |= Conf->flagval[*(unsigned char *) ptr]; - ptr++; - } + aflg |= getFlagValues(Conf, getFlags(Conf, ptr)); } pfind = lowerstr_ctx(Conf, find); pmask = lowerstr_ctx(Conf, mask); @@ -928,6 +1330,8 @@ NIImportAffixes(IspellDict *Conf, const char *filename) memset(Conf->flagval, 0, sizeof(Conf->flagval)); Conf->usecompound = false; + Conf->useFlagAliases = false; + Conf->flagMode = FM_CHAR; while ((recoded = tsearch_readline(&trst)) != NULL) { @@ -1044,6 +1448,12 @@ isnewformat: NIImportOOAffixes(Conf, filename); } +/* + * Merges two affix flag sets and stores a new affix flag set into + * Conf->AffixData. 
+ * + * Returns index of a new affix flag set. + */ static int MergeAffix(IspellDict *Conf, int a1, int a2) { @@ -1068,21 +1478,25 @@ MergeAffix(IspellDict *Conf, int a1, int a2) return Conf->nAffixData - 1; } +/* + * Returns a set of affix parameters which correspondence to the set of affix + * flags with the given index. + */ static uint32 makeCompoundFlags(IspellDict *Conf, int affix) { - uint32 flag = 0; - char *str = Conf->AffixData[affix]; - - while (str && *str) - { - flag |= Conf->flagval[*(unsigned char *) str]; - str++; - } - - return (flag & FF_DICTFLAGMASK); + char *str = Conf->AffixData[affix]; + return (getFlagValues(Conf, str) & FF_DICTFLAGMASK); } +/* + * Makes a prefix tree for the given level. + * + * Conf: current dictionary. + * low: lower index of the Conf->Spell array. + * high: upper index of the Conf->Spell array. + * level: current prefix tree level. + */ static SPNode * mkSPNode(IspellDict *Conf, int low, int high, int level) { @@ -1115,6 +1529,7 @@ mkSPNode(IspellDict *Conf, int low, int high, int level) { if (lastchar) { + /* Next level of the prefix tree */ data->node = mkSPNode(Conf, lownew, i, level + 1); lownew = i; data++; @@ -1154,6 +1569,7 @@ mkSPNode(IspellDict *Conf, int low, int high, int level) } } + /* Next level of the prefix tree */ data->node = mkSPNode(Conf, lownew, high, level + 1); return rs; @@ -1172,44 +1588,83 @@ NISortDictionary(IspellDict *Conf) /* compress affixes */ - /* Count the number of different flags used in the dictionary */ - - qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspellaffix); - - naffix = 0; - for (i = 0; i < Conf->nspell; i++) + /* + * If we use flag aliases then we need to use Conf->AffixData filled + * in the NIImportOOAffixes(). 
+ */ + if (Conf->useFlagAliases) { - if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag, MAXFLAGLEN)) - naffix++; + for (i = 0; i < Conf->nspell; i++) + { + curaffix = strtol(Conf->Spell[i]->p.flag, (char **)NULL, 10); + if (curaffix && curaffix <= Conf->nAffixData) + Conf->Spell[i]->p.d.affix = curaffix; + else + /* + * If Conf->Spell[i]->p.flag is empty, then get empty value of + * Conf->AffixData (0 index). + */ + Conf->Spell[i]->p.d.affix = 0; + Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word); + } } + /* Otherwise fill Conf->AffixData here */ + else + { + /* Count the number of different flags used in the dictionary */ + qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), + cmpspellaffix); - /* - * Fill in Conf->AffixData with the affixes that were used in the - * dictionary. Replace textual flag-field of Conf->Spell entries with - * indexes into Conf->AffixData array. - */ - Conf->AffixData = (char **) palloc0(naffix * sizeof(char *)); + naffix = 0; + for (i = 0; i < Conf->nspell; i++) + { + if (i == 0 + || strcmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag)) + naffix++; + } - curaffix = -1; - for (i = 0; i < Conf->nspell; i++) - { - if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix], MAXFLAGLEN)) + /* + * Fill in Conf->AffixData with the affixes that were used in the + * dictionary. Replace textual flag-field of Conf->Spell entries with + * indexes into Conf->AffixData array. 
+ */ + Conf->AffixData = (char **) palloc0(naffix * sizeof(char *)); + + curaffix = -1; + for (i = 0; i < Conf->nspell; i++) { - curaffix++; - Assert(curaffix < naffix); - Conf->AffixData[curaffix] = cpstrdup(Conf, Conf->Spell[i]->p.flag); + if (i == 0 + || strcmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix])) + { + curaffix++; + Assert(curaffix < naffix); + Conf->AffixData[curaffix] = cpstrdup(Conf, + Conf->Spell[i]->p.flag); + } + + Conf->Spell[i]->p.d.affix = curaffix; + Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word); } - Conf->Spell[i]->p.d.affix = curaffix; - Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word); + Conf->lenAffixData = Conf->nAffixData = naffix; } - Conf->lenAffixData = Conf->nAffixData = naffix; - + /* Start build a prefix tree */ qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell); Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0); } +/* + * Makes a prefix tree for the given level using the repl string of an affix + * rule. Affixes with empty replace string do not include in the prefix tree. + * This affixes are included by mkVoidAffix(). + * + * Conf: current dictionary. + * low: lower index of the Conf->Affix array. + * high: upper index of the Conf->Affix array. + * level: current prefix tree level. + * type: FF_SUFFIX or FF_PREFIX. + */ static AffixNode * mkANode(IspellDict *Conf, int low, int high, int level, int type) { @@ -1247,6 +1702,7 @@ mkANode(IspellDict *Conf, int low, int high, int level, int type) { if (lastchar) { + /* Next level of the prefix tree */ data->node = mkANode(Conf, lownew, i, level + 1, type); if (naff) { @@ -1267,6 +1723,7 @@ mkANode(IspellDict *Conf, int low, int high, int level, int type) } } + /* Next level of the prefix tree */ data->node = mkANode(Conf, lownew, high, level + 1, type); if (naff) { @@ -1281,6 +1738,10 @@ mkANode(IspellDict *Conf, int low, int high, int level, int type) return rs; } +/* + * Makes the root void node in the prefix tree. 
The root void node is created + * for affixes which have empty replace string ("repl" field). + */ static void mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix) { @@ -1304,11 +1765,12 @@ mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix) Conf->Prefix = Affix; } - + /* Count affixes with empty replace string */ for (i = start; i < end; i++) if (Conf->Affix[i].replen == 0) cnt++; + /* There are no affixes with empty replace string */ if (cnt == 0) return; @@ -1324,18 +1786,31 @@ mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix) } } +/* + * Checks if the affixflag is used by the dictionary. Conf->AffixData does not + * contain affixflag if this flag is not actually used by the .dict file. + * + * Conf: current dictionary. + * affixflag: integer representation of the affix flag. + * + * Returns true if the Conf->AffixData array contains affixflag, otherwise + * returns false. + */ static bool -isAffixInUse(IspellDict *Conf, char flag) +isAffixInUse(IspellDict *Conf, uint16 affixflag) { int i; for (i = 0; i < Conf->nAffixData; i++) - if (strchr(Conf->AffixData[i], flag) != NULL) + if (IsAffixFlagInUse(Conf, i, affixflag)) return true; return false; } +/* + * Builds Conf->Prefix and Conf->Suffix trees from the imported affixes.
+ */ void NISortAffixes(IspellDict *Conf) { @@ -1347,6 +1822,7 @@ NISortAffixes(IspellDict *Conf) if (Conf->naffixes == 0) return; + /* Store compound affixes in the Conf->CompoundAffix array */ if (Conf->naffixes > 1) qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix); Conf->CompoundAffix = ptr = (CMPDAffix *) palloc(sizeof(CMPDAffix) * Conf->naffixes); @@ -1359,7 +1835,7 @@ NISortAffixes(IspellDict *Conf) firstsuffix = i; if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 && - isAffixInUse(Conf, (char) Affix->flag)) + isAffixInUse(Conf, Affix->flag)) { if (ptr == Conf->CompoundAffix || ptr->issuffix != (ptr - 1)->issuffix || @@ -1370,7 +1846,7 @@ NISortAffixes(IspellDict *Conf) /* leave only unique and minimals suffixes */ ptr->affix = Affix->repl; ptr->len = Affix->replen; - ptr->issuffix = (Affix->type == FF_SUFFIX) ? true : false; + ptr->issuffix = (Affix->type == FF_SUFFIX); ptr++; } } @@ -1378,6 +1854,7 @@ NISortAffixes(IspellDict *Conf) ptr->affix = NULL; Conf->CompoundAffix = (CMPDAffix *) repalloc(Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr - Conf->CompoundAffix + 1)); + /* Start building a prefix tree */ Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX); Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX); mkVoidAffix(Conf, true, firstsuffix); @@ -1825,7 +2302,7 @@ SplitToVariants(IspellDict *Conf, SPNode *snode, SplitVar *orig, char *word, int if (StopLow < StopHigh) { - if (level == FF_COMPOUNDBEGIN) + if (startpos == 0) compoundflag = FF_COMPOUNDBEGIN; else if (level == wordlen - 1) compoundflag = FF_COMPOUNDLAST; diff --git a/src/include/tsearch/dicts/spell.h b/src/include/tsearch/dicts/spell.h index 7e2df833e8..8116bd2016 100644 --- a/src/include/tsearch/dicts/spell.h +++ b/src/include/tsearch/dicts/spell.h @@ -19,18 +19,18 @@ #include "tsearch/ts_public.h" /* - * Max length of a flag name. Names longer than this will be truncated - * to the maximum.
+ * SPNode and SPNodeData are used to represent prefix tree (Trie) to store + * a words list. */ -#define MAXFLAGLEN 16 - struct SPNode; typedef struct { uint32 val:8, isword:1, + /* Stores compound flags listed below */ compoundflag:4, + /* Reference to an entry of the AffixData field */ affix:19; struct SPNode *node; } SPNodeData; @@ -43,7 +43,8 @@ typedef struct #define FF_COMPOUNDBEGIN 0x02 #define FF_COMPOUNDMIDDLE 0x04 #define FF_COMPOUNDLAST 0x08 -#define FF_COMPOUNDFLAG ( FF_COMPOUNDBEGIN | FF_COMPOUNDMIDDLE | FF_COMPOUNDLAST ) +#define FF_COMPOUNDFLAG ( FF_COMPOUNDBEGIN | FF_COMPOUNDMIDDLE | \ + FF_COMPOUNDLAST ) #define FF_DICTFLAGMASK 0x0f typedef struct SPNode @@ -54,19 +55,24 @@ typedef struct SPNode #define SPNHDRSZ (offsetof(SPNode,data)) - +/* + * Represents an entry in a words list. + */ typedef struct spell_struct { union { /* - * flag is filled in by NIImportDictionary. After NISortDictionary, d - * is valid and flag is invalid. + * flag is filled in by NIImportDictionary(). After NISortDictionary(), + * d is used instead of flag. */ - char flag[MAXFLAGLEN]; + char *flag; + /* d is used in mkSPNode() */ struct { + /* Reference to an entry of the AffixData field */ int affix; + /* Length of the word */ int len; } d; } p; @@ -75,10 +81,14 @@ typedef struct spell_struct #define SPELLHDRSZ (offsetof(SPELL, word)) +/* + * Represents an entry in an affix list. + */ typedef struct aff_struct { - uint32 flag:8, - type:1, + uint32 flag:16; + /* FF_SUFFIX or FF_PREFIX */ + uint32 type:1, flagflags:7, issimple:1, isregis:1, @@ -106,6 +116,10 @@ typedef struct aff_struct #define FF_SUFFIX 1 #define FF_PREFIX 0 +/* + * AffixNode and AffixNodeData are used to represent prefix tree (Trie) to store + * an affix list. 
+ */ struct AffixNode; typedef struct @@ -132,6 +146,16 @@ typedef struct bool issuffix; } CMPDAffix; +typedef enum +{ + FM_CHAR, + FM_LONG, + FM_NUM +} FlagMode; + +#define FLAGCHAR_MAXSIZE (1 << 8) +#define FLAGNUM_MAXSIZE (1 << 16) + typedef struct { int maffixes; @@ -142,14 +166,17 @@ typedef struct AffixNode *Prefix; SPNode *Dictionary; + /* Array of sets of affixes */ char **AffixData; int lenAffixData; int nAffixData; + bool useFlagAliases; CMPDAffix *CompoundAffix; - unsigned char flagval[256]; + unsigned char flagval[FLAGNUM_MAXSIZE]; bool usecompound; + FlagMode flagMode; /* * Remaining fields are only used during dictionary construction; they are diff --git a/src/test/regress/expected/tsdicts.out b/src/test/regress/expected/tsdicts.out index 9df1434a14..ef86295f88 100644 --- a/src/test/regress/expected/tsdicts.out +++ b/src/test/regress/expected/tsdicts.out @@ -191,6 +191,198 @@ SELECT ts_lexize('hunspell', 'footballyklubber'); {foot,ball,klubber} (1 row) +-- Test ISpell dictionary with hunspell affix file with FLAG long parameter +CREATE TEXT SEARCH DICTIONARY hunspell_long ( + Template=ispell, + DictFile=hunspell_sample_long, + AffFile=hunspell_sample_long +); +SELECT ts_lexize('hunspell_long', 'skies'); + ts_lexize +----------- + {sky} +(1 row) + +SELECT ts_lexize('hunspell_long', 'bookings'); + ts_lexize +---------------- + {booking,book} +(1 row) + +SELECT ts_lexize('hunspell_long', 'booking'); + ts_lexize +---------------- + {booking,book} +(1 row) + +SELECT ts_lexize('hunspell_long', 'foot'); + ts_lexize +----------- + {foot} +(1 row) + +SELECT ts_lexize('hunspell_long', 'foots'); + ts_lexize +----------- + {foot} +(1 row) + +SELECT ts_lexize('hunspell_long', 'rebookings'); + ts_lexize +---------------- + {booking,book} +(1 row) + +SELECT ts_lexize('hunspell_long', 'rebooking'); + ts_lexize +---------------- + {booking,book} +(1 row) + +SELECT ts_lexize('hunspell_long', 'rebook'); + ts_lexize +----------- + +(1 row) + +SELECT 
ts_lexize('hunspell_long', 'unbookings'); + ts_lexize +----------- + {book} +(1 row) + +SELECT ts_lexize('hunspell_long', 'unbooking'); + ts_lexize +----------- + {book} +(1 row) + +SELECT ts_lexize('hunspell_long', 'unbook'); + ts_lexize +----------- + {book} +(1 row) + +SELECT ts_lexize('hunspell_long', 'footklubber'); + ts_lexize +---------------- + {foot,klubber} +(1 row) + +SELECT ts_lexize('hunspell_long', 'footballklubber'); + ts_lexize +------------------------------------------------------ + {footballklubber,foot,ball,klubber,football,klubber} +(1 row) + +SELECT ts_lexize('hunspell_long', 'ballyklubber'); + ts_lexize +---------------- + {ball,klubber} +(1 row) + +SELECT ts_lexize('hunspell_long', 'footballyklubber'); + ts_lexize +--------------------- + {foot,ball,klubber} +(1 row) + +-- Test ISpell dictionary with hunspell affix file with FLAG num parameter +CREATE TEXT SEARCH DICTIONARY hunspell_num ( + Template=ispell, + DictFile=hunspell_sample_num, + AffFile=hunspell_sample_num +); +SELECT ts_lexize('hunspell_num', 'skies'); + ts_lexize +----------- + {sky} +(1 row) + +SELECT ts_lexize('hunspell_num', 'bookings'); + ts_lexize +---------------- + {booking,book} +(1 row) + +SELECT ts_lexize('hunspell_num', 'booking'); + ts_lexize +---------------- + {booking,book} +(1 row) + +SELECT ts_lexize('hunspell_num', 'foot'); + ts_lexize +----------- + {foot} +(1 row) + +SELECT ts_lexize('hunspell_num', 'foots'); + ts_lexize +----------- + {foot} +(1 row) + +SELECT ts_lexize('hunspell_num', 'rebookings'); + ts_lexize +---------------- + {booking,book} +(1 row) + +SELECT ts_lexize('hunspell_num', 'rebooking'); + ts_lexize +---------------- + {booking,book} +(1 row) + +SELECT ts_lexize('hunspell_num', 'rebook'); + ts_lexize +----------- + +(1 row) + +SELECT ts_lexize('hunspell_num', 'unbookings'); + ts_lexize +----------- + {book} +(1 row) + +SELECT ts_lexize('hunspell_num', 'unbooking'); + ts_lexize +----------- + {book} +(1 row) + +SELECT 
ts_lexize('hunspell_num', 'unbook'); + ts_lexize +----------- + {book} +(1 row) + +SELECT ts_lexize('hunspell_num', 'footklubber'); + ts_lexize +---------------- + {foot,klubber} +(1 row) + +SELECT ts_lexize('hunspell_num', 'footballklubber'); + ts_lexize +------------------------------------------------------ + {footballklubber,foot,ball,klubber,football,klubber} +(1 row) + +SELECT ts_lexize('hunspell_num', 'ballyklubber'); + ts_lexize +---------------- + {ball,klubber} +(1 row) + +SELECT ts_lexize('hunspell_num', 'footballyklubber'); + ts_lexize +--------------------- + {foot,ball,klubber} +(1 row) + -- Synonim dictionary CREATE TEXT SEARCH DICTIONARY synonym ( Template=synonym, @@ -277,6 +469,48 @@ SELECT to_tsquery('hunspell_tst', 'footballyklubber:b & rebookings:A & sky'); 'foot':B & 'ball':B & 'klubber':B & ( 'booking':A | 'book':A ) & 'sky' (1 row) +-- Test ispell dictionary with hunspell affix with FLAG long in configuration +ALTER TEXT SEARCH CONFIGURATION hunspell_tst ALTER MAPPING + REPLACE hunspell WITH hunspell_long; +SELECT to_tsvector('hunspell_tst', 'Booking the skies after rebookings for footballklubber from a foot'); + to_tsvector +---------------------------------------------------------------------------------------------------- + 'ball':7 'book':1,5 'booking':1,5 'foot':7,10 'football':7 'footballklubber':7 'klubber':7 'sky':3 +(1 row) + +SELECT to_tsquery('hunspell_tst', 'footballklubber'); + to_tsquery +------------------------------------------------------------------------------ + ( 'footballklubber' | 'foot' & 'ball' & 'klubber' ) | 'football' & 'klubber' +(1 row) + +SELECT to_tsquery('hunspell_tst', 'footballyklubber:b & rebookings:A & sky'); + to_tsquery +------------------------------------------------------------------------ + 'foot':B & 'ball':B & 'klubber':B & ( 'booking':A | 'book':A ) & 'sky' +(1 row) + +-- Test ispell dictionary with hunspell affix with FLAG num in configuration +ALTER TEXT SEARCH CONFIGURATION hunspell_tst ALTER 
MAPPING + REPLACE hunspell_long WITH hunspell_num; +SELECT to_tsvector('hunspell_tst', 'Booking the skies after rebookings for footballklubber from a foot'); + to_tsvector +---------------------------------------------------------------------------------------------------- + 'ball':7 'book':1,5 'booking':1,5 'foot':7,10 'football':7 'footballklubber':7 'klubber':7 'sky':3 +(1 row) + +SELECT to_tsquery('hunspell_tst', 'footballklubber'); + to_tsquery +------------------------------------------------------------------------------ + ( 'footballklubber' | 'foot' & 'ball' & 'klubber' ) | 'football' & 'klubber' +(1 row) + +SELECT to_tsquery('hunspell_tst', 'footballyklubber:b & rebookings:A & sky'); + to_tsquery +------------------------------------------------------------------------ + 'foot':B & 'ball':B & 'klubber':B & ( 'booking':A | 'book':A ) & 'sky' +(1 row) + -- Test synonym dictionary in configuration CREATE TEXT SEARCH CONFIGURATION synonym_tst ( COPY=english diff --git a/src/test/regress/sql/tsdicts.sql b/src/test/regress/sql/tsdicts.sql index 55afcec906..d13ce2e378 100644 --- a/src/test/regress/sql/tsdicts.sql +++ b/src/test/regress/sql/tsdicts.sql @@ -48,6 +48,54 @@ SELECT ts_lexize('hunspell', 'footballklubber'); SELECT ts_lexize('hunspell', 'ballyklubber'); SELECT ts_lexize('hunspell', 'footballyklubber'); +-- Test ISpell dictionary with hunspell affix file with FLAG long parameter +CREATE TEXT SEARCH DICTIONARY hunspell_long ( + Template=ispell, + DictFile=hunspell_sample_long, + AffFile=hunspell_sample_long +); + +SELECT ts_lexize('hunspell_long', 'skies'); +SELECT ts_lexize('hunspell_long', 'bookings'); +SELECT ts_lexize('hunspell_long', 'booking'); +SELECT ts_lexize('hunspell_long', 'foot'); +SELECT ts_lexize('hunspell_long', 'foots'); +SELECT ts_lexize('hunspell_long', 'rebookings'); +SELECT ts_lexize('hunspell_long', 'rebooking'); +SELECT ts_lexize('hunspell_long', 'rebook'); +SELECT ts_lexize('hunspell_long', 'unbookings'); +SELECT 
ts_lexize('hunspell_long', 'unbooking'); +SELECT ts_lexize('hunspell_long', 'unbook'); + +SELECT ts_lexize('hunspell_long', 'footklubber'); +SELECT ts_lexize('hunspell_long', 'footballklubber'); +SELECT ts_lexize('hunspell_long', 'ballyklubber'); +SELECT ts_lexize('hunspell_long', 'footballyklubber'); + +-- Test ISpell dictionary with hunspell affix file with FLAG num parameter +CREATE TEXT SEARCH DICTIONARY hunspell_num ( + Template=ispell, + DictFile=hunspell_sample_num, + AffFile=hunspell_sample_num +); + +SELECT ts_lexize('hunspell_num', 'skies'); +SELECT ts_lexize('hunspell_num', 'bookings'); +SELECT ts_lexize('hunspell_num', 'booking'); +SELECT ts_lexize('hunspell_num', 'foot'); +SELECT ts_lexize('hunspell_num', 'foots'); +SELECT ts_lexize('hunspell_num', 'rebookings'); +SELECT ts_lexize('hunspell_num', 'rebooking'); +SELECT ts_lexize('hunspell_num', 'rebook'); +SELECT ts_lexize('hunspell_num', 'unbookings'); +SELECT ts_lexize('hunspell_num', 'unbooking'); +SELECT ts_lexize('hunspell_num', 'unbook'); + +SELECT ts_lexize('hunspell_num', 'footklubber'); +SELECT ts_lexize('hunspell_num', 'footballklubber'); +SELECT ts_lexize('hunspell_num', 'ballyklubber'); +SELECT ts_lexize('hunspell_num', 'footballyklubber'); + -- Synonim dictionary CREATE TEXT SEARCH DICTIONARY synonym ( Template=synonym, @@ -94,6 +142,22 @@ SELECT to_tsvector('hunspell_tst', 'Booking the skies after rebookings for footb SELECT to_tsquery('hunspell_tst', 'footballklubber'); SELECT to_tsquery('hunspell_tst', 'footballyklubber:b & rebookings:A & sky'); +-- Test ispell dictionary with hunspell affix with FLAG long in configuration +ALTER TEXT SEARCH CONFIGURATION hunspell_tst ALTER MAPPING + REPLACE hunspell WITH hunspell_long; + +SELECT to_tsvector('hunspell_tst', 'Booking the skies after rebookings for footballklubber from a foot'); +SELECT to_tsquery('hunspell_tst', 'footballklubber'); +SELECT to_tsquery('hunspell_tst', 'footballyklubber:b & rebookings:A & sky'); + +-- Test ispell dictionary 
with hunspell affix with FLAG num in configuration +ALTER TEXT SEARCH CONFIGURATION hunspell_tst ALTER MAPPING + REPLACE hunspell_long WITH hunspell_num; + +SELECT to_tsvector('hunspell_tst', 'Booking the skies after rebookings for footballklubber from a foot'); +SELECT to_tsquery('hunspell_tst', 'footballklubber'); +SELECT to_tsquery('hunspell_tst', 'footballyklubber:b & rebookings:A & sky'); + -- Test synonym dictionary in configuration CREATE TEXT SEARCH CONFIGURATION synonym_tst ( COPY=english -- 2.40.0