* spell.c
* Normalizing word with ISpell
*
- * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/tsearch/spell.c,v 1.15 2009/01/29 16:22:10 teodor Exp $
+ * src/backend/tsearch/spell.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
+#include "catalog/pg_collation.h"
#include "tsearch/dicts/spell.h"
#include "tsearch/ts_locale.h"
#include "utils/memutils.h"
/*
* Initialization requires a lot of memory that's not needed
- * after the initialization is done. In init function,
- * CurrentMemoryContext is a long lived memory context associated
- * with the dictionary cache entry, so we use a temporary context
- * for the short-lived stuff.
+ * after the initialization is done. During initialization,
+ * CurrentMemoryContext is the long-lived memory context associated
+ * with the dictionary cache entry. We keep the short-lived stuff
+ * in the Conf->buildCxt context.
*/
-static MemoryContext tmpCtx = NULL;
+#define tmpalloc(sz) MemoryContextAlloc(Conf->buildCxt, (sz))
+#define tmpalloc0(sz) MemoryContextAllocZero(Conf->buildCxt, (sz))
-#define tmpalloc(sz) MemoryContextAlloc(tmpCtx, (sz))
-#define tmpalloc0(sz) MemoryContextAllocZero(tmpCtx, (sz))
-
-static void
-checkTmpCtx(void)
+/*
+ * Prepare for constructing an ISpell dictionary.
+ *
+ * The IspellDict struct is assumed to be zeroed when allocated.
+ *
+ * This creates Conf->buildCxt, the memory context in which tmpalloc(),
+ * tmpalloc0() and lowerstr_ctx() place short-lived build data.
+ */
+void
+NIStartBuild(IspellDict *Conf)
{
/*
- * XXX: This assumes that CurrentMemoryContext doesn't have any children
- * other than the one we create here.
+ * The temp context is a child of CurTransactionContext, so that it will
+ * go away automatically on error.
*/
- if (CurrentMemoryContext->firstchild == NULL)
+ Conf->buildCxt = AllocSetContextCreate(CurTransactionContext,
+ "Ispell dictionary init context",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+}
+
+/*
+ * Clean up when dictionary construction is complete.
+ *
+ * Deletes the Conf->buildCxt memory context and then sets the buildCxt,
+ * Spell and firstfree fields of *Conf to NULL.  compact_palloc0() asserts
+ * that buildCxt is non-NULL, so it cannot be used after this call.
+ */
+void
+NIFinishBuild(IspellDict *Conf)
+{
+ /* Release no-longer-needed temp memory */
+ MemoryContextDelete(Conf->buildCxt);
+ /* Just for cleanliness, zero the now-dangling pointers */
+ Conf->buildCxt = NULL;
+ Conf->Spell = NULL;
+ Conf->firstfree = NULL;
+}
+
+
+/*
+ * "Compact" palloc: allocate without extra palloc overhead.
+ *
+ * Since we have no need to free the ispell data items individually, there's
+ * not much value in the per-chunk overhead normally consumed by palloc.
+ * Getting rid of it is helpful since ispell can allocate a lot of small nodes.
+ *
+ * We currently pre-zero all data allocated this way, even though some of it
+ * doesn't need that. The cpalloc and cpalloc0 macros are just documentation
+ * to indicate which allocations actually require zeroing.
+ *
+ * Requests larger than COMPACT_MAX_REQ are passed straight to palloc0.
+ * Smaller requests are rounded up with MAXALIGN and carved sequentially out
+ * of the current COMPACT_ALLOC_CHUNK-byte chunk, tracked by Conf->firstfree
+ * (next free byte) and Conf->avail (bytes left).  When a request does not
+ * fit in what is left, a new chunk is obtained with palloc0 and the
+ * remainder of the previous chunk is simply left unused.
+ *
+ * Both macros expand to a reference to a variable named Conf, so they can
+ * only be used where an IspellDict pointer of that name is in scope.
+ */
+#define COMPACT_ALLOC_CHUNK 8192 /* amount to get from palloc at once */
+#define COMPACT_MAX_REQ 1024 /* must be < COMPACT_ALLOC_CHUNK */
+
+static void *
+compact_palloc0(IspellDict *Conf, size_t size)
+{
+ void *result;
+
+ /* Should only be called during init */
+ Assert(Conf->buildCxt != NULL);
+
+ /* No point in this for large chunks */
+ if (size > COMPACT_MAX_REQ)
+ return palloc0(size);
+
+ /* Keep everything maxaligned */
+ size = MAXALIGN(size);
+
+ /* Need more space? */
+ if (size > Conf->avail)
{
- tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
- "Ispell dictionary init context",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
+ Conf->firstfree = palloc0(COMPACT_ALLOC_CHUNK);
+ Conf->avail = COMPACT_ALLOC_CHUNK;
}
- else
- tmpCtx = CurrentMemoryContext->firstchild;
+
+ result = (void *) Conf->firstfree;
+ Conf->firstfree += size;
+ Conf->avail -= size;
+
+ return result;
}
+#define cpalloc(size) compact_palloc0(Conf, size)
+#define cpalloc0(size) compact_palloc0(Conf, size)
+
static char *
-lowerstr_ctx(char *src)
+cpstrdup(IspellDict *Conf, const char *str)
+{
+ char *res = cpalloc(strlen(str) + 1); /* +1 for the terminating NUL; cpalloc is compact_palloc0(Conf, ...) */
+
+ strcpy(res, str);
+ return res; /* copy of str held in compact_palloc0 storage */
+}
+
+
+/*
+ * Apply lowerstr(), producing a temporary result (in the buildCxt).
+ */
+static char *
+lowerstr_ctx(IspellDict *Conf, const char *src)
{
MemoryContext saveCtx;
char *dst;
- saveCtx = MemoryContextSwitchTo(tmpCtx);
+ saveCtx = MemoryContextSwitchTo(Conf->buildCxt);
dst = lowerstr(src);
MemoryContextSwitchTo(saveCtx);
#define MAXNORMLEN 256
#define STRNCMP(s,p) strncmp( (s), (p), strlen(p) )
-#define GETWCHAR(W,L,N,T) ( ((uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] )
+#define GETWCHAR(W,L,N,T) ( ((const uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] )
#define GETCHAR(A,N,T) GETWCHAR( (A)->repl, (A)->replen, N, T )
static char *VoidString = "";
static int
cmpspell(const void *s1, const void *s2)
{
- return (strcmp((*(const SPELL **) s1)->word, (*(const SPELL **) s2)->word));
+ return (strcmp((*(SPELL *const *) s1)->word, (*(SPELL *const *) s2)->word)); /* s1, s2 point to SPELL * array elements; order by word via strcmp */
}
static int
cmpspellaffix(const void *s1, const void *s2)
{
- return (strncmp((*(const SPELL **) s1)->p.flag, (*(const SPELL **) s2)->p.flag, MAXFLAGLEN));
+ return (strncmp((*(SPELL *const *) s1)->p.flag, (*(SPELL *const *) s2)->p.flag, MAXFLAGLEN)); /* s1, s2 point to SPELL * array elements; order by p.flag, comparing at most MAXFLAGLEN bytes */
}
static char *
return 0;
}
+
static int
strbncmp(const unsigned char *s1, const unsigned char *s2, size_t count)
{
{
if (Conf->mspell)
{
- Conf->mspell += 1024 * 20;
+ Conf->mspell *= 2;
Conf->Spell = (SPELL **) repalloc(Conf->Spell, Conf->mspell * sizeof(SPELL *));
}
else
tsearch_readline_state trst;
char *line;
- checkTmpCtx();
-
if (!tsearch_readline_begin(&trst, filename))
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
}
s += pg_mblen(s);
}
- pstr = lowerstr_ctx(line);
+ pstr = lowerstr_ctx(Conf, line);
NIAddSpell(Conf, pstr, flag);
pfree(pstr);
SPNodeData *StopLow,
*StopHigh,
*StopMiddle;
- uint8 *ptr = (uint8 *) word;
+ const uint8 *ptr = (const uint8 *) word;
flag &= FF_DICTFLAGMASK;
{
if (Conf->maffixes)
{
- Conf->maffixes += 16;
+ Conf->maffixes *= 2;
Conf->Affix = (AFFIX *) repalloc((void *) Conf->Affix, Conf->maffixes * sizeof(AFFIX));
}
else
wmask = (pg_wchar *) tmpalloc((masklen + 1) * sizeof(pg_wchar));
wmasklen = pg_mb2wchar_with_len(tmask, wmask, masklen);
- err = pg_regcomp(&(Affix->reg.regex), wmask, wmasklen, REG_ADVANCED | REG_NOSUB);
+ err = pg_regcomp(&(Affix->reg.regex), wmask, wmasklen,
+ REG_ADVANCED | REG_NOSUB,
+ DEFAULT_COLLATION_OID);
if (err)
{
char errstr[100];
Affix->flag = flag;
Affix->type = type;
- Affix->find = (find && *find) ? pstrdup(find) : VoidString;
+ Affix->find = (find && *find) ? cpstrdup(Conf, find) : VoidString;
if ((Affix->replen = strlen(repl)) > 0)
- Affix->repl = pstrdup(repl);
+ Affix->repl = cpstrdup(Conf, repl);
else
Affix->repl = VoidString;
Conf->naffixes++;
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("multibyte flag character is not allowed")));
- Conf->flagval[*(unsigned char*) s] = (unsigned char) val;
+ Conf->flagval[*(unsigned char *) s] = (unsigned char) val;
Conf->usecompound = true;
}
char scanbuf[BUFSIZ];
char *recoded;
- checkTmpCtx();
-
/* read file to find any flag */
memset(Conf->flagval, 0, sizeof(Conf->flagval));
Conf->usecompound = false;
if (ptype)
pfree(ptype);
- ptype = lowerstr_ctx(type);
+ ptype = lowerstr_ctx(Conf, type);
if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
goto nextline;
if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
goto nextline;
- prepl = lowerstr_ctx(repl);
+ prepl = lowerstr_ctx(Conf, repl);
/* affix flag */
if ((ptr = strchr(prepl, '/')) != NULL)
{
ptr = repl + (ptr - prepl) + 1;
while (*ptr)
{
- aflg |= Conf->flagval[*(unsigned char*) ptr];
+ aflg |= Conf->flagval[*(unsigned char *) ptr];
ptr++;
}
}
- pfind = lowerstr_ctx(find);
- pmask = lowerstr_ctx(mask);
+ pfind = lowerstr_ctx(Conf, find);
+ pmask = lowerstr_ctx(Conf, mask);
if (t_iseq(find, '0'))
*pfind = '\0';
if (t_iseq(repl, '0'))
bool oldformat = false;
char *recoded = NULL;
- checkTmpCtx();
-
if (!tsearch_readline_begin(&trst, filename))
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
if (*s && pg_mblen(s) == 1)
{
- Conf->flagval[*(unsigned char*) s] = FF_COMPOUNDFLAG;
+ Conf->flagval[*(unsigned char *) s] = FF_COMPOUNDFLAG;
Conf->usecompound = true;
}
oldformat = true;
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("multibyte flag character is not allowed")));
- flag = *(unsigned char*) s;
+ flag = *(unsigned char *) s;
goto nextline;
}
if (STRNCMP(recoded, "COMPOUNDFLAG") == 0 || STRNCMP(recoded, "COMPOUNDMIN") == 0 ||
}
ptr = Conf->AffixData + Conf->nAffixData;
- *ptr = palloc(strlen(Conf->AffixData[a1]) + strlen(Conf->AffixData[a2]) +
- 1 /* space */ + 1 /* \0 */ );
+ *ptr = cpalloc(strlen(Conf->AffixData[a1]) +
+ strlen(Conf->AffixData[a2]) +
+ 1 /* space */ + 1 /* \0 */ );
sprintf(*ptr, "%s %s", Conf->AffixData[a1], Conf->AffixData[a2]);
ptr++;
*ptr = NULL;
while (str && *str)
{
- flag |= Conf->flagval[*(unsigned char*) str];
+ flag |= Conf->flagval[*(unsigned char *) str];
str++;
}
if (!nchar)
return NULL;
- rs = (SPNode *) palloc0(SPNHDRSZ + nchar * sizeof(SPNodeData));
+ rs = (SPNode *) cpalloc0(SPNHDRSZ + nchar * sizeof(SPNodeData));
rs->length = nchar;
data = rs->data;
int naffix = 0;
int curaffix;
- checkTmpCtx();
-
/* compress affixes */
/* Count the number of different flags used in the dictionary */
{
curaffix++;
Assert(curaffix < naffix);
- Conf->AffixData[curaffix] = pstrdup(Conf->Spell[i]->p.flag);
+ Conf->AffixData[curaffix] = cpstrdup(Conf, Conf->Spell[i]->p.flag);
}
Conf->Spell[i]->p.d.affix = curaffix;
qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
-
- Conf->Spell = NULL;
}
static AffixNode *
aff = (AFFIX **) tmpalloc(sizeof(AFFIX *) * (high - low + 1));
naff = 0;
- rs = (AffixNode *) palloc0(ANHRDSZ + nchar * sizeof(AffixNodeData));
+ rs = (AffixNode *) cpalloc0(ANHRDSZ + nchar * sizeof(AffixNodeData));
rs->length = nchar;
data = rs->data;
if (naff)
{
data->naff = naff;
- data->aff = (AFFIX **) palloc(sizeof(AFFIX *) * naff);
+ data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
naff = 0;
}
if (naff)
{
data->naff = naff;
- data->aff = (AFFIX **) palloc(sizeof(AFFIX *) * naff);
+ data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
naff = 0;
}
if (cnt == 0)
return;
- Affix->data->aff = (AFFIX **) palloc(sizeof(AFFIX *) * cnt);
+ Affix->data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * cnt);
Affix->data->naff = (uint32) cnt;
cnt = 0;
CMPDAffix *ptr;
int firstsuffix = Conf->naffixes;
- checkTmpCtx();
-
if (Conf->naffixes == 0)
return;
if (forms == cur || strcmp(word, *(cur - 1)) != 0)
{
*cur = pstrdup(word);
- *(cur+1) = NULL;
+ *(cur + 1) = NULL;
return 1;
}
static void
AddStem(SplitVar *v, char *word)
{
- if ( v->nstem >= v->lenstem )
+ if (v->nstem >= v->lenstem)
{
v->lenstem *= 2;
v->stem = (char **) repalloc(v->stem, sizeof(char *) * v->lenstem);
if (level + lenaff - 1 <= minpos)
continue;
- if ( lenaff >= MAXNORMLEN )
- continue; /* skip too big value */
+ if (lenaff >= MAXNORMLEN)
+ continue; /* skip too big value */
if (lenaff > 0)
memcpy(buf, word + startpos, lenaff);
buf[lenaff] = '\0';
while (*sptr)
{
- AddStem( new, *sptr );
+ AddStem(new, *sptr);
sptr++;
}
pfree(subres);
if (wordlen == level + 1)
{
/* well, it was last word */
- AddStem( var, pnstrdup(word + startpos, wordlen - startpos) );
+ AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
pfree(notprobed);
return var;
}
ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
/* we can find next word */
level++;
- AddStem( var, pnstrdup(word + startpos, level - startpos) );
+ AddStem(var, pnstrdup(word + startpos, level - startpos));
node = Conf->Dictionary;
startpos = level;
continue;
level++;
}
- AddStem( var, pnstrdup(word + startpos, wordlen - startpos) );
+ AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
pfree(notprobed);
return var;
}
static void
-addNorm( TSLexeme **lres, TSLexeme **lcur, char *word, int flags, uint16 NVariant)
+addNorm(TSLexeme **lres, TSLexeme **lcur, char *word, int flags, uint16 NVariant)
{
- if ( *lres == NULL )
+ if (*lres == NULL)
*lcur = *lres = (TSLexeme *) palloc(MAX_NORM * sizeof(TSLexeme));
- if ( *lcur - *lres < MAX_NORM-1 ) {
+ if (*lcur - *lres < MAX_NORM - 1)
+ {
(*lcur)->lexeme = word;
(*lcur)->flags = flags;
(*lcur)->nvariant = NVariant;
{
char **ptr = res;
- while (*ptr && (lcur-lres) < MAX_NORM)
+ while (*ptr && (lcur - lres) < MAX_NORM)
{
- addNorm( &lres, &lcur, *ptr, 0, NVariant++);
+ addNorm(&lres, &lcur, *ptr, 0, NVariant++);
ptr++;
}
pfree(res);
{
for (i = 0; i < var->nstem - 1; i++)
{
- addNorm( &lres, &lcur, (subptr == subres) ? var->stem[i] : pstrdup(var->stem[i]), 0, NVariant);
+ addNorm(&lres, &lcur, (subptr == subres) ? var->stem[i] : pstrdup(var->stem[i]), 0, NVariant);
}
- addNorm( &lres, &lcur, *subptr, 0, NVariant);
+ addNorm(&lres, &lcur, *subptr, 0, NVariant);
subptr++;
NVariant++;
}