Fix serious performance problems in json(b) to_tsvector().

author Tom Lane <tgl@sss.pgh.pa.us>

Tue, 18 Jul 2017 16:45:51 +0000 (12:45 -0400)

committer Tom Lane <tgl@sss.pgh.pa.us>

Tue, 18 Jul 2017 16:45:51 +0000 (12:45 -0400)
author Tom Lane <tgl@sss.pgh.pa.us>
Tue, 18 Jul 2017 16:45:51 +0000 (12:45 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Tue, 18 Jul 2017 16:45:51 +0000 (12:45 -0400)
diff --git a/src/backend/tsearch/to_tsany.c b/src/backend/tsearch/to_tsany.c

index 6400440756d95e9f32897b1267fdd75eef58da0c..b410a49908add20e0ccf2a15dc14f9f1ab2a8336 100644 (file)
--- a/src/backend/tsearch/to_tsany.c
+++ b/src/backend/tsearch/to_tsany.c
@@ -28,11 +28,11 @@ typedef struct MorphOpaque
  typedef struct TSVectorBuildState
  {
         ParsedText *prs;
-       TSVector        result;
         Oid                     cfgId;
  } TSVectorBuildState;
  
-static void add_to_tsvector(void *state, char *elem_value, int elem_len);
+static void add_to_tsvector(void *_state, char *elem_value, int elem_len);
+
  
  Datum
  get_current_ts_config(PG_FUNCTION_ARGS)
@@ -270,34 +270,33 @@ jsonb_to_tsvector_byid(PG_FUNCTION_ARGS)
  {
         Oid                     cfgId = PG_GETARG_OID(0);
         Jsonb      *jb = PG_GETARG_JSONB(1);
+       TSVector        result;
         TSVectorBuildState state;
-       ParsedText *prs = (ParsedText *) palloc(sizeof(ParsedText));
+       ParsedText      prs;
  
-       prs->words = NULL;
-       state.result = NULL;
+       prs.words = NULL;
+       prs.curwords = 0;
+       state.prs = &prs;
         state.cfgId = cfgId;
-       state.prs = prs;
  
-       iterate_jsonb_string_values(jb, &state, (JsonIterateStringValuesAction) add_to_tsvector);
+       iterate_jsonb_string_values(jb, &state, add_to_tsvector);
  
-       PG_FREE_IF_COPY(jb, 1);
-
-       if (state.result == NULL)
+       if (prs.curwords > 0)
+               result = make_tsvector(&prs);
+       else
         {
                 /*
-                * There weren't any string elements in jsonb, so wee need to return
-                * an empty vector
+                * There weren't any string elements in jsonb, so we need to return an
+                * empty vector
                  */
-
-               if (prs->words != NULL)
-                       pfree(prs->words);
-
-               state.result = palloc(CALCDATASIZE(0, 0));
-               SET_VARSIZE(state.result, CALCDATASIZE(0, 0));
-               state.result->size = 0;
+               result = palloc(CALCDATASIZE(0, 0));
+               SET_VARSIZE(result, CALCDATASIZE(0, 0));
+               result->size = 0;
         }
  
-       PG_RETURN_TSVECTOR(state.result);
+       PG_FREE_IF_COPY(jb, 1);
+
+       PG_RETURN_TSVECTOR(result);
  }
  
  Datum
@@ -317,33 +316,33 @@ json_to_tsvector_byid(PG_FUNCTION_ARGS)
  {
         Oid                     cfgId = PG_GETARG_OID(0);
         text       *json = PG_GETARG_TEXT_P(1);
+       TSVector        result;
         TSVectorBuildState state;
-       ParsedText *prs = (ParsedText *) palloc(sizeof(ParsedText));
+       ParsedText      prs;
  
-       prs->words = NULL;
-       state.result = NULL;
+       prs.words = NULL;
+       prs.curwords = 0;
+       state.prs = &prs;
         state.cfgId = cfgId;
-       state.prs = prs;
  
-       iterate_json_string_values(json, &state, (JsonIterateStringValuesAction) add_to_tsvector);
+       iterate_json_string_values(json, &state, add_to_tsvector);
  
-       PG_FREE_IF_COPY(json, 1);
-       if (state.result == NULL)
+       if (prs.curwords > 0)
+               result = make_tsvector(&prs);
+       else
         {
                 /*
-                * There weren't any string elements in json, so wee need to return an
+                * There weren't any string elements in json, so we need to return an
                  * empty vector
                  */
-
-               if (prs->words != NULL)
-                       pfree(prs->words);
-
-               state.result = palloc(CALCDATASIZE(0, 0));
-               SET_VARSIZE(state.result, CALCDATASIZE(0, 0));
-               state.result->size = 0;
+               result = palloc(CALCDATASIZE(0, 0));
+               SET_VARSIZE(result, CALCDATASIZE(0, 0));
+               result->size = 0;
         }
  
-       PG_RETURN_TSVECTOR(state.result);
+       PG_FREE_IF_COPY(json, 1);
+
+       PG_RETURN_TSVECTOR(result);
  }
  
  Datum
@@ -359,45 +358,42 @@ json_to_tsvector(PG_FUNCTION_ARGS)
  }
  
  /*
- * Extend current TSVector from _state with a new one,
- * build over a json(b) element.
+ * Parse lexemes in an element of a json(b) value, add to TSVectorBuildState.
   */
  static void
  add_to_tsvector(void *_state, char *elem_value, int elem_len)
  {
         TSVectorBuildState *state = (TSVectorBuildState *) _state;
         ParsedText *prs = state->prs;
-       TSVector        item_vector;
-       int                     i;
+       int32           prevwords;
  
-       prs->lenwords = elem_len / 6;
-       if (prs->lenwords == 0)
-               prs->lenwords = 2;
+       if (prs->words == NULL)
+       {
+               /*
+                * First time through: initialize words array to a reasonable size.
+                * (parsetext() will realloc it bigger as needed.)
+                */
+               prs->lenwords = Max(elem_len / 6, 64);
+               prs->words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs->lenwords);
+               prs->curwords = 0;
+               prs->pos = 0;
+       }
  
-       prs->words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs->lenwords);
-       prs->curwords = 0;
-       prs->pos = 0;
+       prevwords = prs->curwords;
  
         parsetext(state->cfgId, prs, elem_value, elem_len);
  
-       if (prs->curwords)
-       {
-               if (state->result != NULL)
-               {
-                       for (i = 0; i < prs->curwords; i++)
-                               prs->words[i].pos.pos = prs->words[i].pos.pos + TS_JUMP;
-
-                       item_vector = make_tsvector(prs);
-
-                       state->result = (TSVector) DirectFunctionCall2(tsvector_concat,
-                                                                                                                  TSVectorGetDatum(state->result),
-                                                                                                                  PointerGetDatum(item_vector));
-               }
-               else
-                       state->result = make_tsvector(prs);
-       }
+       /*
+        * If we extracted any words from this JSON element, advance pos to create
+        * an artificial break between elements.  This is because we don't want
+        * phrase searches to think that the last word in this element is adjacent
+        * to the first word in the next one.
+        */
+       if (prs->curwords > prevwords)
+               prs->pos += 1;
  }
  
+
  /*
   * to_tsquery
   */
diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h

index 2885bc01532e3398986c1158449fe0a65aa13500..30d7c4bccdb03997888373342ed9e8b3800173f0 100644 (file)
--- a/src/include/tsearch/ts_type.h
+++ b/src/include/tsearch/ts_type.h
@@ -86,15 +86,6 @@ typedef struct
  #define MAXNUMPOS      (256)
  #define LIMITPOS(x) ( ( (x) >= MAXENTRYPOS ) ? (MAXENTRYPOS-1) : (x) )
  
-/*
- * In case if a TSVector contains several parts and we want to treat them as
- * separate, it's necessary to add an artificial increment to position of each
- * lexeme from every next part. It's required to avoid the situation when
- * tsquery can find a phrase consisting of lexemes from two of such parts.
- * TS_JUMP defined a value of this increment.
- */
-#define TS_JUMP 1
-
  /* This struct represents a complete tsvector datum */
  typedef struct
  {
author	Tom Lane <tgl@sss.pgh.pa.us>
	Tue, 18 Jul 2017 16:45:51 +0000 (12:45 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Tue, 18 Jul 2017 16:45:51 +0000 (12:45 -0400)
src/backend/tsearch/to_tsany.c		patch | blob | history
src/include/tsearch/ts_type.h		patch | blob | history