1 Eliminate duplicate field HLWORD->skip

author Teodor Sigaev <teodor@sigaev.ru>

Mon, 28 Jun 2004 16:19:09 +0000 (16:19 +0000)

committer Teodor Sigaev <teodor@sigaev.ru>

Mon, 28 Jun 2004 16:19:09 +0000 (16:19 +0000)
author Teodor Sigaev <teodor@sigaev.ru>
Mon, 28 Jun 2004 16:19:09 +0000 (16:19 +0000)
committer Teodor Sigaev <teodor@sigaev.ru>
Mon, 28 Jun 2004 16:19:09 +0000 (16:19 +0000)
diff --git a/contrib/tsearch2/expected/tsearch2.out b/contrib/tsearch2/expected/tsearch2.out

index fb836c087a1b2dad2ffdde7ae1a9ea0e71c229f7..93fc11dad14939752dc8abd00d74d1023c856fe3 100644 (file)
--- a/contrib/tsearch2/expected/tsearch2.out
+++ b/contrib/tsearch2/expected/tsearch2.out
@@ -458,20 +458,20 @@ select * from parse('default', '345 qwe@efd.r \' http://www.com/ http://aew.werc
      12 |  
       1 | asdf
      12 |  
-    13 |  
+    13 | <fr>
       1 | qwer
      12 |  
       1 | jf
      12 |  
       1 | sdjk
-    13 |  
+    13 | <we hjwer <werrwe>
      12 |  
       3 | ewr1
      12 | >
      12 |  
       3 | ewri2
      12 |  
-    13 |  
+    13 | <a href="qwe<qwe>">
      12 | 
  
      19 | /usr/local/fff
@@ -515,7 +515,7 @@ select * from parse('default', '345 qwe@efd.r \' http://www.com/ http://aew.werc
      22 | 234
      12 |  
  
-    13 |  
+    13 | <i <b>
      12 |  
       1 | wow
      12 |   
@@ -2130,6 +2130,35 @@ A thousand years to trace
  The granite features of this cliff
  (1 row)
  
+select headline('
+<html>
+<!-- some comment -->
+<body>
+Sea view wow <u>foo bar</u> <i>qq</i>
+<a href="http://www.google.com/foo.bar.html" target="_blank">YES &nbsp;</a>
+ff-bg
+<script>
+       document.write(15);
+</script>
+</body>
+</html>', 
+to_tsquery('sea&foo'), 'HighlightAll=true');
+                                                                                                              headline                                                                                                               
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ 
+<html>
+<!-- some comment -->
+<body>
+<b>Sea</b> view wow <u><b>foo</b> bar</u> <i>qq</i>
+<a href="http://www.google.com/foo.bar.html" target="_blank">YES &nbsp;</a>
+ ff-bg
+<script>
+       document.write(15);
+</script>
+</body>
+</html>
+(1 row)
+
  --check debug
  select * from ts_debug('Tsearch module for PostgreSQL 7.3.3');
   ts_name | tok_type | description |   token    | dict_name |   tsvector   
diff --git a/contrib/tsearch2/sql/tsearch2.sql b/contrib/tsearch2/sql/tsearch2.sql

index 231ddaebe5e957c1befbd94707952c2085ea9d44..0a980608f7c2631e6d7109bb258d29fdc29d2bdf 100644 (file)
--- a/contrib/tsearch2/sql/tsearch2.sql
+++ b/contrib/tsearch2/sql/tsearch2.sql
@@ -253,6 +253,20 @@ The sculpture of these granite seams,
  Upon a woman s face. E.  J.  Pratt  (1882 1964)
  ', to_tsquery('sea'));
  
+
+select headline('
+<html>
+<!-- some comment -->
+<body>
+Sea view wow <u>foo bar</u> <i>qq</i>
+<a href="http://www.google.com/foo.bar.html" target="_blank">YES &nbsp;</a>
+ff-bg
+<script>
+       document.write(15);
+</script>
+</body>
+</html>', 
+to_tsquery('sea&foo'), 'HighlightAll=true');
  --check debug
  select * from ts_debug('Tsearch module for PostgreSQL 7.3.3');
  
diff --git a/contrib/tsearch2/ts_cfg.c b/contrib/tsearch2/ts_cfg.c

index efd79a1e32ffa8755a26f2aa22a81f969ba9b612..4e0a0bb90436e1728148ee6db7c5b433bd66bbb0 100644 (file)
--- a/contrib/tsearch2/ts_cfg.c
+++ b/contrib/tsearch2/ts_cfg.c
@@ -510,7 +510,7 @@ genhl(HLPRSTEXT * prs)
                         ptr = ((char *) out) + dist;
                 }
  
-               if (wrd->in && !wrd->skip && !wrd->repeated)
+               if (wrd->in && !wrd->repeated)
                 {
                         if (wrd->replace)
                         {
@@ -532,7 +532,7 @@ genhl(HLPRSTEXT * prs)
                                         ptr += prs->stopsellen;
                                 }
                         }
-               }
+               } else
  
                 if (!wrd->repeated)
                         pfree(wrd->word);
diff --git a/contrib/tsearch2/ts_cfg.h b/contrib/tsearch2/ts_cfg.h

index 9bf65144b208591b167b44869a6a5e1463ab0b2d..e000233178d1e556029ba7f8a46e336644765124 100644 (file)
--- a/contrib/tsearch2/ts_cfg.h
+++ b/contrib/tsearch2/ts_cfg.h
@@ -46,13 +46,13 @@ typedef struct
  
  typedef struct
  {
-       uint16          len;
-       uint8           selected:1,
+       uint32          selected:1,
                                 in:1,
-                               skip:1,
                                 replace:1,
-                               repeated:1;
-       uint8           type;
+                               repeated:1,
+                               unused:4,
+                               type:8,
+                               len:16;
         char       *word;
         ITEM       *item;
  }      HLWORD;
diff --git a/contrib/tsearch2/wordparser/parser.l b/contrib/tsearch2/wordparser/parser.l

index e80f5fea903061699972dcb56dd8fe8ff07afdb7..8c46edf7b8b25d660980ac17fd3839efa4c1b204 100644 (file)
--- a/contrib/tsearch2/wordparser/parser.l
+++ b/contrib/tsearch2/wordparser/parser.l
@@ -10,10 +10,48 @@
  
  char *token = NULL;  /* pointer to token */
  int tokenlen;
-char *s     = NULL;  /* to return WHOLE hyphenated-word */
+static char *s     = NULL;  /* to return WHOLE hyphenated-word */
  
  YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */
  
+typedef struct {
+       int tlen; /* allocated size of str in bytes (the value passed to malloc/realloc) */
+       int clen; /* bytes of tag text currently stored in str, not counting the trailing NUL */
+       char *str; /* malloc'ed, NUL-terminated buffer accumulating the text of the current tag */
+} TagStorage;
+
+static TagStorage ts={0,0,NULL};
+
+static void
+addTag() { /* append the current lexer match (tsearch2_yytext, tsearch2_yyleng bytes) to ts.str */
+       while( ts.clen+tsearch2_yyleng+1 > ts.tlen ) { /* grow until match plus NUL fits */
+               char *grown=realloc(ts.str,ts.tlen*2); /* temporary: a failed realloc must not clobber ts.str */
+               if (!grown) /* ts.str and ts.tlen are still valid here, so nothing leaks */
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_OUT_OF_MEMORY),
+                                        errmsg("out of memory")));
+               ts.str=grown; ts.tlen*=2; /* commit pointer and size together, only after success */
+        }
+        memcpy(ts.str+ts.clen,tsearch2_yytext,tsearch2_yyleng);
+        ts.clen+=tsearch2_yyleng;
+       ts.str[ts.clen]='\0'; /* keep the accumulated tag text NUL-terminated */
+}
+
+static void
+startTag() { /* begin a new tag: empty the buffer, then store the current lexer match in it */
+       if ( ts.str==NULL ) { /* no buffer yet: allocate room for this match plus NUL */
+               ts.tlen=tsearch2_yyleng+1;
+               ts.str=malloc(ts.tlen);
+               if (!ts.str)
+                       ereport(ERROR,
+                                (errcode(ERRCODE_OUT_OF_MEMORY),
+                                 errmsg("out of memory")));
+       }
+       ts.clen=0; /* drop any previously stored tag text; the existing allocation is reused */
+       ts.str[0]='\0';
+       addTag(); /* copies tsearch2_yytext into the buffer, growing it when needed */
+}
+
  %}
  
  %option 8bit
@@ -46,47 +84,46 @@ URI         [-_[:alnum:]/%,\.;=&?#]+
  
  %%
  
-"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; }
+"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; startTag(); }
  
  <INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
         BEGIN INITIAL; 
-       *tsearch2_yytext=' '; *(tsearch2_yytext+1) = '\0'; 
-       token = tsearch2_yytext;
-       tokenlen = tsearch2_yyleng;
-       return SPACE;
+       addTag();
+       token = ts.str;
+       tokenlen = ts.clen;
+       return TAG;
  }
  
-"<!--" { BEGIN INCOMMENT; }
+"<!--" { BEGIN INCOMMENT; startTag(); }
  
  <INCOMMENT>"-->"       { 
         BEGIN INITIAL;
-       *tsearch2_yytext=' '; *(tsearch2_yytext+1) = '\0'; 
-       token = tsearch2_yytext;
-       tokenlen = tsearch2_yyleng;
-       return SPACE;
+       addTag();
+       token = ts.str;
+       tokenlen = ts.clen;
+       return TAG;
  }
  
  
-"<"[\![:alpha:]]       { BEGIN INTAG; }
+"<"[\![:alpha:]]       { BEGIN INTAG; startTag(); }
  
-"</"[[:alpha:]]        { BEGIN INTAG; }
+"</"[[:alpha:]]        { BEGIN INTAG; startTag(); }
  
-<INTAG>"\""    { BEGIN QINTAG; }
+<INTAG>"\""    { BEGIN QINTAG; addTag(); }
  
-<QINTAG>"\\\"" ;
+<QINTAG>"\\\"" { addTag(); }
  
-<QINTAG>"\""   { BEGIN INTAG; }
+<QINTAG>"\""   { BEGIN INTAG; addTag(); }
  
  <INTAG>">"     { 
         BEGIN INITIAL;
-       token = tsearch2_yytext;
-       *tsearch2_yytext=' '; 
-       token = tsearch2_yytext;
-       tokenlen = 1;
+       addTag();
+       token = ts.str;
+       tokenlen = ts.clen;
         return TAG;
  }
  
-<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n  ;
+<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n { addTag(); }    
  
  \&(quot|amp|nbsp|lt|gt)\;   {
         token = tsearch2_yytext;
@@ -295,3 +332,4 @@ void tsearch2_start_parse_str(char* str, int limit) {
         tsearch2_yy_switch_to_buffer( buf );
         BEGIN INITIAL;
  }
+
diff --git a/contrib/tsearch2/wparser_def.c b/contrib/tsearch2/wparser_def.c

index a3d61126282c0f07998693d7c490879e2b70d275..035e5f2495d62ab6ff47fa0275092d096c91fed1 100644 (file)
--- a/contrib/tsearch2/wparser_def.c
+++ b/contrib/tsearch2/wparser_def.c
@@ -78,6 +78,7 @@ prsd_end(PG_FUNCTION_ARGS)
  
  #define IDIGNORE(x) ( (x)==13 || (x)==14 || (x)==12 || (x)==23 )
  #define HLIDIGNORE(x) ( (x)==5 || (x)==13 || (x)==15 || (x)==16 || (x)==17 )
+#define HTMLHLIDIGNORE(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 )
  #define NONWORDTOKEN(x) ( (x)==12 || HLIDIGNORE(x) )
  #define NOENDTOKEN(x)  ( NONWORDTOKEN(x) || (x)==7 || (x)==8 || (x)==20 || (x)==21 || (x)==22 || IDIGNORE(x) )
  
@@ -196,6 +197,7 @@ prsd_headline(PG_FUNCTION_ARGS)
                                 curlen;
  
         int                     i;
+       int                     highlight=0;
  
         /* config */
         prs->startsel = NULL;
@@ -220,6 +222,15 @@ prsd_headline(PG_FUNCTION_ARGS)
                                 prs->startsel = pstrdup(mptr->value);
                         else if (pg_strcasecmp(mptr->key, "StopSel") == 0)
                                 prs->stopsel = pstrdup(mptr->value);
+                       else if (pg_strcasecmp(mptr->key, "HighlightAll") == 0)
+                               highlight = (
+                                       pg_strcasecmp(mptr->value, "1")==0 || 
+                                       pg_strcasecmp(mptr->value, "on")==0 || 
+                                       pg_strcasecmp(mptr->value, "true")==0 || 
+                                       pg_strcasecmp(mptr->value, "t")==0 || 
+                                       pg_strcasecmp(mptr->value, "y")==0 || 
+                                       pg_strcasecmp(mptr->value, "yes")==0 ) ?
+                               1 : 0;
  
                         pfree(mptr->key);
                         pfree(mptr->value);
@@ -228,124 +239,133 @@ prsd_headline(PG_FUNCTION_ARGS)
                 }
                 pfree(map);
  
-               if (min_words >= max_words)
-                       ereport(ERROR,
+               if (highlight==0) {
+                       if (min_words >= max_words)
+                               ereport(ERROR,
                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                          errmsg("MinWords should be less than MaxWords")));
-               if (min_words <= 0)
-                       ereport(ERROR,
+                       if (min_words <= 0)
+                               ereport(ERROR,
                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                          errmsg("MinWords should be positive")));
-               if (shortword < 0)
-                       ereport(ERROR,
+                       if (shortword < 0)
+                               ereport(ERROR,
                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                          errmsg("ShortWord should be >= 0")));
-       }
-
-       while (hlCover(prs, query, &p, &q))
-       {
-               /* find cover len in words */
-               curlen = 0;
-               poslen = 0;
-               for (i = p; i <= q && curlen < max_words; i++)
-               {
-                       if (!NONWORDTOKEN(prs->words[i].type))
-                               curlen++;
-                       if (prs->words[i].item && !prs->words[i].repeated)
-                               poslen++;
-                       pose = i;
                 }
+       }
  
-               if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
+       if (highlight==0) {
+               while (hlCover(prs, query, &p, &q))
                 {
-                       /* best already finded, so try one more cover */
-                       p++;
-                       continue;
-               }
-
-               posb=p;
-               if (curlen < max_words)
-               {                                               /* find good end */
-                       for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
+                       /* find cover len in words */
+                       curlen = 0;
+                       poslen = 0;
+                       for (i = p; i <= q && curlen < max_words; i++)
                         {
-                               if (i != q)
+                               if (!NONWORDTOKEN(prs->words[i].type))
+                                       curlen++;
+                               if (prs->words[i].item && !prs->words[i].repeated)
+                                       poslen++;
+                               pose = i;
+                       }
+       
+                       if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
+                       {
+                               /* best already found, so try one more cover */
+                               p++;
+                               continue;
+                       }
+       
+                       posb=p;
+                       if (curlen < max_words)
+                       {                                               /* find good end */
+                               for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
                                 {
-                                       if (!NONWORDTOKEN(prs->words[i].type))
-                                               curlen++;
-                                       if (prs->words[i].item && !prs->words[i].repeated)
-                                               poslen++;
+                                       if (i != q)
+                                       {
+                                               if (!NONWORDTOKEN(prs->words[i].type))
+                                                       curlen++;
+                                               if (prs->words[i].item && !prs->words[i].repeated)
+                                                       poslen++;
+                                       }
+                                       pose = i;
+                                       if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
+                                               continue;
+                                       if (curlen >= min_words)
+                                               break;
+                               }
+                               if ( curlen < min_words && i>=prs->curwords ) { /* got end of text and our cover is shoter than min_words */
+                                       for(i=p; i>= 0; i--) {
+                                               if (!NONWORDTOKEN(prs->words[i].type))
+                                                       curlen++;
+                                               if (prs->words[i].item && !prs->words[i].repeated)
+                                                       poslen++;
+                                               if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
+                                                       continue;
+                                               if (curlen >= min_words)
+                                                       break;
+                                       }
+                                       posb=(i>=0) ? i : 0;
                                 }
-                               pose = i;
-                               if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
-                                       continue;
-                               if (curlen >= min_words)
-                                       break;
                         }
-                       if ( curlen < min_words && i>=prs->curwords ) { /* got end of text and our cover is shoter than min_words */
-                               for(i=p; i>= 0; i--) {
+                       else
+                       {                                               /* shorter cover :((( */
+                               for (; curlen > min_words; i--)
+                               {
                                         if (!NONWORDTOKEN(prs->words[i].type))
-                                               curlen++;
+                                               curlen--;
                                         if (prs->words[i].item && !prs->words[i].repeated)
-                                               poslen++;
+                                               poslen--;
+                                       pose = i;
                                         if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
                                                 continue;
-                                       if (curlen >= min_words)
-                                               break;
+                                       break;
                                 }
-                               posb=(i>=0) ? i : 0;
                         }
-               }
-               else
-               {                                               /* shorter cover :((( */
-                       for (; curlen > min_words; i--)
+       
+                       if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
+                               (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
+                                (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
                         {
-                               if (!NONWORDTOKEN(prs->words[i].type))
-                                       curlen--;
-                               if (prs->words[i].item && !prs->words[i].repeated)
-                                       poslen--;
-                               pose = i;
-                               if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
-                                       continue;
-                               break;
+                               bestb = posb;
+                               beste = pose;
+                               bestlen = poslen;
                         }
+       
+                       p++;
                 }
  
-               if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
-                       (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
-                        (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
+               if (bestlen < 0)
                 {
-                       bestb = posb;
+                       curlen = 0;
+                       for (i = 0; i < prs->curwords && curlen < min_words; i++)
+                       {
+                               if (!NONWORDTOKEN(prs->words[i].type))
+                                       curlen++;
+                               pose = i;
+                       }
+                       bestb = 0;
                         beste = pose;
-                       bestlen = poslen;
                 }
-
-               p++;
-       }
-
-       if (bestlen < 0)
-       {
-               curlen = 0;
-               poslen = 0;
-               for (i = 0; i < prs->curwords && curlen < min_words; i++)
-               {
-                       if (!NONWORDTOKEN(prs->words[i].type))
-                               curlen++;
-                       pose = i;
-               }
-               bestb = 0;
-               beste = pose;
+       } else {
+               bestb=0;
+               beste=prs->curwords-1;
         }
  
         for (i = bestb; i <= beste; i++)
         {
                 if (prs->words[i].item)
                         prs->words[i].selected = 1;
-               if (prs->words[i].repeated)
-                       prs->words[i].skip = 1;
-               if (HLIDIGNORE(prs->words[i].type))
-                       prs->words[i].replace = 1;
+               if ( highlight==0 ) { 
+                       if (HLIDIGNORE(prs->words[i].type))
+                               prs->words[i].replace = 1;
+               } else {
+                       if (HTMLHLIDIGNORE(prs->words[i].type))
+                               prs->words[i].replace = 1;
+               }
  
-               prs->words[i].in = 1;
+               prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
         }
  
         if (!prs->startsel)
author	Teodor Sigaev <teodor@sigaev.ru>
	Mon, 28 Jun 2004 16:19:09 +0000 (16:19 +0000)
committer	Teodor Sigaev <teodor@sigaev.ru>
	Mon, 28 Jun 2004 16:19:09 +0000 (16:19 +0000)
contrib/tsearch2/expected/tsearch2.out		patch \| blob \| history
contrib/tsearch2/sql/tsearch2.sql		patch \| blob \| history
contrib/tsearch2/ts_cfg.c		patch \| blob \| history
contrib/tsearch2/ts_cfg.h		patch \| blob \| history
contrib/tsearch2/wordparser/parser.l		patch \| blob \| history
contrib/tsearch2/wparser_def.c		patch \| blob \| history