From b87b52bf0469e5629f3119f93e0849fa70245602 Mon Sep 17 00:00:00 2001
From: Teodor Sigaev <teodor@sigaev.ru>
Date: Wed, 12 Nov 2008 13:43:54 +0000
Subject: [PATCH] Support of multibyte encoding for pg_trgm

---
 contrib/pg_trgm/trgm.h     |  11 +-
 contrib/pg_trgm/trgm_gin.c |   4 +-
 contrib/pg_trgm/trgm_op.c  | 234 ++++++++++++++++++++++++-------------
 3 files changed, 161 insertions(+), 88 deletions(-)

diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h
index e2f60fcf7b..18d6751687 100644
--- a/contrib/pg_trgm/trgm.h
+++ b/contrib/pg_trgm/trgm.h
@@ -1,5 +1,5 @@
 /*
- * $PostgreSQL: pgsql/contrib/pg_trgm/trgm.h,v 1.9 2008/05/17 01:28:21 adunstan Exp $ 
+ * $PostgreSQL: pgsql/contrib/pg_trgm/trgm.h,v 1.10 2008/11/12 13:43:54 teodor Exp $ 
  */
 #ifndef __TRGM_H__
 #define __TRGM_H__
@@ -31,7 +31,14 @@ typedef char trgm[3];
 	*(((char*)(a))+2) = *(((char*)(b))+2);	\
 } while(0);
 
-#define TRGMINT(a) ( (*(((char*)(a))+2)<<16)+(*(((char*)(a))+1)<<8)+*(((char*)(a))+0) )
+uint32 trgm2int(trgm *ptr);
+
+#ifdef KEEPONLYALNUM
+#define ISPRINTABLECHAR(a)	( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') )
+#else
+#define ISPRINTABLECHAR(a)	( isascii( *(unsigned char*)(a) ) && isprint( *(unsigned char*)(a) ) )
+#endif
+#define ISPRINTABLETRGM(t)	( ISPRINTABLECHAR( ((char*)t) ) && ISPRINTABLECHAR( ((char*)t)+1 ) && ISPRINTABLECHAR( ((char*)t)+2 ) )
 
 typedef struct
 {
diff --git a/contrib/pg_trgm/trgm_gin.c b/contrib/pg_trgm/trgm_gin.c
index 7f8bda7207..7a64764fb8 100644
--- a/contrib/pg_trgm/trgm_gin.c
+++ b/contrib/pg_trgm/trgm_gin.c
@@ -1,5 +1,5 @@
 /*
- * $PostgreSQL: pgsql/contrib/pg_trgm/trgm_gin.c,v 1.5 2008/07/11 11:56:48 teodor Exp $ 
+ * $PostgreSQL: pgsql/contrib/pg_trgm/trgm_gin.c,v 1.6 2008/11/12 13:43:54 teodor Exp $ 
  */
 #include "trgm.h"
 
@@ -42,7 +42,7 @@ gin_extract_trgm(PG_FUNCTION_ARGS)
 		ptr = GETARR(trg);
 		while (ptr - GETARR(trg) < ARRNELEM(trg))
 		{
-			item = TRGMINT(ptr);
+			item = trgm2int(ptr);
 			entries[i++] = Int32GetDatum(item);
 
 			ptr++;
diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c
index ae72ae617f..3d2df22111 100644
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@@ -1,10 +1,11 @@
 /*
- * $PostgreSQL: pgsql/contrib/pg_trgm/trgm_op.c,v 1.10 2008/05/17 01:28:21 adunstan Exp $ 
+ * $PostgreSQL: pgsql/contrib/pg_trgm/trgm_op.c,v 1.11 2008/11/12 13:43:54 teodor Exp $ 
  */
 #include "trgm.h"
 #include <ctype.h>
 #include "utils/array.h"
 #include "catalog/pg_type.h"
+#include "tsearch/ts_locale.h"
 
 PG_MODULE_MAGIC;
 
@@ -31,9 +32,6 @@ show_limit(PG_FUNCTION_ARGS)
 	PG_RETURN_FLOAT4(trgm_limit);
 }
 
-#define WORDWAIT		0
-#define INWORD			1
-
 static int
 comp_trgm(const void *a, const void *b)
 {
@@ -60,18 +58,119 @@ unique_array(trgm * a, int len)
 	return curend + 1 - a;
 }
 
+#ifdef KEEPONLYALNUM
+#define	iswordchr(c)	(t_isalpha(c) || t_isdigit(c))
+#else
+#define iswordchr(c)	(!t_isspace(c))
+#endif
+
+/*
+ * Finds first word in string, returns pointer to the word,
+ * endword points to the character after word
+ */
+static char*
+find_word(char *str, int lenstr, char **endword, int *charlen) 
+{
+	char *beginword = str;
+
+	while( beginword - str < lenstr && !iswordchr(beginword) )
+		beginword += pg_mblen(beginword);
+
+	if (beginword - str >= lenstr)
+		return NULL;
+
+	*endword = beginword;
+	*charlen = 0;
+	while( *endword - str < lenstr && iswordchr(*endword) ) 
+	{
+		*endword += pg_mblen(*endword);
+		(*charlen)++;
+	}
+
+	return beginword;
+}
+
+#ifdef USE_WIDE_UPPER_LOWER
+static void
+cnt_trigram(trgm *tptr, char *str, int bytelen) 
+{
+	if ( bytelen == 3 ) 
+	{
+		CPTRGM(tptr, str);		
+	}
+	else
+	{
+		pg_crc32	crc;
+
+		INIT_CRC32(crc);
+		COMP_CRC32(crc, str, bytelen);
+		FIN_CRC32(crc);
+
+		/*
+		 * use only 3 upper bytes from crc, hope, it's
+		 * good enough hashing
+		 */
+		CPTRGM(tptr, &crc);
+	}
+}
+#endif
+
+/*
+ * Adds trigramm from words (already padded).
+ */
+static trgm*
+make_trigrams( trgm *tptr, char *str, int bytelen, int charlen )
+{
+	char	*ptr = str;
+
+	if ( charlen < 3 )
+		return tptr;
+
+#ifdef USE_WIDE_UPPER_LOWER
+	if (pg_database_encoding_max_length() > 1)
+	{
+		int lenfirst 	= pg_mblen(str),
+			lenmiddle 	= pg_mblen(str + lenfirst),
+			lenlast		= pg_mblen(str + lenfirst + lenmiddle);
+
+		while( (ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen ) 
+		{
+			cnt_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
+
+			ptr += lenfirst;
+			tptr++;
+
+			lenfirst 	= lenmiddle;
+			lenmiddle 	= lenlast;
+			lenlast 	= pg_mblen(ptr + lenfirst + lenmiddle);
+		}
+	}
+	else
+#endif
+	{
+		Assert( bytelen == charlen );
+
+		while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
+		{
+			CPTRGM(tptr, ptr);
+			ptr++;
+			tptr++;
+		}
+	}
+	
+	return tptr;
+}
 
 TRGM *
 generate_trgm(char *str, int slen)
 {
 	TRGM	   *trg;
-	char	   *buf,
-			   *sptr,
-			   *bufptr;
+	char	   *buf;
 	trgm	   *tptr;
-	int			state = WORDWAIT;
-	int			wl,
-				len;
+	int			len,
+				charlen,
+				bytelen;
+	char		*bword, *eword;
 
 	trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
 	trg->flag = ARRKEY;
@@ -83,7 +182,6 @@ generate_trgm(char *str, int slen)
 	tptr = GETARR(trg);
 
 	buf = palloc(sizeof(char) * (slen + 4));
-	sptr = str;
 
 	if (LPADDING > 0)
 	{
@@ -92,82 +190,29 @@ generate_trgm(char *str, int slen)
 			*(buf + 1) = ' ';
 	}
 
-	bufptr = buf + LPADDING;
-	while (sptr - str < slen)
+	eword = str;
+	while( (bword=find_word(eword, slen - (eword-str), &eword, &charlen)) != NULL ) 
 	{
-		if (state == WORDWAIT)
-		{
-			if (
-#ifdef	KEEPONLYALNUM
-				isalnum((unsigned char) *sptr)
-#else
-				!isspace((unsigned char) *sptr)
-#endif
-				)
-			{
-				*bufptr = *sptr;	/* start put word in buffer */
-				bufptr++;
-				state = INWORD;
-				if (sptr - str == slen - 1 /* last char */ )
-					goto gettrg;
-			}
-		}
-		else
-		{
-			if (
-#ifdef	KEEPONLYALNUM
-				!isalnum((unsigned char) *sptr)
+#ifdef IGNORECASE
+		bword = lowerstr_with_len(bword, eword - bword);
+		bytelen = strlen(bword);
 #else
-				isspace((unsigned char) *sptr)
+		bytelen = eword - bword;
 #endif
-				)
-			{
-		gettrg:
-				/* word in buffer, so count trigrams */
-				*bufptr = ' ';
-				*(bufptr + 1) = ' ';
-				wl = bufptr - (buf + LPADDING) - 2 + LPADDING + RPADDING;
-				if (wl <= 0)
-				{
-					bufptr = buf + LPADDING;
-					state = WORDWAIT;
-					sptr++;
-					continue;
-				}
+
+		memcpy(buf + LPADDING, bword, bytelen);
 
 #ifdef IGNORECASE
-				do
-				{				/* lower word */
-					int			wwl = bufptr - buf;
-
-					bufptr = buf + LPADDING;
-					while (bufptr - buf < wwl)
-					{
-						*bufptr = tolower((unsigned char) *bufptr);
-						bufptr++;
-					}
-				} while (0);
+		pfree(bword);
 #endif
-				bufptr = buf;
-				/* set trigrams */
-				while (bufptr - buf < wl)
-				{
-					CPTRGM(tptr, bufptr);
-					bufptr++;
-					tptr++;
-				}
-				bufptr = buf + LPADDING;
-				state = WORDWAIT;
-			}
-			else
-			{
-				*bufptr = *sptr;	/* put in buffer */
-				bufptr++;
-				if (sptr - str == slen - 1)
-					goto gettrg;
-			}
-		}
-		sptr++;
+		buf[LPADDING+bytelen] = ' ';
+		buf[LPADDING+bytelen+1] = ' ';
+
+		/*
+		 * count trigrams
+		 */
+		tptr = make_trigrams( tptr, buf, bytelen + LPADDING + RPADDING, 
+										 charlen + LPADDING + RPADDING );
 	}
 
 	pfree(buf);
@@ -186,6 +231,19 @@ generate_trgm(char *str, int slen)
 	return trg;
 }
 
+uint32
+trgm2int(trgm *ptr)
+{
+	uint32	val = 0;
+
+	val |= *( ((unsigned char*)ptr) );
+	val <<= 8;
+	val |= *( ((unsigned char*)ptr) + 1 );
+	val <<= 8;
+	val |= *( ((unsigned char*)ptr) + 2 );
+
+	return val;
+}
 
 PG_FUNCTION_INFO_V1(show_trgm);
 Datum		show_trgm(PG_FUNCTION_ARGS);
@@ -204,10 +262,18 @@ show_trgm(PG_FUNCTION_ARGS)
 
 	for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
 	{
-		text	   *item = (text *) palloc(VARHDRSZ + 3);
+		text	   *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length()*3) );
 
-		SET_VARSIZE(item, VARHDRSZ + 3);
-		CPTRGM(VARDATA(item), ptr);
+		if ( pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr) )
+		{
+			snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));
+			SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
+		}
+		else
+		{
+			SET_VARSIZE(item, VARHDRSZ + 3);
+			CPTRGM(VARDATA(item), ptr);
+		}
 		d[i] = PointerGetDatum(item);
 	}
 
-- 
2.40.0