Add strict_word_similarity to pg_trgm module

strict_word_similarity is similar to existing word_similarity function but it takes into account word boundaries to compute similarity. Author: Alexander Korotkov Review by: David Steele, Liudmila Mantrova, me Discussion: https://www.postgresql.org/message-id/flat/CY4PR17MB13207ED8310F847CF117EED0D85A0@CY4PR17MB1320.namprd17.prod.outlook.com
2025-01-24 18:55:04 +08:00 · 2018-03-21 14:57:42 +03:00 · 2018-03-21 14:57:42 +03:00 · be8a7a6866
commit be8a7a6866
parent f20b328534
10 changed files with 1461 additions and 61 deletions
--- a/contrib/pg_trgm/Makefile
+++ b/contrib/pg_trgm/Makefile
@ -4,11 +4,12 @@ MODULE_big = pg_trgm
 OBJS = trgm_op.o trgm_gist.o trgm_gin.o trgm_regexp.o $(WIN32RES)

 EXTENSION = pg_trgm
-DATA = pg_trgm--1.3.sql pg_trgm--1.2--1.3.sql pg_trgm--1.1--1.2.sql \
+DATA = pg_trgm--1.3--1.4.sql \
+	pg_trgm--1.3.sql pg_trgm--1.2--1.3.sql pg_trgm--1.1--1.2.sql \
 	pg_trgm--1.0--1.1.sql pg_trgm--unpackaged--1.0.sql
 PGFILEDESC = "pg_trgm - trigram matching"

-REGRESS = pg_trgm pg_word_trgm
+REGRESS = pg_trgm pg_word_trgm pg_strict_word_trgm

 ifdef USE_PGXS
 PG_CONFIG = pg_config
--- a/contrib/pg_trgm/expected/pg_strict_word_trgm.out
+++ b/contrib/pg_trgm/expected/pg_strict_word_trgm.out
--- a/contrib/pg_trgm/pg_trgm--1.3--1.4.sql
+++ b/contrib/pg_trgm/pg_trgm--1.3--1.4.sql
@ -0,0 +1,68 @@
+/* contrib/pg_trgm/pg_trgm--1.3--1.4.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.4'" to load this file. \quit
+
+CREATE FUNCTION strict_word_similarity(text,text)
+RETURNS float4
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
+
+CREATE FUNCTION strict_word_similarity_op(text,text)
+RETURNS bool
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE PARALLEL SAFE;  -- stable because depends on pg_trgm.strict_word_similarity_threshold
+
+CREATE FUNCTION strict_word_similarity_commutator_op(text,text)
+RETURNS bool
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE PARALLEL SAFE;  -- stable because depends on pg_trgm.strict_word_similarity_threshold
+
+CREATE OPERATOR <<% (
+        LEFTARG = text,
+        RIGHTARG = text,
+        PROCEDURE = strict_word_similarity_op,
+        COMMUTATOR = '%>>',
+        RESTRICT = contsel,
+        JOIN = contjoinsel
+);
+
+CREATE OPERATOR %>> (
+        LEFTARG = text,
+        RIGHTARG = text,
+        PROCEDURE = strict_word_similarity_commutator_op,
+        COMMUTATOR = '<<%',
+        RESTRICT = contsel,
+        JOIN = contjoinsel
+);
+
+CREATE FUNCTION strict_word_similarity_dist_op(text,text)
+RETURNS float4
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
+
+CREATE FUNCTION strict_word_similarity_dist_commutator_op(text,text)
+RETURNS float4
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
+
+CREATE OPERATOR <<<-> (
+        LEFTARG = text,
+        RIGHTARG = text,
+        PROCEDURE = strict_word_similarity_dist_op,
+        COMMUTATOR = '<->>>'
+);
+
+CREATE OPERATOR <->>> (
+        LEFTARG = text,
+        RIGHTARG = text,
+        PROCEDURE = strict_word_similarity_dist_commutator_op,
+        COMMUTATOR = '<<<->'
+);
+
+ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
+        OPERATOR        9       %>> (text, text),
+        OPERATOR        10       <->>> (text, text) FOR ORDER BY pg_catalog.float_ops;
+
+ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
+        OPERATOR        9       %>> (text, text);
--- a/contrib/pg_trgm/pg_trgm.control
+++ b/contrib/pg_trgm/pg_trgm.control
@ -1,5 +1,5 @@
 # pg_trgm extension
 comment = 'text similarity measurement and index searching based on trigrams'
-default_version = '1.3'
+default_version = '1.4'
 module_pathname = '$libdir/pg_trgm'
 relocatable = true
--- a/contrib/pg_trgm/sql/pg_strict_word_trgm.sql
+++ b/contrib/pg_trgm/sql/pg_strict_word_trgm.sql
@ -0,0 +1,42 @@
+DROP INDEX trgm_idx2;
+
+\copy test_trgm3 from 'data/trgm2.data'
+
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
+select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
+
+create index trgm_idx2 on test_trgm2 using gist (t gist_trgm_ops);
+set enable_seqscan=off;
+
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
+
+explain (costs off)
+select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
+select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
+
+drop index trgm_idx2;
+create index trgm_idx2 on test_trgm2 using gin (t gin_trgm_ops);
+set enable_seqscan=off;
+
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
+
+set "pg_trgm.strict_word_similarity_threshold" to 0.4;
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
+
+set "pg_trgm.strict_word_similarity_threshold" to 0.2;
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
--- a/contrib/pg_trgm/trgm.h
+++ b/contrib/pg_trgm/trgm.h
@ -6,6 +6,7 @@

 #include "access/gist.h"
 #include "access/itup.h"
+#include "access/stratnum.h"
 #include "storage/bufpage.h"

 /*
@ -26,14 +27,16 @@
 #define DIVUNION

 /* operator strategy numbers */
-#define SimilarityStrategyNumber		1
-#define DistanceStrategyNumber			2
-#define LikeStrategyNumber				3
-#define ILikeStrategyNumber				4
-#define RegExpStrategyNumber			5
-#define RegExpICaseStrategyNumber		6
-#define WordSimilarityStrategyNumber	7
-#define WordDistanceStrategyNumber		8
+#define SimilarityStrategyNumber			1
+#define DistanceStrategyNumber				2
+#define LikeStrategyNumber					3
+#define ILikeStrategyNumber					4
+#define RegExpStrategyNumber				5
+#define RegExpICaseStrategyNumber			6
+#define WordSimilarityStrategyNumber		7
+#define WordDistanceStrategyNumber			8
+#define StrictWordSimilarityStrategyNumber	9
+#define StrictWordDistanceStrategyNumber	10

 typedef char trgm[3];

@ -120,7 +123,9 @@ typedef struct TrgmPackedGraph TrgmPackedGraph;

 extern double similarity_threshold;
 extern double word_similarity_threshold;
+extern double strict_word_similarity_threshold;

+extern double index_strategy_get_limit(StrategyNumber strategy);
 extern uint32 trgm2int(trgm *ptr);
 extern void compact_trigram(trgm *tptr, char *str, int bytelen);
 extern TRGM *generate_trgm(char *str, int slen);
--- a/contrib/pg_trgm/trgm_gin.c
+++ b/contrib/pg_trgm/trgm_gin.c
@ -90,6 +90,7 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
 	{
 		case SimilarityStrategyNumber:
 		case WordSimilarityStrategyNumber:
+		case StrictWordSimilarityStrategyNumber:
 			trg = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val));
 			break;
 		case ILikeStrategyNumber:
@ -187,8 +188,8 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
 	{
 		case SimilarityStrategyNumber:
 		case WordSimilarityStrategyNumber:
-			nlimit = (strategy == SimilarityStrategyNumber) ?
-				similarity_threshold : word_similarity_threshold;
+		case StrictWordSimilarityStrategyNumber:
+			nlimit = index_strategy_get_limit(strategy);

 			/* Count the matches */
 			ntrue = 0;
@ -282,8 +283,8 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS)
 	{
 		case SimilarityStrategyNumber:
 		case WordSimilarityStrategyNumber:
-			nlimit = (strategy == SimilarityStrategyNumber) ?
-				similarity_threshold : word_similarity_threshold;
+		case StrictWordSimilarityStrategyNumber:
+			nlimit = index_strategy_get_limit(strategy);

 			/* Count the matches */
 			ntrue = 0;
--- a/contrib/pg_trgm/trgm_gist.c
+++ b/contrib/pg_trgm/trgm_gist.c
@ -221,6 +221,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
 		{
 			case SimilarityStrategyNumber:
 			case WordSimilarityStrategyNumber:
+			case StrictWordSimilarityStrategyNumber:
 				qtrg = generate_trgm(VARDATA(query),
 									 querysize - VARHDRSZ);
 				break;
@ -290,10 +291,11 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
 	{
 		case SimilarityStrategyNumber:
 		case WordSimilarityStrategyNumber:
-			/* Similarity search is exact. Word similarity search is inexact */
-			*recheck = (strategy == WordSimilarityStrategyNumber);
-			nlimit = (strategy == SimilarityStrategyNumber) ?
-				similarity_threshold : word_similarity_threshold;
+		case StrictWordSimilarityStrategyNumber:
+			/* Similarity search is exact. (Strict) word similarity search is inexact */
+			*recheck = (strategy != SimilarityStrategyNumber);
+
+			nlimit = index_strategy_get_limit(strategy);

 			if (GIST_LEAF(entry))
 			{					/* all leafs contains orig trgm */
@ -468,7 +470,9 @@ gtrgm_distance(PG_FUNCTION_ARGS)
 	{
 		case DistanceStrategyNumber:
 		case WordDistanceStrategyNumber:
-			*recheck = strategy == WordDistanceStrategyNumber;
+		case StrictWordDistanceStrategyNumber:
+			/* Only plain trigram distance is exact */
+			*recheck = (strategy != DistanceStrategyNumber);
 			if (GIST_LEAF(entry))
 			{					/* all leafs contains orig trgm */

--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@ -18,6 +18,7 @@ PG_MODULE_MAGIC;
 /* GUC variables */
 double		similarity_threshold = 0.3f;
 double		word_similarity_threshold = 0.6f;
+double		strict_word_similarity_threshold = 0.5f;

 void		_PG_init(void);

@ -26,12 +27,17 @@ PG_FUNCTION_INFO_V1(show_limit);
 PG_FUNCTION_INFO_V1(show_trgm);
 PG_FUNCTION_INFO_V1(similarity);
 PG_FUNCTION_INFO_V1(word_similarity);
+PG_FUNCTION_INFO_V1(strict_word_similarity);
 PG_FUNCTION_INFO_V1(similarity_dist);
 PG_FUNCTION_INFO_V1(similarity_op);
 PG_FUNCTION_INFO_V1(word_similarity_op);
 PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
 PG_FUNCTION_INFO_V1(word_similarity_dist_op);
 PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
+PG_FUNCTION_INFO_V1(strict_word_similarity_op);
+PG_FUNCTION_INFO_V1(strict_word_similarity_commutator_op);
+PG_FUNCTION_INFO_V1(strict_word_similarity_dist_op);
+PG_FUNCTION_INFO_V1(strict_word_similarity_dist_commutator_op);

 /* Trigram with position */
 typedef struct
@ -40,6 +46,17 @@ typedef struct
 	int			index;
 } pos_trgm;

+/* Trigram bound type */
+typedef uint8 TrgmBound;
+#define TRGM_BOUND_LEFT				(0x01) /* trigram is left bound of word */
+#define TRGM_BOUND_RIGHT			(0x02) /* trigram is right bound of word */
+
+/* Word similarity flags */
+#define WORD_SIMILARITY_CHECK_ONLY	(0x01) /* if set then only check existence
+											* of similar search pattern in text */
+#define WORD_SIMILARITY_STRICT		(0x02) /* force bounds of extent to match
+											* word bounds */
+
 /*
 * Module load callback
 */
@ -71,6 +88,18 @@ _PG_init(void)
 							 NULL,
 							 NULL,
 							 NULL);
+	DefineCustomRealVariable("pg_trgm.strict_word_similarity_threshold",
+							 "Sets the threshold used by the <<%% operator.",
+							 "Valid range is 0.0 .. 1.0.",
+							 &strict_word_similarity_threshold,
+							 0.5,
+							 0.0,
+							 1.0,
+							 PGC_USERSET,
+							 0,
+							 NULL,
+							 NULL,
+							 NULL);
 }

 /*
@ -95,6 +124,29 @@ set_limit(PG_FUNCTION_ARGS)
 	PG_RETURN_FLOAT4(similarity_threshold);
 }

+
+/*
+ * Get the threshold GUC for an index similarity strategy; ERROR if unknown.
+ */
+double
+index_strategy_get_limit(StrategyNumber strategy)
+{
+	switch (strategy)
+	{
+		case SimilarityStrategyNumber:
+			return similarity_threshold;
+		case WordSimilarityStrategyNumber:
+			return word_similarity_threshold;
+		case StrictWordSimilarityStrategyNumber:
+			return strict_word_similarity_threshold;
+		default:
+			elog(ERROR, "unrecognized strategy number: %d", strategy);
+			break;
+	}
+
+	return 0.0;	/* keep compiler quiet */
+}
+
 /*
 * Deprecated function.
 * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
@ -235,11 +287,12 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
 *
 * trg: where to return the array of trigrams.
 * str: source string, of length slen bytes.
+ * bounds: where to return bounds of trigrams (if needed).
 *
 * Returns length of the generated array.
 */
 static int
-generate_trgm_only(trgm *trg, char *str, int slen)
+generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
 {
 	trgm	   *tptr;
 	char	   *buf;
@ -282,11 +335,13 @@ generate_trgm_only(trgm *trg, char *str, int slen)
 		buf[LPADDING + bytelen] = ' ';
 		buf[LPADDING + bytelen + 1] = ' ';

-		/*
-		 * count trigrams
-		 */
+		/* Calculate trigrams marking their bounds if needed */
+		if (bounds)
+			bounds[tptr - trg] |= TRGM_BOUND_LEFT;
 		tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
 							 charlen + LPADDING + RPADDING);
+		if (bounds)
+			bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
 	}

 	pfree(buf);
@ -328,7 +383,7 @@ generate_trgm(char *str, int slen)
 	trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
 	trg->flag = ARRKEY;

-	len = generate_trgm_only(GETARR(trg), str, slen);
+	len = generate_trgm_only(GETARR(trg), str, slen, NULL);
 	SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));

 	if (len == 0)
@ -413,8 +468,8 @@ comp_ptrgm(const void *v1, const void *v2)
 * ulen1: count of unique trigrams of array "trg1".
 * len2: length of array "trg2" and array "trg2indexes".
 * len: length of the array "found".
- * check_only: if true then only check existence of similar search pattern in
- *			   text.
+ * flags: set of boolean flags parametrizing similarity calculation.
+ * bounds: whether each trigram is left/right bound of word.
 *
 * Returns word similarity.
 */
@ -424,16 +479,32 @@ iterate_word_similarity(int *trg2indexes,
 						int ulen1,
 						int len2,
 						int len,
-						bool check_only)
+						uint8 flags,
+						TrgmBound *bounds)
 {
 	int		   *lastpos,
 				i,
 				ulen2 = 0,
 				count = 0,
 				upper = -1,
-				lower = -1;
+				lower;
 	float4		smlr_cur,
 				smlr_max = 0.0f;
+	double		threshold;
+
+	Assert(bounds || !(flags & WORD_SIMILARITY_STRICT));
+
+	/* Select appropriate threshold */
+	threshold = (flags & WORD_SIMILARITY_STRICT) ?
+				 strict_word_similarity_threshold :
+				 word_similarity_threshold;
+
+	/*
+	 * Consider first trigram as initial lower bound for strict word similarity,
+	 * or initialize it later with first trigram present for plain word
+	 * similarity.
+	 */
+	lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1;

 	/* Memorise last position of each trigram */
 	lastpos = (int *) palloc(sizeof(int) * len);
@ -456,8 +527,13 @@ iterate_word_similarity(int *trg2indexes,
 			lastpos[trgindex] = i;
 		}

-		/* Adjust upper bound if this trigram is present in required substring */
-		if (found[trgindex])
+		/*
+		 * Adjust upper bound if trigram is upper bound of word for strict
+		 * word similarity, or if trigram is present in required substring for
+		 * plain word similarity
+		 */
+		if ((flags & WORD_SIMILARITY_STRICT) ? (bounds[i] & TRGM_BOUND_RIGHT)
+											 : found[trgindex])
 		{
 			int			prev_lower,
 						tmp_ulen2,
@ -479,24 +555,35 @@ iterate_word_similarity(int *trg2indexes,
 			prev_lower = lower;
 			for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
 			{
-				float		smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
+				float		smlr_tmp;
 				int			tmp_trgindex;

-				if (smlr_tmp > smlr_cur)
-				{
-					smlr_cur = smlr_tmp;
-					ulen2 = tmp_ulen2;
-					lower = tmp_lower;
-					count = tmp_count;
-				}
-
 				/*
-				 * if we only check that word similarity is greater than
-				 * pg_trgm.word_similarity_threshold we do not need to
-				 * calculate a maximum similarity.
+				 * Adjust lower bound only if trigram is lower bound of word
+				 * for strict word similarity, or consider every trigram as
+				 * lower bound for plain word similarity.
 				 */
-				if (check_only && smlr_cur >= word_similarity_threshold)
-					break;
+				if (!(flags & WORD_SIMILARITY_STRICT)
+					|| (bounds[tmp_lower] & TRGM_BOUND_LEFT))
+				{
+					smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
+					if (smlr_tmp > smlr_cur)
+					{
+						smlr_cur = smlr_tmp;
+						ulen2 = tmp_ulen2;
+						lower = tmp_lower;
+						count = tmp_count;
+					}
+
+					/*
+					 * If we only check that word similarity is greater than
+					 * threshold we do not need to calculate a maximum
+					 * similarity.
+					 */
+					if ((flags & WORD_SIMILARITY_CHECK_ONLY)
+						&& smlr_cur >= threshold)
+						break;
+				}

 				tmp_trgindex = trg2indexes[tmp_lower];
 				if (lastpos[tmp_trgindex] == tmp_lower)
@ -511,10 +598,9 @@ iterate_word_similarity(int *trg2indexes,

 			/*
 			 * if we only check that word similarity is greater than
-			 * pg_trgm.word_similarity_threshold we do not need to calculate a
-			 * maximum similarity
+			 * threshold we do not need to calculate a maximum similarity.
 			 */
-			if (check_only && smlr_max >= word_similarity_threshold)
+			if ((flags & WORD_SIMILARITY_CHECK_ONLY) && smlr_max >= threshold)
 				break;

 			for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
@ -547,14 +633,13 @@ iterate_word_similarity(int *trg2indexes,
 *
 * str1: search pattern string, of length slen1 bytes.
 * str2: text in which we are looking for a word, of length slen2 bytes.
- * check_only: if true then only check existence of similar search pattern in
- *			   text.
+ * flags: set of boolean flags parametrizing similarity calculation.
 *
 * Returns word similarity.
 */
 static float4
 calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
-					 bool check_only)
+					 uint8 flags)
 {
 	bool	   *found;
 	pos_trgm   *ptrg;
@ -568,15 +653,20 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
 				ulen1;
 	int		   *trg2indexes;
 	float4		result;
+	TrgmBound	   *bounds;

 	protect_out_of_mem(slen1 + slen2);

 	/* Make positional trigrams */
 	trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
 	trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);
+	if (flags & WORD_SIMILARITY_STRICT)
+		bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3);
+	else
+		bounds = NULL;

-	len1 = generate_trgm_only(trg1, str1, slen1);
-	len2 = generate_trgm_only(trg2, str2, slen2);
+	len1 = generate_trgm_only(trg1, str1, slen1, NULL);
+	len2 = generate_trgm_only(trg2, str2, slen2, bounds);

 	ptrg = make_positional_trgm(trg1, len1, trg2, len2);
 	len = len1 + len2;
@ -622,7 +712,7 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,

 	/* Run iterative procedure to find maximum similarity with word */
 	result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
-									 check_only);
+									 flags, bounds);

 	pfree(trg2indexes);
 	pfree(found);
@ -1081,7 +1171,23 @@ word_similarity(PG_FUNCTION_ARGS)

 	res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
 							   VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
-							   false);
+							   0);
+
+	PG_FREE_IF_COPY(in1, 0);
+	PG_FREE_IF_COPY(in2, 1);
+	PG_RETURN_FLOAT4(res);
+}
+
+Datum
+strict_word_similarity(PG_FUNCTION_ARGS)
+{
+	text	   *in1 = PG_GETARG_TEXT_PP(0);
+	text	   *in2 = PG_GETARG_TEXT_PP(1);
+	float4		res;
+
+	res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
+							   VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
+							   WORD_SIMILARITY_STRICT);

 	PG_FREE_IF_COPY(in1, 0);
 	PG_FREE_IF_COPY(in2, 1);
@ -1117,7 +1223,7 @@ word_similarity_op(PG_FUNCTION_ARGS)

 	res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
 							   VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
-							   true);
+							   WORD_SIMILARITY_CHECK_ONLY);

 	PG_FREE_IF_COPY(in1, 0);
 	PG_FREE_IF_COPY(in2, 1);
@ -1133,7 +1239,7 @@ word_similarity_commutator_op(PG_FUNCTION_ARGS)

 	res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
 							   VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
-							   true);
+							   WORD_SIMILARITY_CHECK_ONLY);

 	PG_FREE_IF_COPY(in1, 0);
 	PG_FREE_IF_COPY(in2, 1);
@ -1149,7 +1255,7 @@ word_similarity_dist_op(PG_FUNCTION_ARGS)

 	res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
 							   VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
-							   false);
+							   0);

 	PG_FREE_IF_COPY(in1, 0);
 	PG_FREE_IF_COPY(in2, 1);
@ -1165,7 +1271,71 @@ word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)

 	res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
 							   VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
-							   false);
+							   0);
+
+	PG_FREE_IF_COPY(in1, 0);
+	PG_FREE_IF_COPY(in2, 1);
+	PG_RETURN_FLOAT4(1.0 - res);
+}
+
+Datum
+strict_word_similarity_op(PG_FUNCTION_ARGS)
+{
+	text	   *in1 = PG_GETARG_TEXT_PP(0);
+	text	   *in2 = PG_GETARG_TEXT_PP(1);
+	float4		res;	/* with CHECK_ONLY, search may stop once threshold is reached */
+
+	res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
+							   VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
+							   WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
+
+	PG_FREE_IF_COPY(in1, 0);
+	PG_FREE_IF_COPY(in2, 1);
+	PG_RETURN_BOOL(res >= strict_word_similarity_threshold);	/* threshold is inclusive */
+}
+
+Datum
+strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)
+{
+	text	   *in1 = PG_GETARG_TEXT_PP(0);
+	text	   *in2 = PG_GETARG_TEXT_PP(1);
+	float4		res;	/* arguments are swapped below: in2 is the pattern, in1 the text */
+
+	res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
+							   VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
+							   WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
+
+	PG_FREE_IF_COPY(in1, 0);
+	PG_FREE_IF_COPY(in2, 1);
+	PG_RETURN_BOOL(res >= strict_word_similarity_threshold);	/* threshold is inclusive */
+}
+
+Datum
+strict_word_similarity_dist_op(PG_FUNCTION_ARGS)
+{
+	text	   *in1 = PG_GETARG_TEXT_PP(0);
+	text	   *in2 = PG_GETARG_TEXT_PP(1);
+	float4		res;	/* CHECK_ONLY is not set, so the maximum similarity is computed */
+
+	res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
+							   VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
+							   WORD_SIMILARITY_STRICT);
+
+	PG_FREE_IF_COPY(in1, 0);
+	PG_FREE_IF_COPY(in2, 1);
+	PG_RETURN_FLOAT4(1.0 - res);	/* distance is one minus similarity */
+}
+
+Datum
+strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
+{
+	text	   *in1 = PG_GETARG_TEXT_PP(0);
+	text	   *in2 = PG_GETARG_TEXT_PP(1);
+	float4		res;
+
+	res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
+							   VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
+							   WORD_SIMILARITY_STRICT);

 	PG_FREE_IF_COPY(in1, 0);
 	PG_FREE_IF_COPY(in2, 1);
--- a/doc/src/sgml/pgtrgm.sgml
+++ b/doc/src/sgml/pgtrgm.sgml
@ -105,6 +105,17 @@
       the explanation below.
      </entry>
     </row>
+     <row>
+      <entry>
+       <function>strict_word_similarity(text, text)</function>
+       <indexterm><primary>strict_word_similarity</primary></indexterm>
+      </entry>
+      <entry><type>real</type></entry>
+      <entry>
+       Same as <function>word_similarity(text, text)</function>, but forces
+       extent boundaries to match word boundaries.
+      </entry>
+     </row>
     <row>
      <entry><function>show_limit()</function><indexterm><primary>show_limit</primary></indexterm></entry>
      <entry><type>real</type></entry>
@ -157,6 +168,29 @@
   a part of the word.
  </para>

+  <para>
+   At the same time, <function>strict_word_similarity(text, text)</function>
+   has to select an extent that matches word boundaries.  In the example above,
+   <function>strict_word_similarity(text, text)</function> would select the
+   extent <literal>{"  w"," wo","wor","ord","rds","ds "}</literal>, which
+   corresponds to the whole word <literal>'words'</literal>.
+
+<programlisting>
+# SELECT strict_word_similarity('word', 'two words'), similarity('word', 'words');
+ strict_word_similarity | similarity
+------------------------+------------
+               0.571429 |   0.571429
+(1 row)
+</programlisting>
+  </para>
+
+  <para>
+   Thus, the <function>strict_word_similarity(text, text)</function> function
+   is useful for finding similar subsets of whole words, while
+   <function>word_similarity(text, text)</function> is more suitable for
+   searching similar parts of words.
+  </para>
+
  <table id="pgtrgm-op-table">
   <title><filename>pg_trgm</filename> Operators</title>
   <tgroup cols="3">
@ -196,6 +230,24 @@
       Commutator of the <literal>&lt;%</literal> operator.
      </entry>
     </row>
+     <row>
+      <entry><type>text</type> <literal>&lt;&lt;%</literal> <type>text</type></entry>
+      <entry><type>boolean</type></entry>
+      <entry>
+       Returns <literal>true</literal> if its second argument has a continuous
+       extent of an ordered trigram set that matches word boundaries,
+       and its similarity to the trigram set of the first argument is greater
+       than or equal to the current strict word similarity threshold set by the
+       <varname>pg_trgm.strict_word_similarity_threshold</varname> parameter.
+      </entry>
+     </row>
+     <row>
+      <entry><type>text</type> <literal>%&gt;&gt;</literal> <type>text</type></entry>
+      <entry><type>boolean</type></entry>
+      <entry>
+       Commutator of the <literal>&lt;&lt;%</literal> operator.
+      </entry>
+     </row>
     <row>
      <entry><type>text</type> <literal>&lt;-&gt;</literal> <type>text</type></entry>
      <entry><type>real</type></entry>
@ -223,6 +275,25 @@
       Commutator of the <literal>&lt;&lt;-&gt;</literal> operator.
      </entry>
     </row>
+     <row>
+      <entry>
+       <type>text</type> <literal>&lt;&lt;&lt;-&gt;</literal> <type>text</type>
+      </entry>
+      <entry><type>real</type></entry>
+      <entry>
+       Returns the <quote>distance</quote> between the arguments, that is
+       one minus the <function>strict_word_similarity()</function> value.
+      </entry>
+     </row>
+     <row>
+      <entry>
+       <type>text</type> <literal>&lt;-&gt;&gt;&gt;</literal> <type>text</type>
+      </entry>
+      <entry><type>real</type></entry>
+      <entry>
+       Commutator of the <literal>&lt;&lt;&lt;-&gt;</literal> operator.
+      </entry>
+     </row>
    </tbody>
   </tgroup>
  </table>
@ -322,12 +393,19 @@ SELECT t, t &lt;-&gt; '<replaceable>word</replaceable>' AS dist

  <para>
   Also you can use an index on the <structfield>t</structfield> column for word
-   similarity.  For example:
+   similarity or strict word similarity.  Typical queries are:
 <programlisting>
 SELECT t, word_similarity('<replaceable>word</replaceable>', t) AS sml
  FROM test_trgm
  WHERE '<replaceable>word</replaceable>' &lt;% t
  ORDER BY sml DESC, t;
+</programlisting>
+   and
+<programlisting>
+SELECT t, strict_word_similarity('<replaceable>word</replaceable>', t) AS sml
+  FROM test_trgm
+  WHERE '<replaceable>word</replaceable>' &lt;&lt;% t
+  ORDER BY sml DESC, t;
 </programlisting>
   This will return all values in the text column for which there is a
   continuous extent in the corresponding ordered trigram set that is
@ -337,11 +415,17 @@ SELECT t, word_similarity('<replaceable>word</replaceable>', t) AS sml
  </para>

  <para>
-   A variant of the above query is
+   Possible variants of the above queries are:
 <programlisting>
 SELECT t, '<replaceable>word</replaceable>' &lt;&lt;-&gt; t AS dist
  FROM test_trgm
  ORDER BY dist LIMIT 10;
+</programlisting>
+   and
+<programlisting>
+SELECT t, '<replaceable>word</replaceable>' &lt;&lt;&lt;-&gt; t AS dist
+  FROM test_trgm
+  ORDER BY dist LIMIT 10;
 </programlisting>
   This can be implemented quite efficiently by GiST indexes, but not
   by GIN indexes.