mirror of
https://git.postgresql.org/git/postgresql.git
synced 2025-01-24 18:55:04 +08:00
Add strict_word_similarity to pg_trgm module
strict_word_similarity is similar to existing word_similarity function but it takes into account word boundaries to compute similarity. Author: Alexander Korotkov Review by: David Steele, Liudmila Mantrova, me Discussion: https://www.postgresql.org/message-id/flat/CY4PR17MB13207ED8310F847CF117EED0D85A0@CY4PR17MB1320.namprd17.prod.outlook.com
This commit is contained in:
parent
f20b328534
commit
be8a7a6866
@ -4,11 +4,12 @@ MODULE_big = pg_trgm
|
||||
OBJS = trgm_op.o trgm_gist.o trgm_gin.o trgm_regexp.o $(WIN32RES)
|
||||
|
||||
EXTENSION = pg_trgm
|
||||
DATA = pg_trgm--1.3.sql pg_trgm--1.2--1.3.sql pg_trgm--1.1--1.2.sql \
|
||||
DATA = pg_trgm--1.3--1.4.sql \
|
||||
pg_trgm--1.3.sql pg_trgm--1.2--1.3.sql pg_trgm--1.1--1.2.sql \
|
||||
pg_trgm--1.0--1.1.sql pg_trgm--unpackaged--1.0.sql
|
||||
PGFILEDESC = "pg_trgm - trigram matching"
|
||||
|
||||
REGRESS = pg_trgm pg_word_trgm
|
||||
REGRESS = pg_trgm pg_word_trgm pg_strict_word_trgm
|
||||
|
||||
ifdef USE_PGXS
|
||||
PG_CONFIG = pg_config
|
||||
|
1025
contrib/pg_trgm/expected/pg_strict_word_trgm.out
Normal file
1025
contrib/pg_trgm/expected/pg_strict_word_trgm.out
Normal file
File diff suppressed because it is too large
Load Diff
68
contrib/pg_trgm/pg_trgm--1.3--1.4.sql
Normal file
68
contrib/pg_trgm/pg_trgm--1.3--1.4.sql
Normal file
@ -0,0 +1,68 @@
|
||||
/* contrib/pg_trgm/pg_trgm--1.3--1.4.sql */
|
||||
|
||||
-- complain if script is sourced in psql, rather than via ALTER EXTENSION
|
||||
\echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.4'" to load this file. \quit
|
||||
|
||||
CREATE FUNCTION strict_word_similarity(text,text)
|
||||
RETURNS float4
|
||||
AS 'MODULE_PATHNAME'
|
||||
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
|
||||
|
||||
CREATE FUNCTION strict_word_similarity_op(text,text)
|
||||
RETURNS bool
|
||||
AS 'MODULE_PATHNAME'
|
||||
LANGUAGE C STRICT STABLE PARALLEL SAFE; -- stable because depends on pg_trgm.strict_word_similarity_threshold
|
||||
|
||||
CREATE FUNCTION strict_word_similarity_commutator_op(text,text)
|
||||
RETURNS bool
|
||||
AS 'MODULE_PATHNAME'
|
||||
LANGUAGE C STRICT STABLE PARALLEL SAFE; -- stable because depends on pg_trgm.strict_word_similarity_threshold
|
||||
|
||||
CREATE OPERATOR <<% (
|
||||
LEFTARG = text,
|
||||
RIGHTARG = text,
|
||||
PROCEDURE = strict_word_similarity_op,
|
||||
COMMUTATOR = '%>>',
|
||||
RESTRICT = contsel,
|
||||
JOIN = contjoinsel
|
||||
);
|
||||
|
||||
CREATE OPERATOR %>> (
|
||||
LEFTARG = text,
|
||||
RIGHTARG = text,
|
||||
PROCEDURE = strict_word_similarity_commutator_op,
|
||||
COMMUTATOR = '<<%',
|
||||
RESTRICT = contsel,
|
||||
JOIN = contjoinsel
|
||||
);
|
||||
|
||||
CREATE FUNCTION strict_word_similarity_dist_op(text,text)
|
||||
RETURNS float4
|
||||
AS 'MODULE_PATHNAME'
|
||||
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
|
||||
|
||||
CREATE FUNCTION strict_word_similarity_dist_commutator_op(text,text)
|
||||
RETURNS float4
|
||||
AS 'MODULE_PATHNAME'
|
||||
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
|
||||
|
||||
CREATE OPERATOR <<<-> (
|
||||
LEFTARG = text,
|
||||
RIGHTARG = text,
|
||||
PROCEDURE = strict_word_similarity_dist_op,
|
||||
COMMUTATOR = '<->>>'
|
||||
);
|
||||
|
||||
CREATE OPERATOR <->>> (
|
||||
LEFTARG = text,
|
||||
RIGHTARG = text,
|
||||
PROCEDURE = strict_word_similarity_dist_commutator_op,
|
||||
COMMUTATOR = '<<<->'
|
||||
);
|
||||
|
||||
ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
|
||||
OPERATOR 9 %>> (text, text),
|
||||
OPERATOR 10 <->>> (text, text) FOR ORDER BY pg_catalog.float_ops;
|
||||
|
||||
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
|
||||
OPERATOR 9 %>> (text, text);
|
@ -1,5 +1,5 @@
|
||||
# pg_trgm extension
|
||||
comment = 'text similarity measurement and index searching based on trigrams'
|
||||
default_version = '1.3'
|
||||
default_version = '1.4'
|
||||
module_pathname = '$libdir/pg_trgm'
|
||||
relocatable = true
|
||||
|
42
contrib/pg_trgm/sql/pg_strict_word_trgm.sql
Normal file
42
contrib/pg_trgm/sql/pg_strict_word_trgm.sql
Normal file
@ -0,0 +1,42 @@
|
||||
DROP INDEX trgm_idx2;
|
||||
|
||||
\copy test_trgm3 from 'data/trgm2.data'
|
||||
|
||||
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
|
||||
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
|
||||
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
|
||||
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
|
||||
select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
|
||||
|
||||
create index trgm_idx2 on test_trgm2 using gist (t gist_trgm_ops);
|
||||
set enable_seqscan=off;
|
||||
|
||||
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
|
||||
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
|
||||
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
|
||||
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
|
||||
|
||||
explain (costs off)
|
||||
select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
|
||||
select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
|
||||
|
||||
drop index trgm_idx2;
|
||||
create index trgm_idx2 on test_trgm2 using gin (t gin_trgm_ops);
|
||||
set enable_seqscan=off;
|
||||
|
||||
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
|
||||
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
|
||||
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
|
||||
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
|
||||
|
||||
set "pg_trgm.strict_word_similarity_threshold" to 0.4;
|
||||
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
|
||||
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
|
||||
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
|
||||
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
|
||||
|
||||
set "pg_trgm.strict_word_similarity_threshold" to 0.2;
|
||||
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
|
||||
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
|
||||
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
|
||||
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
|
@ -6,6 +6,7 @@
|
||||
|
||||
#include "access/gist.h"
|
||||
#include "access/itup.h"
|
||||
#include "access/stratnum.h"
|
||||
#include "storage/bufpage.h"
|
||||
|
||||
/*
|
||||
@ -26,14 +27,16 @@
|
||||
#define DIVUNION
|
||||
|
||||
/* operator strategy numbers */
|
||||
#define SimilarityStrategyNumber 1
|
||||
#define DistanceStrategyNumber 2
|
||||
#define LikeStrategyNumber 3
|
||||
#define ILikeStrategyNumber 4
|
||||
#define RegExpStrategyNumber 5
|
||||
#define RegExpICaseStrategyNumber 6
|
||||
#define WordSimilarityStrategyNumber 7
|
||||
#define WordDistanceStrategyNumber 8
|
||||
#define SimilarityStrategyNumber 1
|
||||
#define DistanceStrategyNumber 2
|
||||
#define LikeStrategyNumber 3
|
||||
#define ILikeStrategyNumber 4
|
||||
#define RegExpStrategyNumber 5
|
||||
#define RegExpICaseStrategyNumber 6
|
||||
#define WordSimilarityStrategyNumber 7
|
||||
#define WordDistanceStrategyNumber 8
|
||||
#define StrictWordSimilarityStrategyNumber 9
|
||||
#define StrictWordDistanceStrategyNumber 10
|
||||
|
||||
typedef char trgm[3];
|
||||
|
||||
@ -120,7 +123,9 @@ typedef struct TrgmPackedGraph TrgmPackedGraph;
|
||||
|
||||
extern double similarity_threshold;
|
||||
extern double word_similarity_threshold;
|
||||
extern double strict_word_similarity_threshold;
|
||||
|
||||
extern double index_strategy_get_limit(StrategyNumber strategy);
|
||||
extern uint32 trgm2int(trgm *ptr);
|
||||
extern void compact_trigram(trgm *tptr, char *str, int bytelen);
|
||||
extern TRGM *generate_trgm(char *str, int slen);
|
||||
|
@ -90,6 +90,7 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
|
||||
{
|
||||
case SimilarityStrategyNumber:
|
||||
case WordSimilarityStrategyNumber:
|
||||
case StrictWordSimilarityStrategyNumber:
|
||||
trg = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val));
|
||||
break;
|
||||
case ILikeStrategyNumber:
|
||||
@ -187,8 +188,8 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
|
||||
{
|
||||
case SimilarityStrategyNumber:
|
||||
case WordSimilarityStrategyNumber:
|
||||
nlimit = (strategy == SimilarityStrategyNumber) ?
|
||||
similarity_threshold : word_similarity_threshold;
|
||||
case StrictWordSimilarityStrategyNumber:
|
||||
nlimit = index_strategy_get_limit(strategy);
|
||||
|
||||
/* Count the matches */
|
||||
ntrue = 0;
|
||||
@ -282,8 +283,8 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS)
|
||||
{
|
||||
case SimilarityStrategyNumber:
|
||||
case WordSimilarityStrategyNumber:
|
||||
nlimit = (strategy == SimilarityStrategyNumber) ?
|
||||
similarity_threshold : word_similarity_threshold;
|
||||
case StrictWordSimilarityStrategyNumber:
|
||||
nlimit = index_strategy_get_limit(strategy);
|
||||
|
||||
/* Count the matches */
|
||||
ntrue = 0;
|
||||
|
@ -221,6 +221,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
|
||||
{
|
||||
case SimilarityStrategyNumber:
|
||||
case WordSimilarityStrategyNumber:
|
||||
case StrictWordSimilarityStrategyNumber:
|
||||
qtrg = generate_trgm(VARDATA(query),
|
||||
querysize - VARHDRSZ);
|
||||
break;
|
||||
@ -290,10 +291,11 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
|
||||
{
|
||||
case SimilarityStrategyNumber:
|
||||
case WordSimilarityStrategyNumber:
|
||||
/* Similarity search is exact. Word similarity search is inexact */
|
||||
*recheck = (strategy == WordSimilarityStrategyNumber);
|
||||
nlimit = (strategy == SimilarityStrategyNumber) ?
|
||||
similarity_threshold : word_similarity_threshold;
|
||||
case StrictWordSimilarityStrategyNumber:
|
||||
/* Similarity search is exact. (Strict) word similarity search is inexact */
|
||||
*recheck = (strategy != SimilarityStrategyNumber);
|
||||
|
||||
nlimit = index_strategy_get_limit(strategy);
|
||||
|
||||
if (GIST_LEAF(entry))
|
||||
{ /* all leafs contains orig trgm */
|
||||
@ -468,7 +470,9 @@ gtrgm_distance(PG_FUNCTION_ARGS)
|
||||
{
|
||||
case DistanceStrategyNumber:
|
||||
case WordDistanceStrategyNumber:
|
||||
*recheck = strategy == WordDistanceStrategyNumber;
|
||||
case StrictWordDistanceStrategyNumber:
|
||||
/* Only plain trigram distance is exact */
|
||||
*recheck = (strategy != DistanceStrategyNumber);
|
||||
if (GIST_LEAF(entry))
|
||||
{ /* all leafs contains orig trgm */
|
||||
|
||||
|
@ -18,6 +18,7 @@ PG_MODULE_MAGIC;
|
||||
/* GUC variables */
|
||||
double similarity_threshold = 0.3f;
|
||||
double word_similarity_threshold = 0.6f;
|
||||
double strict_word_similarity_threshold = 0.5f;
|
||||
|
||||
void _PG_init(void);
|
||||
|
||||
@ -26,12 +27,17 @@ PG_FUNCTION_INFO_V1(show_limit);
|
||||
PG_FUNCTION_INFO_V1(show_trgm);
|
||||
PG_FUNCTION_INFO_V1(similarity);
|
||||
PG_FUNCTION_INFO_V1(word_similarity);
|
||||
PG_FUNCTION_INFO_V1(strict_word_similarity);
|
||||
PG_FUNCTION_INFO_V1(similarity_dist);
|
||||
PG_FUNCTION_INFO_V1(similarity_op);
|
||||
PG_FUNCTION_INFO_V1(word_similarity_op);
|
||||
PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
|
||||
PG_FUNCTION_INFO_V1(word_similarity_dist_op);
|
||||
PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
|
||||
PG_FUNCTION_INFO_V1(strict_word_similarity_op);
|
||||
PG_FUNCTION_INFO_V1(strict_word_similarity_commutator_op);
|
||||
PG_FUNCTION_INFO_V1(strict_word_similarity_dist_op);
|
||||
PG_FUNCTION_INFO_V1(strict_word_similarity_dist_commutator_op);
|
||||
|
||||
/* Trigram with position */
|
||||
typedef struct
|
||||
@ -40,6 +46,17 @@ typedef struct
|
||||
int index;
|
||||
} pos_trgm;
|
||||
|
||||
/* Trigram bound type */
|
||||
typedef uint8 TrgmBound;
|
||||
#define TRGM_BOUND_LEFT (0x01) /* trigram is left bound of word */
|
||||
#define TRGM_BOUND_RIGHT (0x02) /* trigram is right bound of word */
|
||||
|
||||
/* Word similarity flags */
|
||||
#define WORD_SIMILARITY_CHECK_ONLY (0x01) /* if set then only check existence
|
||||
* of similar search pattern in text */
|
||||
#define WORD_SIMILARITY_STRICT (0x02) /* force bounds of extent to match
|
||||
* word bounds */
|
||||
|
||||
/*
|
||||
* Module load callback
|
||||
*/
|
||||
@ -71,6 +88,18 @@ _PG_init(void)
|
||||
NULL,
|
||||
NULL,
|
||||
NULL);
|
||||
DefineCustomRealVariable("pg_trgm.strict_word_similarity_threshold",
|
||||
"Sets the threshold used by the <<%% operator.",
|
||||
"Valid range is 0.0 .. 1.0.",
|
||||
&strict_word_similarity_threshold,
|
||||
0.5,
|
||||
0.0,
|
||||
1.0,
|
||||
PGC_USERSET,
|
||||
0,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -95,6 +124,29 @@ set_limit(PG_FUNCTION_ARGS)
|
||||
PG_RETURN_FLOAT4(similarity_threshold);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Get similarity threshold for given index scan strategy number.
|
||||
*/
|
||||
double
|
||||
index_strategy_get_limit(StrategyNumber strategy)
|
||||
{
|
||||
switch (strategy)
|
||||
{
|
||||
case SimilarityStrategyNumber:
|
||||
return similarity_threshold;
|
||||
case WordSimilarityStrategyNumber:
|
||||
return word_similarity_threshold;
|
||||
case StrictWordSimilarityStrategyNumber:
|
||||
return strict_word_similarity_threshold;
|
||||
default:
|
||||
elog(ERROR, "unrecognized strategy number: %d", strategy);
|
||||
break;
|
||||
}
|
||||
|
||||
return 0.0; /* keep compiler quiet */
|
||||
}
|
||||
|
||||
/*
|
||||
* Deprecated function.
|
||||
* Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
|
||||
@ -235,11 +287,12 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
|
||||
*
|
||||
* trg: where to return the array of trigrams.
|
||||
* str: source string, of length slen bytes.
|
||||
* bounds: where to return bounds of trigrams (if needed).
|
||||
*
|
||||
* Returns length of the generated array.
|
||||
*/
|
||||
static int
|
||||
generate_trgm_only(trgm *trg, char *str, int slen)
|
||||
generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
|
||||
{
|
||||
trgm *tptr;
|
||||
char *buf;
|
||||
@ -282,11 +335,13 @@ generate_trgm_only(trgm *trg, char *str, int slen)
|
||||
buf[LPADDING + bytelen] = ' ';
|
||||
buf[LPADDING + bytelen + 1] = ' ';
|
||||
|
||||
/*
|
||||
* count trigrams
|
||||
*/
|
||||
/* Calculate trigrams marking their bounds if needed */
|
||||
if (bounds)
|
||||
bounds[tptr - trg] |= TRGM_BOUND_LEFT;
|
||||
tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
|
||||
charlen + LPADDING + RPADDING);
|
||||
if (bounds)
|
||||
bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
|
||||
}
|
||||
|
||||
pfree(buf);
|
||||
@ -328,7 +383,7 @@ generate_trgm(char *str, int slen)
|
||||
trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
|
||||
trg->flag = ARRKEY;
|
||||
|
||||
len = generate_trgm_only(GETARR(trg), str, slen);
|
||||
len = generate_trgm_only(GETARR(trg), str, slen, NULL);
|
||||
SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
|
||||
|
||||
if (len == 0)
|
||||
@ -413,8 +468,8 @@ comp_ptrgm(const void *v1, const void *v2)
|
||||
* ulen1: count of unique trigrams of array "trg1".
|
||||
* len2: length of array "trg2" and array "trg2indexes".
|
||||
* len: length of the array "found".
|
||||
* check_only: if true then only check existence of similar search pattern in
|
||||
* text.
|
||||
* flags: set of boolean flags parametrizing similarity calculation.
|
||||
* bounds: whether each trigram is left/right bound of word.
|
||||
*
|
||||
* Returns word similarity.
|
||||
*/
|
||||
@ -424,16 +479,32 @@ iterate_word_similarity(int *trg2indexes,
|
||||
int ulen1,
|
||||
int len2,
|
||||
int len,
|
||||
bool check_only)
|
||||
uint8 flags,
|
||||
TrgmBound *bounds)
|
||||
{
|
||||
int *lastpos,
|
||||
i,
|
||||
ulen2 = 0,
|
||||
count = 0,
|
||||
upper = -1,
|
||||
lower = -1;
|
||||
lower;
|
||||
float4 smlr_cur,
|
||||
smlr_max = 0.0f;
|
||||
double threshold;
|
||||
|
||||
Assert(bounds || !(flags & WORD_SIMILARITY_STRICT));
|
||||
|
||||
/* Select appropriate threshold */
|
||||
threshold = (flags & WORD_SIMILARITY_STRICT) ?
|
||||
strict_word_similarity_threshold :
|
||||
word_similarity_threshold;
|
||||
|
||||
/*
|
||||
* Consider first trigram as initial lower bound for strict word similarity,
|
||||
* or initialize it later with first trigram present for plain word
|
||||
* similarity.
|
||||
*/
|
||||
lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1;
|
||||
|
||||
/* Memorise last position of each trigram */
|
||||
lastpos = (int *) palloc(sizeof(int) * len);
|
||||
@ -456,8 +527,13 @@ iterate_word_similarity(int *trg2indexes,
|
||||
lastpos[trgindex] = i;
|
||||
}
|
||||
|
||||
/* Adjust upper bound if this trigram is present in required substring */
|
||||
if (found[trgindex])
|
||||
/*
|
||||
* Adjust upper bound if trigram is upper bound of word for strict
|
||||
* word similarity, or if trigram is present in required substring for
|
||||
* plain word similarity
|
||||
*/
|
||||
if ((flags & WORD_SIMILARITY_STRICT) ? (bounds[i] & TRGM_BOUND_RIGHT)
|
||||
: found[trgindex])
|
||||
{
|
||||
int prev_lower,
|
||||
tmp_ulen2,
|
||||
@ -479,24 +555,35 @@ iterate_word_similarity(int *trg2indexes,
|
||||
prev_lower = lower;
|
||||
for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
|
||||
{
|
||||
float smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
|
||||
float smlr_tmp;
|
||||
int tmp_trgindex;
|
||||
|
||||
if (smlr_tmp > smlr_cur)
|
||||
{
|
||||
smlr_cur = smlr_tmp;
|
||||
ulen2 = tmp_ulen2;
|
||||
lower = tmp_lower;
|
||||
count = tmp_count;
|
||||
}
|
||||
|
||||
/*
|
||||
* if we only check that word similarity is greater than
|
||||
* pg_trgm.word_similarity_threshold we do not need to
|
||||
* calculate a maximum similarity.
|
||||
* Adjust lower bound only if trigram is lower bound of word
|
||||
* for strict word similarity, or consider every trigram as
|
||||
* lower bound for plain word similarity.
|
||||
*/
|
||||
if (check_only && smlr_cur >= word_similarity_threshold)
|
||||
break;
|
||||
if (!(flags & WORD_SIMILARITY_STRICT)
|
||||
|| (bounds[tmp_lower] & TRGM_BOUND_LEFT))
|
||||
{
|
||||
smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
|
||||
if (smlr_tmp > smlr_cur)
|
||||
{
|
||||
smlr_cur = smlr_tmp;
|
||||
ulen2 = tmp_ulen2;
|
||||
lower = tmp_lower;
|
||||
count = tmp_count;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we only check that word similarity is greater than
|
||||
* threshold we do not need to calculate a maximum
|
||||
* similarity.
|
||||
*/
|
||||
if ((flags & WORD_SIMILARITY_CHECK_ONLY)
|
||||
&& smlr_cur >= threshold)
|
||||
break;
|
||||
}
|
||||
|
||||
tmp_trgindex = trg2indexes[tmp_lower];
|
||||
if (lastpos[tmp_trgindex] == tmp_lower)
|
||||
@ -511,10 +598,9 @@ iterate_word_similarity(int *trg2indexes,
|
||||
|
||||
/*
|
||||
* if we only check that word similarity is greater than
|
||||
* pg_trgm.word_similarity_threshold we do not need to calculate a
|
||||
* maximum similarity
|
||||
* threshold we do not need to calculate a maximum similarity.
|
||||
*/
|
||||
if (check_only && smlr_max >= word_similarity_threshold)
|
||||
if ((flags & WORD_SIMILARITY_CHECK_ONLY) && smlr_max >= threshold)
|
||||
break;
|
||||
|
||||
for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
|
||||
@ -547,14 +633,13 @@ iterate_word_similarity(int *trg2indexes,
|
||||
*
|
||||
* str1: search pattern string, of length slen1 bytes.
|
||||
* str2: text in which we are looking for a word, of length slen2 bytes.
|
||||
* check_only: if true then only check existence of similar search pattern in
|
||||
* text.
|
||||
* flags: set of boolean flags parametrizing similarity calculation.
|
||||
*
|
||||
* Returns word similarity.
|
||||
*/
|
||||
static float4
|
||||
calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
|
||||
bool check_only)
|
||||
uint8 flags)
|
||||
{
|
||||
bool *found;
|
||||
pos_trgm *ptrg;
|
||||
@ -568,15 +653,20 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
|
||||
ulen1;
|
||||
int *trg2indexes;
|
||||
float4 result;
|
||||
TrgmBound *bounds;
|
||||
|
||||
protect_out_of_mem(slen1 + slen2);
|
||||
|
||||
/* Make positional trigrams */
|
||||
trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
|
||||
trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);
|
||||
if (flags & WORD_SIMILARITY_STRICT)
|
||||
bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3);
|
||||
else
|
||||
bounds = NULL;
|
||||
|
||||
len1 = generate_trgm_only(trg1, str1, slen1);
|
||||
len2 = generate_trgm_only(trg2, str2, slen2);
|
||||
len1 = generate_trgm_only(trg1, str1, slen1, NULL);
|
||||
len2 = generate_trgm_only(trg2, str2, slen2, bounds);
|
||||
|
||||
ptrg = make_positional_trgm(trg1, len1, trg2, len2);
|
||||
len = len1 + len2;
|
||||
@ -622,7 +712,7 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
|
||||
|
||||
/* Run iterative procedure to find maximum similarity with word */
|
||||
result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
|
||||
check_only);
|
||||
flags, bounds);
|
||||
|
||||
pfree(trg2indexes);
|
||||
pfree(found);
|
||||
@ -1081,7 +1171,23 @@ word_similarity(PG_FUNCTION_ARGS)
|
||||
|
||||
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
|
||||
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
|
||||
false);
|
||||
0);
|
||||
|
||||
PG_FREE_IF_COPY(in1, 0);
|
||||
PG_FREE_IF_COPY(in2, 1);
|
||||
PG_RETURN_FLOAT4(res);
|
||||
}
|
||||
|
||||
Datum
|
||||
strict_word_similarity(PG_FUNCTION_ARGS)
|
||||
{
|
||||
text *in1 = PG_GETARG_TEXT_PP(0);
|
||||
text *in2 = PG_GETARG_TEXT_PP(1);
|
||||
float4 res;
|
||||
|
||||
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
|
||||
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
|
||||
WORD_SIMILARITY_STRICT);
|
||||
|
||||
PG_FREE_IF_COPY(in1, 0);
|
||||
PG_FREE_IF_COPY(in2, 1);
|
||||
@ -1117,7 +1223,7 @@ word_similarity_op(PG_FUNCTION_ARGS)
|
||||
|
||||
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
|
||||
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
|
||||
true);
|
||||
WORD_SIMILARITY_CHECK_ONLY);
|
||||
|
||||
PG_FREE_IF_COPY(in1, 0);
|
||||
PG_FREE_IF_COPY(in2, 1);
|
||||
@ -1133,7 +1239,7 @@ word_similarity_commutator_op(PG_FUNCTION_ARGS)
|
||||
|
||||
res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
|
||||
VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
|
||||
true);
|
||||
WORD_SIMILARITY_CHECK_ONLY);
|
||||
|
||||
PG_FREE_IF_COPY(in1, 0);
|
||||
PG_FREE_IF_COPY(in2, 1);
|
||||
@ -1149,7 +1255,7 @@ word_similarity_dist_op(PG_FUNCTION_ARGS)
|
||||
|
||||
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
|
||||
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
|
||||
false);
|
||||
0);
|
||||
|
||||
PG_FREE_IF_COPY(in1, 0);
|
||||
PG_FREE_IF_COPY(in2, 1);
|
||||
@ -1165,7 +1271,71 @@ word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
|
||||
|
||||
res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
|
||||
VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
|
||||
false);
|
||||
0);
|
||||
|
||||
PG_FREE_IF_COPY(in1, 0);
|
||||
PG_FREE_IF_COPY(in2, 1);
|
||||
PG_RETURN_FLOAT4(1.0 - res);
|
||||
}
|
||||
|
||||
Datum
|
||||
strict_word_similarity_op(PG_FUNCTION_ARGS)
|
||||
{
|
||||
text *in1 = PG_GETARG_TEXT_PP(0);
|
||||
text *in2 = PG_GETARG_TEXT_PP(1);
|
||||
float4 res;
|
||||
|
||||
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
|
||||
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
|
||||
WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
|
||||
|
||||
PG_FREE_IF_COPY(in1, 0);
|
||||
PG_FREE_IF_COPY(in2, 1);
|
||||
PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
|
||||
}
|
||||
|
||||
Datum
|
||||
strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)
|
||||
{
|
||||
text *in1 = PG_GETARG_TEXT_PP(0);
|
||||
text *in2 = PG_GETARG_TEXT_PP(1);
|
||||
float4 res;
|
||||
|
||||
res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
|
||||
VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
|
||||
WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
|
||||
|
||||
PG_FREE_IF_COPY(in1, 0);
|
||||
PG_FREE_IF_COPY(in2, 1);
|
||||
PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
|
||||
}
|
||||
|
||||
Datum
|
||||
strict_word_similarity_dist_op(PG_FUNCTION_ARGS)
|
||||
{
|
||||
text *in1 = PG_GETARG_TEXT_PP(0);
|
||||
text *in2 = PG_GETARG_TEXT_PP(1);
|
||||
float4 res;
|
||||
|
||||
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
|
||||
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
|
||||
WORD_SIMILARITY_STRICT);
|
||||
|
||||
PG_FREE_IF_COPY(in1, 0);
|
||||
PG_FREE_IF_COPY(in2, 1);
|
||||
PG_RETURN_FLOAT4(1.0 - res);
|
||||
}
|
||||
|
||||
Datum
|
||||
strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
|
||||
{
|
||||
text *in1 = PG_GETARG_TEXT_PP(0);
|
||||
text *in2 = PG_GETARG_TEXT_PP(1);
|
||||
float4 res;
|
||||
|
||||
res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
|
||||
VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
|
||||
WORD_SIMILARITY_STRICT);
|
||||
|
||||
PG_FREE_IF_COPY(in1, 0);
|
||||
PG_FREE_IF_COPY(in2, 1);
|
||||
|
@ -105,6 +105,17 @@
|
||||
the explanation below.
|
||||
</entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>
|
||||
<function>strict_word_similarity(text, text)</function>
|
||||
<indexterm><primary>strict_word_similarity</primary></indexterm>
|
||||
</entry>
|
||||
<entry><type>real</type></entry>
|
||||
<entry>
|
||||
Same as <function>word_similarity(text, text)</function>, but forces
|
||||
extent boundaries to match word boundaries.
|
||||
</entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry><function>show_limit()</function><indexterm><primary>show_limit</primary></indexterm></entry>
|
||||
<entry><type>real</type></entry>
|
||||
@ -157,6 +168,29 @@
|
||||
a part of the word.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
At the same time, <function>strict_word_similarity(text, text)</function>
|
||||
has to select an extent that matches word boundaries. In the example above,
|
||||
<function>strict_word_similarity(text, text)</function> would select the
|
||||
extent <literal>{" w"," wo","wor","ord","rds","ds "}</literal>, which
|
||||
corresponds to the whole word <literal>'words'</literal>.
|
||||
|
||||
<programlisting>
|
||||
# SELECT strict_word_similarity('word', 'two words'), similarity('word', 'words');
|
||||
strict_word_similarity | similarity
|
||||
------------------------+------------
|
||||
0.571429 | 0.571429
|
||||
(1 row)
|
||||
</programlisting>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Thus, the <function>strict_word_similarity(text, text)</function> function
|
||||
is useful for finding similar subsets of whole words, while
|
||||
<function>word_similarity(text, text)</function> is more suitable for
|
||||
searching similar parts of words.
|
||||
</para>
|
||||
|
||||
<table id="pgtrgm-op-table">
|
||||
<title><filename>pg_trgm</filename> Operators</title>
|
||||
<tgroup cols="3">
|
||||
@ -196,6 +230,24 @@
|
||||
Commutator of the <literal><%</literal> operator.
|
||||
</entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry><type>text</type> <literal><<%</literal> <type>text</type></entry>
|
||||
<entry><type>boolean</type></entry>
|
||||
<entry>
|
||||
Returns <literal>true</literal> if its second argument has a continuous
|
||||
extent of an ordered trigram set that matches word boundaries,
|
||||
and its similarity to the trigram set of the first argument is no less
|
||||
than the current strict word similarity threshold set by the
|
||||
<varname>pg_trgm.strict_word_similarity_threshold</varname> parameter.
|
||||
</entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry><type>text</type> <literal>%>></literal> <type>text</type></entry>
|
||||
<entry><type>boolean</type></entry>
|
||||
<entry>
|
||||
Commutator of the <literal><<%</literal> operator.
|
||||
</entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry><type>text</type> <literal><-></literal> <type>text</type></entry>
|
||||
<entry><type>real</type></entry>
|
||||
@ -223,6 +275,25 @@
|
||||
Commutator of the <literal><<-></literal> operator.
|
||||
</entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>
|
||||
<type>text</type> <literal><<<-></literal> <type>text</type>
|
||||
</entry>
|
||||
<entry><type>real</type></entry>
|
||||
<entry>
|
||||
Returns the <quote>distance</quote> between the arguments, that is
|
||||
one minus the <function>strict_word_similarity()</function> value.
|
||||
</entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>
|
||||
<type>text</type> <literal><->>></literal> <type>text</type>
|
||||
</entry>
|
||||
<entry><type>real</type></entry>
|
||||
<entry>
|
||||
Commutator of the <literal><<<-></literal> operator.
|
||||
</entry>
|
||||
</row>
|
||||
</tbody>
|
||||
</tgroup>
|
||||
</table>
|
||||
@ -322,12 +393,19 @@ SELECT t, t <-> '<replaceable>word</replaceable>' AS dist
|
||||
|
||||
<para>
|
||||
Also you can use an index on the <structfield>t</structfield> column for word
|
||||
similarity. For example:
|
||||
similarity or strict word similarity. Typical queries are:
|
||||
<programlisting>
|
||||
SELECT t, word_similarity('<replaceable>word</replaceable>', t) AS sml
|
||||
FROM test_trgm
|
||||
WHERE '<replaceable>word</replaceable>' <% t
|
||||
ORDER BY sml DESC, t;
|
||||
</programlisting>
|
||||
and
|
||||
<programlisting>
|
||||
SELECT t, strict_word_similarity('<replaceable>word</replaceable>', t) AS sml
|
||||
FROM test_trgm
|
||||
WHERE '<replaceable>word</replaceable>' <<% t
|
||||
ORDER BY sml DESC, t;
|
||||
</programlisting>
|
||||
This will return all values in the text column for which there is a
|
||||
continuous extent in the corresponding ordered trigram set that is
|
||||
@ -337,11 +415,17 @@ SELECT t, word_similarity('<replaceable>word</replaceable>', t) AS sml
|
||||
</para>
|
||||
|
||||
<para>
|
||||
A variant of the above query is
|
||||
Possible variants of the above queries are:
|
||||
<programlisting>
|
||||
SELECT t, '<replaceable>word</replaceable>' <<-> t AS dist
|
||||
FROM test_trgm
|
||||
ORDER BY dist LIMIT 10;
|
||||
</programlisting>
|
||||
and
|
||||
<programlisting>
|
||||
SELECT t, '<replaceable>word</replaceable>' <<<-> t AS dist
|
||||
FROM test_trgm
|
||||
ORDER BY dist LIMIT 10;
|
||||
</programlisting>
|
||||
This can be implemented quite efficiently by GiST indexes, but not
|
||||
by GIN indexes.
|
||||
|
Loading…
Reference in New Issue
Block a user