mirror of
https://git.postgresql.org/git/postgresql.git
synced 2025-01-12 18:34:36 +08:00
3ed2005ff5
Our usual practice for "poor man's enum" catalog columns is to define macros for the possible values and use those, not literal constants, in C code. But for some reason lost in the mists of time, this was never done for typalign/attalign or typstorage/attstorage. It's never too late to make it better though, so let's do that. The reason I got interested in this right now is the need to duplicate some uses of the TYPSTORAGE constants in an upcoming ALTER TYPE patch. But in general, this sort of change aids greppability and readability, so it's a good idea even without any specific motivation. I may have missed a few places that could be converted, and it's even more likely that pending patches will re-introduce some hard-coded references. But that's not fatal --- there's no expectation that we'd actually change any of these values. We can clean up stragglers over time. Discussion: https://postgr.es/m/16457.1583189537@sss.pgh.pa.us
1322 lines
29 KiB
C
1322 lines
29 KiB
C
/*
|
|
* contrib/pg_trgm/trgm_op.c
|
|
*/
|
|
#include "postgres.h"
|
|
|
|
#include <ctype.h>
|
|
|
|
#include "catalog/pg_type.h"
|
|
#include "lib/qunique.h"
|
|
#include "trgm.h"
|
|
#include "tsearch/ts_locale.h"
|
|
#include "utils/lsyscache.h"
|
|
#include "utils/memutils.h"
|
|
#include "utils/pg_crc.h"
|
|
|
|
PG_MODULE_MAGIC;
|
|
|
|
/* GUC variables */
|
|
double similarity_threshold = 0.3f;
|
|
double word_similarity_threshold = 0.6f;
|
|
double strict_word_similarity_threshold = 0.5f;
|
|
|
|
void _PG_init(void);
|
|
|
|
PG_FUNCTION_INFO_V1(set_limit);
|
|
PG_FUNCTION_INFO_V1(show_limit);
|
|
PG_FUNCTION_INFO_V1(show_trgm);
|
|
PG_FUNCTION_INFO_V1(similarity);
|
|
PG_FUNCTION_INFO_V1(word_similarity);
|
|
PG_FUNCTION_INFO_V1(strict_word_similarity);
|
|
PG_FUNCTION_INFO_V1(similarity_dist);
|
|
PG_FUNCTION_INFO_V1(similarity_op);
|
|
PG_FUNCTION_INFO_V1(word_similarity_op);
|
|
PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
|
|
PG_FUNCTION_INFO_V1(word_similarity_dist_op);
|
|
PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
|
|
PG_FUNCTION_INFO_V1(strict_word_similarity_op);
|
|
PG_FUNCTION_INFO_V1(strict_word_similarity_commutator_op);
|
|
PG_FUNCTION_INFO_V1(strict_word_similarity_dist_op);
|
|
PG_FUNCTION_INFO_V1(strict_word_similarity_dist_commutator_op);
|
|
|
|
/* Trigram with position */
|
|
typedef struct
|
|
{
|
|
trgm trg;
|
|
int index;
|
|
} pos_trgm;
|
|
|
|
/* Trigram bound type */
|
|
typedef uint8 TrgmBound;
|
|
#define TRGM_BOUND_LEFT 0x01 /* trigram is left bound of word */
|
|
#define TRGM_BOUND_RIGHT 0x02 /* trigram is right bound of word */
|
|
|
|
/* Word similarity flags */
|
|
#define WORD_SIMILARITY_CHECK_ONLY 0x01 /* only check existence of similar
|
|
* search pattern in text */
|
|
#define WORD_SIMILARITY_STRICT 0x02 /* force bounds of extent to match
|
|
* word bounds */
|
|
|
|
/*
|
|
* Module load callback
|
|
*/
|
|
void
|
|
_PG_init(void)
|
|
{
|
|
/* Define custom GUC variables. */
|
|
DefineCustomRealVariable("pg_trgm.similarity_threshold",
|
|
"Sets the threshold used by the % operator.",
|
|
"Valid range is 0.0 .. 1.0.",
|
|
&similarity_threshold,
|
|
0.3,
|
|
0.0,
|
|
1.0,
|
|
PGC_USERSET,
|
|
0,
|
|
NULL,
|
|
NULL,
|
|
NULL);
|
|
DefineCustomRealVariable("pg_trgm.word_similarity_threshold",
|
|
"Sets the threshold used by the <% operator.",
|
|
"Valid range is 0.0 .. 1.0.",
|
|
&word_similarity_threshold,
|
|
0.6,
|
|
0.0,
|
|
1.0,
|
|
PGC_USERSET,
|
|
0,
|
|
NULL,
|
|
NULL,
|
|
NULL);
|
|
DefineCustomRealVariable("pg_trgm.strict_word_similarity_threshold",
|
|
"Sets the threshold used by the <<% operator.",
|
|
"Valid range is 0.0 .. 1.0.",
|
|
&strict_word_similarity_threshold,
|
|
0.5,
|
|
0.0,
|
|
1.0,
|
|
PGC_USERSET,
|
|
0,
|
|
NULL,
|
|
NULL,
|
|
NULL);
|
|
}
|
|
|
|
/*
|
|
* Deprecated function.
|
|
* Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
|
|
*/
|
|
Datum
|
|
set_limit(PG_FUNCTION_ARGS)
|
|
{
|
|
float4 nlimit = PG_GETARG_FLOAT4(0);
|
|
char *nlimit_str;
|
|
Oid func_out_oid;
|
|
bool is_varlena;
|
|
|
|
getTypeOutputInfo(FLOAT4OID, &func_out_oid, &is_varlena);
|
|
|
|
nlimit_str = OidOutputFunctionCall(func_out_oid, Float4GetDatum(nlimit));
|
|
|
|
SetConfigOption("pg_trgm.similarity_threshold", nlimit_str,
|
|
PGC_USERSET, PGC_S_SESSION);
|
|
|
|
PG_RETURN_FLOAT4(similarity_threshold);
|
|
}
|
|
|
|
|
|
/*
|
|
* Get similarity threshold for given index scan strategy number.
|
|
*/
|
|
double
|
|
index_strategy_get_limit(StrategyNumber strategy)
|
|
{
|
|
switch (strategy)
|
|
{
|
|
case SimilarityStrategyNumber:
|
|
return similarity_threshold;
|
|
case WordSimilarityStrategyNumber:
|
|
return word_similarity_threshold;
|
|
case StrictWordSimilarityStrategyNumber:
|
|
return strict_word_similarity_threshold;
|
|
default:
|
|
elog(ERROR, "unrecognized strategy number: %d", strategy);
|
|
break;
|
|
}
|
|
|
|
return 0.0; /* keep compiler quiet */
|
|
}
|
|
|
|
/*
|
|
* Deprecated function.
|
|
* Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
|
|
*/
|
|
Datum
|
|
show_limit(PG_FUNCTION_ARGS)
|
|
{
|
|
PG_RETURN_FLOAT4(similarity_threshold);
|
|
}
|
|
|
|
static int
|
|
comp_trgm(const void *a, const void *b)
|
|
{
|
|
return CMPTRGM(a, b);
|
|
}
|
|
|
|
/*
|
|
* Finds first word in string, returns pointer to the word,
|
|
* endword points to the character after word
|
|
*/
|
|
static char *
|
|
find_word(char *str, int lenstr, char **endword, int *charlen)
|
|
{
|
|
char *beginword = str;
|
|
|
|
while (beginword - str < lenstr && !ISWORDCHR(beginword))
|
|
beginword += pg_mblen(beginword);
|
|
|
|
if (beginword - str >= lenstr)
|
|
return NULL;
|
|
|
|
*endword = beginword;
|
|
*charlen = 0;
|
|
while (*endword - str < lenstr && ISWORDCHR(*endword))
|
|
{
|
|
*endword += pg_mblen(*endword);
|
|
(*charlen)++;
|
|
}
|
|
|
|
return beginword;
|
|
}
|
|
|
|
/*
|
|
* Reduce a trigram (three possibly multi-byte characters) to a trgm,
|
|
* which is always exactly three bytes. If we have three single-byte
|
|
* characters, we just use them as-is; otherwise we form a hash value.
|
|
*/
|
|
void
|
|
compact_trigram(trgm *tptr, char *str, int bytelen)
|
|
{
|
|
if (bytelen == 3)
|
|
{
|
|
CPTRGM(tptr, str);
|
|
}
|
|
else
|
|
{
|
|
pg_crc32 crc;
|
|
|
|
INIT_LEGACY_CRC32(crc);
|
|
COMP_LEGACY_CRC32(crc, str, bytelen);
|
|
FIN_LEGACY_CRC32(crc);
|
|
|
|
/*
|
|
* use only 3 upper bytes from crc, hope, it's good enough hashing
|
|
*/
|
|
CPTRGM(tptr, &crc);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Adds trigrams from words (already padded).
|
|
*/
|
|
static trgm *
|
|
make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
|
|
{
|
|
char *ptr = str;
|
|
|
|
if (charlen < 3)
|
|
return tptr;
|
|
|
|
if (bytelen > charlen)
|
|
{
|
|
/* Find multibyte character boundaries and apply compact_trigram */
|
|
int lenfirst = pg_mblen(str),
|
|
lenmiddle = pg_mblen(str + lenfirst),
|
|
lenlast = pg_mblen(str + lenfirst + lenmiddle);
|
|
|
|
while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
|
|
{
|
|
compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
|
|
|
|
ptr += lenfirst;
|
|
tptr++;
|
|
|
|
lenfirst = lenmiddle;
|
|
lenmiddle = lenlast;
|
|
lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* Fast path when there are no multibyte characters */
|
|
Assert(bytelen == charlen);
|
|
|
|
while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
|
|
{
|
|
CPTRGM(tptr, ptr);
|
|
ptr++;
|
|
tptr++;
|
|
}
|
|
}
|
|
|
|
return tptr;
|
|
}
|
|
|
|
/*
|
|
* Make array of trigrams without sorting and removing duplicate items.
|
|
*
|
|
* trg: where to return the array of trigrams.
|
|
* str: source string, of length slen bytes.
|
|
* bounds: where to return bounds of trigrams (if needed).
|
|
*
|
|
* Returns length of the generated array.
|
|
*/
|
|
static int
|
|
generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
|
|
{
|
|
trgm *tptr;
|
|
char *buf;
|
|
int charlen,
|
|
bytelen;
|
|
char *bword,
|
|
*eword;
|
|
|
|
if (slen + LPADDING + RPADDING < 3 || slen == 0)
|
|
return 0;
|
|
|
|
tptr = trg;
|
|
|
|
/* Allocate a buffer for case-folded, blank-padded words */
|
|
buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
|
|
|
|
if (LPADDING > 0)
|
|
{
|
|
*buf = ' ';
|
|
if (LPADDING > 1)
|
|
*(buf + 1) = ' ';
|
|
}
|
|
|
|
eword = str;
|
|
while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
|
|
{
|
|
#ifdef IGNORECASE
|
|
bword = lowerstr_with_len(bword, eword - bword);
|
|
bytelen = strlen(bword);
|
|
#else
|
|
bytelen = eword - bword;
|
|
#endif
|
|
|
|
memcpy(buf + LPADDING, bword, bytelen);
|
|
|
|
#ifdef IGNORECASE
|
|
pfree(bword);
|
|
#endif
|
|
|
|
buf[LPADDING + bytelen] = ' ';
|
|
buf[LPADDING + bytelen + 1] = ' ';
|
|
|
|
/* Calculate trigrams marking their bounds if needed */
|
|
if (bounds)
|
|
bounds[tptr - trg] |= TRGM_BOUND_LEFT;
|
|
tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
|
|
charlen + LPADDING + RPADDING);
|
|
if (bounds)
|
|
bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
|
|
}
|
|
|
|
pfree(buf);
|
|
|
|
return tptr - trg;
|
|
}
|
|
|
|
/*
|
|
* Guard against possible overflow in the palloc requests below. (We
|
|
* don't worry about the additive constants, since palloc can detect
|
|
* requests that are a little above MaxAllocSize --- we just need to
|
|
* prevent integer overflow in the multiplications.)
|
|
*/
|
|
static void
|
|
protect_out_of_mem(int slen)
|
|
{
|
|
if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
|
|
(Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
|
|
errmsg("out of memory")));
|
|
}
|
|
|
|
/*
|
|
* Make array of trigrams with sorting and removing duplicate items.
|
|
*
|
|
* str: source string, of length slen bytes.
|
|
*
|
|
* Returns the sorted array of unique trigrams.
|
|
*/
|
|
TRGM *
|
|
generate_trgm(char *str, int slen)
|
|
{
|
|
TRGM *trg;
|
|
int len;
|
|
|
|
protect_out_of_mem(slen);
|
|
|
|
trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
|
|
trg->flag = ARRKEY;
|
|
|
|
len = generate_trgm_only(GETARR(trg), str, slen, NULL);
|
|
SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
|
|
|
|
if (len == 0)
|
|
return trg;
|
|
|
|
/*
|
|
* Make trigrams unique.
|
|
*/
|
|
if (len > 1)
|
|
{
|
|
qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
|
|
len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
|
|
}
|
|
|
|
SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
|
|
|
|
return trg;
|
|
}
|
|
|
|
/*
|
|
* Make array of positional trigrams from two trigram arrays trg1 and trg2.
|
|
*
|
|
* trg1: trigram array of search pattern, of length len1. trg1 is required
|
|
* word which positions don't matter and replaced with -1.
|
|
* trg2: trigram array of text, of length len2. trg2 is haystack where we
|
|
* search and have to store its positions.
|
|
*
|
|
* Returns concatenated trigram array.
|
|
*/
|
|
static pos_trgm *
|
|
make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2)
|
|
{
|
|
pos_trgm *result;
|
|
int i,
|
|
len = len1 + len2;
|
|
|
|
result = (pos_trgm *) palloc(sizeof(pos_trgm) * len);
|
|
|
|
for (i = 0; i < len1; i++)
|
|
{
|
|
memcpy(&result[i].trg, &trg1[i], sizeof(trgm));
|
|
result[i].index = -1;
|
|
}
|
|
|
|
for (i = 0; i < len2; i++)
|
|
{
|
|
memcpy(&result[i + len1].trg, &trg2[i], sizeof(trgm));
|
|
result[i + len1].index = i;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/*
|
|
* Compare position trigrams: compare trigrams first and position second.
|
|
*/
|
|
static int
|
|
comp_ptrgm(const void *v1, const void *v2)
|
|
{
|
|
const pos_trgm *p1 = (const pos_trgm *) v1;
|
|
const pos_trgm *p2 = (const pos_trgm *) v2;
|
|
int cmp;
|
|
|
|
cmp = CMPTRGM(p1->trg, p2->trg);
|
|
if (cmp != 0)
|
|
return cmp;
|
|
|
|
if (p1->index < p2->index)
|
|
return -1;
|
|
else if (p1->index == p2->index)
|
|
return 0;
|
|
else
|
|
return 1;
|
|
}
|
|
|
|
/*
|
|
* Iterative search function which calculates maximum similarity with word in
|
|
* the string. But maximum similarity is calculated only if check_only == false.
|
|
*
|
|
* trg2indexes: array which stores indexes of the array "found".
|
|
* found: array which stores true of false values.
|
|
* ulen1: count of unique trigrams of array "trg1".
|
|
* len2: length of array "trg2" and array "trg2indexes".
|
|
* len: length of the array "found".
|
|
* lags: set of boolean flags parameterizing similarity calculation.
|
|
* bounds: whether each trigram is left/right bound of word.
|
|
*
|
|
* Returns word similarity.
|
|
*/
|
|
static float4
|
|
iterate_word_similarity(int *trg2indexes,
|
|
bool *found,
|
|
int ulen1,
|
|
int len2,
|
|
int len,
|
|
uint8 flags,
|
|
TrgmBound *bounds)
|
|
{
|
|
int *lastpos,
|
|
i,
|
|
ulen2 = 0,
|
|
count = 0,
|
|
upper = -1,
|
|
lower;
|
|
float4 smlr_cur,
|
|
smlr_max = 0.0f;
|
|
double threshold;
|
|
|
|
Assert(bounds || !(flags & WORD_SIMILARITY_STRICT));
|
|
|
|
/* Select appropriate threshold */
|
|
threshold = (flags & WORD_SIMILARITY_STRICT) ?
|
|
strict_word_similarity_threshold :
|
|
word_similarity_threshold;
|
|
|
|
/*
|
|
* Consider first trigram as initial lower bound for strict word
|
|
* similarity, or initialize it later with first trigram present for plain
|
|
* word similarity.
|
|
*/
|
|
lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1;
|
|
|
|
/* Memorise last position of each trigram */
|
|
lastpos = (int *) palloc(sizeof(int) * len);
|
|
memset(lastpos, -1, sizeof(int) * len);
|
|
|
|
for (i = 0; i < len2; i++)
|
|
{
|
|
/* Get index of next trigram */
|
|
int trgindex = trg2indexes[i];
|
|
|
|
/* Update last position of this trigram */
|
|
if (lower >= 0 || found[trgindex])
|
|
{
|
|
if (lastpos[trgindex] < 0)
|
|
{
|
|
ulen2++;
|
|
if (found[trgindex])
|
|
count++;
|
|
}
|
|
lastpos[trgindex] = i;
|
|
}
|
|
|
|
/*
|
|
* Adjust upper bound if trigram is upper bound of word for strict
|
|
* word similarity, or if trigram is present in required substring for
|
|
* plain word similarity
|
|
*/
|
|
if ((flags & WORD_SIMILARITY_STRICT) ? (bounds[i] & TRGM_BOUND_RIGHT)
|
|
: found[trgindex])
|
|
{
|
|
int prev_lower,
|
|
tmp_ulen2,
|
|
tmp_lower,
|
|
tmp_count;
|
|
|
|
upper = i;
|
|
if (lower == -1)
|
|
{
|
|
lower = i;
|
|
ulen2 = 1;
|
|
}
|
|
|
|
smlr_cur = CALCSML(count, ulen1, ulen2);
|
|
|
|
/* Also try to adjust lower bound for greater similarity */
|
|
tmp_count = count;
|
|
tmp_ulen2 = ulen2;
|
|
prev_lower = lower;
|
|
for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
|
|
{
|
|
float smlr_tmp;
|
|
int tmp_trgindex;
|
|
|
|
/*
|
|
* Adjust lower bound only if trigram is lower bound of word
|
|
* for strict word similarity, or consider every trigram as
|
|
* lower bound for plain word similarity.
|
|
*/
|
|
if (!(flags & WORD_SIMILARITY_STRICT)
|
|
|| (bounds[tmp_lower] & TRGM_BOUND_LEFT))
|
|
{
|
|
smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
|
|
if (smlr_tmp > smlr_cur)
|
|
{
|
|
smlr_cur = smlr_tmp;
|
|
ulen2 = tmp_ulen2;
|
|
lower = tmp_lower;
|
|
count = tmp_count;
|
|
}
|
|
|
|
/*
|
|
* If we only check that word similarity is greater than
|
|
* threshold we do not need to calculate a maximum
|
|
* similarity.
|
|
*/
|
|
if ((flags & WORD_SIMILARITY_CHECK_ONLY)
|
|
&& smlr_cur >= threshold)
|
|
break;
|
|
}
|
|
|
|
tmp_trgindex = trg2indexes[tmp_lower];
|
|
if (lastpos[tmp_trgindex] == tmp_lower)
|
|
{
|
|
tmp_ulen2--;
|
|
if (found[tmp_trgindex])
|
|
tmp_count--;
|
|
}
|
|
}
|
|
|
|
smlr_max = Max(smlr_max, smlr_cur);
|
|
|
|
/*
|
|
* if we only check that word similarity is greater than threshold
|
|
* we do not need to calculate a maximum similarity.
|
|
*/
|
|
if ((flags & WORD_SIMILARITY_CHECK_ONLY) && smlr_max >= threshold)
|
|
break;
|
|
|
|
for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
|
|
{
|
|
int tmp_trgindex;
|
|
|
|
tmp_trgindex = trg2indexes[tmp_lower];
|
|
if (lastpos[tmp_trgindex] == tmp_lower)
|
|
lastpos[tmp_trgindex] = -1;
|
|
}
|
|
}
|
|
}
|
|
|
|
pfree(lastpos);
|
|
|
|
return smlr_max;
|
|
}
|
|
|
|
/*
|
|
* Calculate word similarity.
|
|
* This function prepare two arrays: "trg2indexes" and "found". Then this arrays
|
|
* are used to calculate word similarity using iterate_word_similarity().
|
|
*
|
|
* "trg2indexes" is array which stores indexes of the array "found".
|
|
* In other words:
|
|
* trg2indexes[j] = i;
|
|
* found[i] = true (or false);
|
|
* If found[i] == true then there is trigram trg2[j] in array "trg1".
|
|
* If found[i] == false then there is not trigram trg2[j] in array "trg1".
|
|
*
|
|
* str1: search pattern string, of length slen1 bytes.
|
|
* str2: text in which we are looking for a word, of length slen2 bytes.
|
|
* flags: set of boolean flags parameterizing similarity calculation.
|
|
*
|
|
* Returns word similarity.
|
|
*/
|
|
static float4
|
|
calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
|
|
uint8 flags)
|
|
{
|
|
bool *found;
|
|
pos_trgm *ptrg;
|
|
trgm *trg1;
|
|
trgm *trg2;
|
|
int len1,
|
|
len2,
|
|
len,
|
|
i,
|
|
j,
|
|
ulen1;
|
|
int *trg2indexes;
|
|
float4 result;
|
|
TrgmBound *bounds;
|
|
|
|
protect_out_of_mem(slen1 + slen2);
|
|
|
|
/* Make positional trigrams */
|
|
trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
|
|
trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);
|
|
if (flags & WORD_SIMILARITY_STRICT)
|
|
bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3);
|
|
else
|
|
bounds = NULL;
|
|
|
|
len1 = generate_trgm_only(trg1, str1, slen1, NULL);
|
|
len2 = generate_trgm_only(trg2, str2, slen2, bounds);
|
|
|
|
ptrg = make_positional_trgm(trg1, len1, trg2, len2);
|
|
len = len1 + len2;
|
|
qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);
|
|
|
|
pfree(trg1);
|
|
pfree(trg2);
|
|
|
|
/*
|
|
* Merge positional trigrams array: enumerate each trigram and find its
|
|
* presence in required word.
|
|
*/
|
|
trg2indexes = (int *) palloc(sizeof(int) * len2);
|
|
found = (bool *) palloc0(sizeof(bool) * len);
|
|
|
|
ulen1 = 0;
|
|
j = 0;
|
|
for (i = 0; i < len; i++)
|
|
{
|
|
if (i > 0)
|
|
{
|
|
int cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg);
|
|
|
|
if (cmp != 0)
|
|
{
|
|
if (found[j])
|
|
ulen1++;
|
|
j++;
|
|
}
|
|
}
|
|
|
|
if (ptrg[i].index >= 0)
|
|
{
|
|
trg2indexes[ptrg[i].index] = j;
|
|
}
|
|
else
|
|
{
|
|
found[j] = true;
|
|
}
|
|
}
|
|
if (found[j])
|
|
ulen1++;
|
|
|
|
/* Run iterative procedure to find maximum similarity with word */
|
|
result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
|
|
flags, bounds);
|
|
|
|
pfree(trg2indexes);
|
|
pfree(found);
|
|
pfree(ptrg);
|
|
|
|
return result;
|
|
}
|
|
|
|
|
|
/*
|
|
* Extract the next non-wildcard part of a search string, i.e. a word bounded
|
|
* by '_' or '%' meta-characters, non-word characters or string end.
|
|
*
|
|
* str: source string, of length lenstr bytes (need not be null-terminated)
|
|
* buf: where to return the substring (must be long enough)
|
|
* *bytelen: receives byte length of the found substring
|
|
* *charlen: receives character length of the found substring
|
|
*
|
|
* Returns pointer to end+1 of the found substring in the source string.
|
|
* Returns NULL if no word found (in which case buf, bytelen, charlen not set)
|
|
*
|
|
* If the found word is bounded by non-word characters or string boundaries
|
|
* then this function will include corresponding padding spaces into buf.
|
|
*/
|
|
static const char *
|
|
get_wildcard_part(const char *str, int lenstr,
|
|
char *buf, int *bytelen, int *charlen)
|
|
{
|
|
const char *beginword = str;
|
|
const char *endword;
|
|
char *s = buf;
|
|
bool in_leading_wildcard_meta = false;
|
|
bool in_trailing_wildcard_meta = false;
|
|
bool in_escape = false;
|
|
int clen;
|
|
|
|
/*
|
|
* Find the first word character, remembering whether preceding character
|
|
* was wildcard meta-character. Note that the in_escape state persists
|
|
* from this loop to the next one, since we may exit at a word character
|
|
* that is in_escape.
|
|
*/
|
|
while (beginword - str < lenstr)
|
|
{
|
|
if (in_escape)
|
|
{
|
|
if (ISWORDCHR(beginword))
|
|
break;
|
|
in_escape = false;
|
|
in_leading_wildcard_meta = false;
|
|
}
|
|
else
|
|
{
|
|
if (ISESCAPECHAR(beginword))
|
|
in_escape = true;
|
|
else if (ISWILDCARDCHAR(beginword))
|
|
in_leading_wildcard_meta = true;
|
|
else if (ISWORDCHR(beginword))
|
|
break;
|
|
else
|
|
in_leading_wildcard_meta = false;
|
|
}
|
|
beginword += pg_mblen(beginword);
|
|
}
|
|
|
|
/*
|
|
* Handle string end.
|
|
*/
|
|
if (beginword - str >= lenstr)
|
|
return NULL;
|
|
|
|
/*
|
|
* Add left padding spaces if preceding character wasn't wildcard
|
|
* meta-character.
|
|
*/
|
|
*charlen = 0;
|
|
if (!in_leading_wildcard_meta)
|
|
{
|
|
if (LPADDING > 0)
|
|
{
|
|
*s++ = ' ';
|
|
(*charlen)++;
|
|
if (LPADDING > 1)
|
|
{
|
|
*s++ = ' ';
|
|
(*charlen)++;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Copy data into buf until wildcard meta-character, non-word character or
|
|
* string boundary. Strip escapes during copy.
|
|
*/
|
|
endword = beginword;
|
|
while (endword - str < lenstr)
|
|
{
|
|
clen = pg_mblen(endword);
|
|
if (in_escape)
|
|
{
|
|
if (ISWORDCHR(endword))
|
|
{
|
|
memcpy(s, endword, clen);
|
|
(*charlen)++;
|
|
s += clen;
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* Back up endword to the escape character when stopping at an
|
|
* escaped char, so that subsequent get_wildcard_part will
|
|
* restart from the escape character. We assume here that
|
|
* escape chars are single-byte.
|
|
*/
|
|
endword--;
|
|
break;
|
|
}
|
|
in_escape = false;
|
|
}
|
|
else
|
|
{
|
|
if (ISESCAPECHAR(endword))
|
|
in_escape = true;
|
|
else if (ISWILDCARDCHAR(endword))
|
|
{
|
|
in_trailing_wildcard_meta = true;
|
|
break;
|
|
}
|
|
else if (ISWORDCHR(endword))
|
|
{
|
|
memcpy(s, endword, clen);
|
|
(*charlen)++;
|
|
s += clen;
|
|
}
|
|
else
|
|
break;
|
|
}
|
|
endword += clen;
|
|
}
|
|
|
|
/*
|
|
* Add right padding spaces if next character isn't wildcard
|
|
* meta-character.
|
|
*/
|
|
if (!in_trailing_wildcard_meta)
|
|
{
|
|
if (RPADDING > 0)
|
|
{
|
|
*s++ = ' ';
|
|
(*charlen)++;
|
|
if (RPADDING > 1)
|
|
{
|
|
*s++ = ' ';
|
|
(*charlen)++;
|
|
}
|
|
}
|
|
}
|
|
|
|
*bytelen = s - buf;
|
|
return endword;
|
|
}
|
|
|
|
/*
|
|
* Generates trigrams for wildcard search string.
|
|
*
|
|
* Returns array of trigrams that must occur in any string that matches the
|
|
* wildcard string. For example, given pattern "a%bcd%" the trigrams
|
|
* " a", "bcd" would be extracted.
|
|
*/
|
|
TRGM *
|
|
generate_wildcard_trgm(const char *str, int slen)
|
|
{
|
|
TRGM *trg;
|
|
char *buf,
|
|
*buf2;
|
|
trgm *tptr;
|
|
int len,
|
|
charlen,
|
|
bytelen;
|
|
const char *eword;
|
|
|
|
protect_out_of_mem(slen);
|
|
|
|
trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
|
|
trg->flag = ARRKEY;
|
|
SET_VARSIZE(trg, TRGMHDRSIZE);
|
|
|
|
if (slen + LPADDING + RPADDING < 3 || slen == 0)
|
|
return trg;
|
|
|
|
tptr = GETARR(trg);
|
|
|
|
/* Allocate a buffer for blank-padded, but not yet case-folded, words */
|
|
buf = palloc(sizeof(char) * (slen + 4));
|
|
|
|
/*
|
|
* Extract trigrams from each substring extracted by get_wildcard_part.
|
|
*/
|
|
eword = str;
|
|
while ((eword = get_wildcard_part(eword, slen - (eword - str),
|
|
buf, &bytelen, &charlen)) != NULL)
|
|
{
|
|
#ifdef IGNORECASE
|
|
buf2 = lowerstr_with_len(buf, bytelen);
|
|
bytelen = strlen(buf2);
|
|
#else
|
|
buf2 = buf;
|
|
#endif
|
|
|
|
/*
|
|
* count trigrams
|
|
*/
|
|
tptr = make_trigrams(tptr, buf2, bytelen, charlen);
|
|
|
|
#ifdef IGNORECASE
|
|
pfree(buf2);
|
|
#endif
|
|
}
|
|
|
|
pfree(buf);
|
|
|
|
if ((len = tptr - GETARR(trg)) == 0)
|
|
return trg;
|
|
|
|
/*
|
|
* Make trigrams unique.
|
|
*/
|
|
if (len > 1)
|
|
{
|
|
qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
|
|
len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
|
|
}
|
|
|
|
SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
|
|
|
|
return trg;
|
|
}
|
|
|
|
uint32
|
|
trgm2int(trgm *ptr)
|
|
{
|
|
uint32 val = 0;
|
|
|
|
val |= *(((unsigned char *) ptr));
|
|
val <<= 8;
|
|
val |= *(((unsigned char *) ptr) + 1);
|
|
val <<= 8;
|
|
val |= *(((unsigned char *) ptr) + 2);
|
|
|
|
return val;
|
|
}
|
|
|
|
Datum
|
|
show_trgm(PG_FUNCTION_ARGS)
|
|
{
|
|
text *in = PG_GETARG_TEXT_PP(0);
|
|
TRGM *trg;
|
|
Datum *d;
|
|
ArrayType *a;
|
|
trgm *ptr;
|
|
int i;
|
|
|
|
trg = generate_trgm(VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
|
|
d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg)));
|
|
|
|
for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
|
|
{
|
|
text *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length() * 3));
|
|
|
|
if (pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr))
|
|
{
|
|
snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));
|
|
SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
|
|
}
|
|
else
|
|
{
|
|
SET_VARSIZE(item, VARHDRSZ + 3);
|
|
CPTRGM(VARDATA(item), ptr);
|
|
}
|
|
d[i] = PointerGetDatum(item);
|
|
}
|
|
|
|
a = construct_array(d,
|
|
ARRNELEM(trg),
|
|
TEXTOID,
|
|
-1,
|
|
false,
|
|
TYPALIGN_INT);
|
|
|
|
for (i = 0; i < ARRNELEM(trg); i++)
|
|
pfree(DatumGetPointer(d[i]));
|
|
|
|
pfree(d);
|
|
pfree(trg);
|
|
PG_FREE_IF_COPY(in, 0);
|
|
|
|
PG_RETURN_POINTER(a);
|
|
}
|
|
|
|
float4
|
|
cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
|
|
{
|
|
trgm *ptr1,
|
|
*ptr2;
|
|
int count = 0;
|
|
int len1,
|
|
len2;
|
|
|
|
ptr1 = GETARR(trg1);
|
|
ptr2 = GETARR(trg2);
|
|
|
|
len1 = ARRNELEM(trg1);
|
|
len2 = ARRNELEM(trg2);
|
|
|
|
/* explicit test is needed to avoid 0/0 division when both lengths are 0 */
|
|
if (len1 <= 0 || len2 <= 0)
|
|
return (float4) 0.0;
|
|
|
|
while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
|
|
{
|
|
int res = CMPTRGM(ptr1, ptr2);
|
|
|
|
if (res < 0)
|
|
ptr1++;
|
|
else if (res > 0)
|
|
ptr2++;
|
|
else
|
|
{
|
|
ptr1++;
|
|
ptr2++;
|
|
count++;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* If inexact then len2 is equal to count, because we don't know actual
|
|
* length of second string in inexact search and we can assume that count
|
|
* is a lower bound of len2.
|
|
*/
|
|
return CALCSML(count, len1, inexact ? count : len2);
|
|
}
|
|
|
|
|
|
/*
|
|
* Returns whether trg2 contains all trigrams in trg1.
|
|
* This relies on the trigram arrays being sorted.
|
|
*/
|
|
bool
|
|
trgm_contained_by(TRGM *trg1, TRGM *trg2)
|
|
{
|
|
trgm *ptr1,
|
|
*ptr2;
|
|
int len1,
|
|
len2;
|
|
|
|
ptr1 = GETARR(trg1);
|
|
ptr2 = GETARR(trg2);
|
|
|
|
len1 = ARRNELEM(trg1);
|
|
len2 = ARRNELEM(trg2);
|
|
|
|
while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
|
|
{
|
|
int res = CMPTRGM(ptr1, ptr2);
|
|
|
|
if (res < 0)
|
|
return false;
|
|
else if (res > 0)
|
|
ptr2++;
|
|
else
|
|
{
|
|
ptr1++;
|
|
ptr2++;
|
|
}
|
|
}
|
|
if (ptr1 - GETARR(trg1) < len1)
|
|
return false;
|
|
else
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* Return a palloc'd boolean array showing, for each trigram in "query",
|
|
* whether it is present in the trigram array "key".
|
|
* This relies on the "key" array being sorted, but "query" need not be.
|
|
*/
|
|
bool *
|
|
trgm_presence_map(TRGM *query, TRGM *key)
|
|
{
|
|
bool *result;
|
|
trgm *ptrq = GETARR(query),
|
|
*ptrk = GETARR(key);
|
|
int lenq = ARRNELEM(query),
|
|
lenk = ARRNELEM(key),
|
|
i;
|
|
|
|
result = (bool *) palloc0(lenq * sizeof(bool));
|
|
|
|
/* for each query trigram, do a binary search in the key array */
|
|
for (i = 0; i < lenq; i++)
|
|
{
|
|
int lo = 0;
|
|
int hi = lenk;
|
|
|
|
while (lo < hi)
|
|
{
|
|
int mid = (lo + hi) / 2;
|
|
int res = CMPTRGM(ptrq, ptrk + mid);
|
|
|
|
if (res < 0)
|
|
hi = mid;
|
|
else if (res > 0)
|
|
lo = mid + 1;
|
|
else
|
|
{
|
|
result[i] = true;
|
|
break;
|
|
}
|
|
}
|
|
ptrq++;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
Datum
|
|
similarity(PG_FUNCTION_ARGS)
|
|
{
|
|
text *in1 = PG_GETARG_TEXT_PP(0);
|
|
text *in2 = PG_GETARG_TEXT_PP(1);
|
|
TRGM *trg1,
|
|
*trg2;
|
|
float4 res;
|
|
|
|
trg1 = generate_trgm(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1));
|
|
trg2 = generate_trgm(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2));
|
|
|
|
res = cnt_sml(trg1, trg2, false);
|
|
|
|
pfree(trg1);
|
|
pfree(trg2);
|
|
PG_FREE_IF_COPY(in1, 0);
|
|
PG_FREE_IF_COPY(in2, 1);
|
|
|
|
PG_RETURN_FLOAT4(res);
|
|
}
|
|
|
|
Datum
|
|
word_similarity(PG_FUNCTION_ARGS)
|
|
{
|
|
text *in1 = PG_GETARG_TEXT_PP(0);
|
|
text *in2 = PG_GETARG_TEXT_PP(1);
|
|
float4 res;
|
|
|
|
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
|
|
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
|
|
0);
|
|
|
|
PG_FREE_IF_COPY(in1, 0);
|
|
PG_FREE_IF_COPY(in2, 1);
|
|
PG_RETURN_FLOAT4(res);
|
|
}
|
|
|
|
Datum
|
|
strict_word_similarity(PG_FUNCTION_ARGS)
|
|
{
|
|
text *in1 = PG_GETARG_TEXT_PP(0);
|
|
text *in2 = PG_GETARG_TEXT_PP(1);
|
|
float4 res;
|
|
|
|
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
|
|
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
|
|
WORD_SIMILARITY_STRICT);
|
|
|
|
PG_FREE_IF_COPY(in1, 0);
|
|
PG_FREE_IF_COPY(in2, 1);
|
|
PG_RETURN_FLOAT4(res);
|
|
}
|
|
|
|
Datum
|
|
similarity_dist(PG_FUNCTION_ARGS)
|
|
{
|
|
float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
|
|
PG_GETARG_DATUM(0),
|
|
PG_GETARG_DATUM(1)));
|
|
|
|
PG_RETURN_FLOAT4(1.0 - res);
|
|
}
|
|
|
|
Datum
|
|
similarity_op(PG_FUNCTION_ARGS)
|
|
{
|
|
float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
|
|
PG_GETARG_DATUM(0),
|
|
PG_GETARG_DATUM(1)));
|
|
|
|
PG_RETURN_BOOL(res >= similarity_threshold);
|
|
}
|
|
|
|
Datum
|
|
word_similarity_op(PG_FUNCTION_ARGS)
|
|
{
|
|
text *in1 = PG_GETARG_TEXT_PP(0);
|
|
text *in2 = PG_GETARG_TEXT_PP(1);
|
|
float4 res;
|
|
|
|
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
|
|
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
|
|
WORD_SIMILARITY_CHECK_ONLY);
|
|
|
|
PG_FREE_IF_COPY(in1, 0);
|
|
PG_FREE_IF_COPY(in2, 1);
|
|
PG_RETURN_BOOL(res >= word_similarity_threshold);
|
|
}
|
|
|
|
Datum
|
|
word_similarity_commutator_op(PG_FUNCTION_ARGS)
|
|
{
|
|
text *in1 = PG_GETARG_TEXT_PP(0);
|
|
text *in2 = PG_GETARG_TEXT_PP(1);
|
|
float4 res;
|
|
|
|
res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
|
|
VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
|
|
WORD_SIMILARITY_CHECK_ONLY);
|
|
|
|
PG_FREE_IF_COPY(in1, 0);
|
|
PG_FREE_IF_COPY(in2, 1);
|
|
PG_RETURN_BOOL(res >= word_similarity_threshold);
|
|
}
|
|
|
|
Datum
|
|
word_similarity_dist_op(PG_FUNCTION_ARGS)
|
|
{
|
|
text *in1 = PG_GETARG_TEXT_PP(0);
|
|
text *in2 = PG_GETARG_TEXT_PP(1);
|
|
float4 res;
|
|
|
|
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
|
|
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
|
|
0);
|
|
|
|
PG_FREE_IF_COPY(in1, 0);
|
|
PG_FREE_IF_COPY(in2, 1);
|
|
PG_RETURN_FLOAT4(1.0 - res);
|
|
}
|
|
|
|
Datum
|
|
word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
|
|
{
|
|
text *in1 = PG_GETARG_TEXT_PP(0);
|
|
text *in2 = PG_GETARG_TEXT_PP(1);
|
|
float4 res;
|
|
|
|
res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
|
|
VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
|
|
0);
|
|
|
|
PG_FREE_IF_COPY(in1, 0);
|
|
PG_FREE_IF_COPY(in2, 1);
|
|
PG_RETURN_FLOAT4(1.0 - res);
|
|
}
|
|
|
|
Datum
|
|
strict_word_similarity_op(PG_FUNCTION_ARGS)
|
|
{
|
|
text *in1 = PG_GETARG_TEXT_PP(0);
|
|
text *in2 = PG_GETARG_TEXT_PP(1);
|
|
float4 res;
|
|
|
|
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
|
|
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
|
|
WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
|
|
|
|
PG_FREE_IF_COPY(in1, 0);
|
|
PG_FREE_IF_COPY(in2, 1);
|
|
PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
|
|
}
|
|
|
|
Datum
|
|
strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)
|
|
{
|
|
text *in1 = PG_GETARG_TEXT_PP(0);
|
|
text *in2 = PG_GETARG_TEXT_PP(1);
|
|
float4 res;
|
|
|
|
res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
|
|
VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
|
|
WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
|
|
|
|
PG_FREE_IF_COPY(in1, 0);
|
|
PG_FREE_IF_COPY(in2, 1);
|
|
PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
|
|
}
|
|
|
|
Datum
|
|
strict_word_similarity_dist_op(PG_FUNCTION_ARGS)
|
|
{
|
|
text *in1 = PG_GETARG_TEXT_PP(0);
|
|
text *in2 = PG_GETARG_TEXT_PP(1);
|
|
float4 res;
|
|
|
|
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
|
|
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
|
|
WORD_SIMILARITY_STRICT);
|
|
|
|
PG_FREE_IF_COPY(in1, 0);
|
|
PG_FREE_IF_COPY(in2, 1);
|
|
PG_RETURN_FLOAT4(1.0 - res);
|
|
}
|
|
|
|
Datum
|
|
strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
|
|
{
|
|
text *in1 = PG_GETARG_TEXT_PP(0);
|
|
text *in2 = PG_GETARG_TEXT_PP(1);
|
|
float4 res;
|
|
|
|
res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
|
|
VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
|
|
WORD_SIMILARITY_STRICT);
|
|
|
|
PG_FREE_IF_COPY(in1, 0);
|
|
PG_FREE_IF_COPY(in2, 1);
|
|
PG_RETURN_FLOAT4(1.0 - res);
|
|
}
|