mirror of
https://git.postgresql.org/git/postgresql.git
synced 2025-03-25 20:10:41 +08:00
Add support for Daitch-Mokotoff Soundex in contrib/fuzzystrmatch.
This modernized version of Soundex works significantly better than the original, particularly for non-English names. Dag Lem, reviewed by quite a few people along the way. Discussion: https://postgr.es/m/yger1atbgfy.fsf@sid.nimrod.no
This commit is contained in:
parent
728015a470
commit
a290378a37
2
contrib/fuzzystrmatch/.gitignore
vendored
2
contrib/fuzzystrmatch/.gitignore
vendored
@ -1,3 +1,5 @@
|
||||
# Generated files
|
||||
/daitch_mokotoff.h
|
||||
# Generated subdirectories
|
||||
/log/
|
||||
/results/
|
||||
|
@ -3,14 +3,17 @@
|
||||
MODULE_big = fuzzystrmatch
|
||||
OBJS = \
|
||||
$(WIN32RES) \
|
||||
daitch_mokotoff.o \
|
||||
dmetaphone.o \
|
||||
fuzzystrmatch.o
|
||||
|
||||
EXTENSION = fuzzystrmatch
|
||||
DATA = fuzzystrmatch--1.1.sql fuzzystrmatch--1.0--1.1.sql
|
||||
DATA = fuzzystrmatch--1.1.sql fuzzystrmatch--1.1--1.2.sql \
|
||||
fuzzystrmatch--1.0--1.1.sql
|
||||
|
||||
PGFILEDESC = "fuzzystrmatch - similarities and distance between strings"
|
||||
|
||||
REGRESS = fuzzystrmatch
|
||||
REGRESS = fuzzystrmatch fuzzystrmatch_utf8
|
||||
|
||||
ifdef USE_PGXS
|
||||
PG_CONFIG = pg_config
|
||||
@ -22,3 +25,16 @@ top_builddir = ../..
|
||||
include $(top_builddir)/src/Makefile.global
|
||||
include $(top_srcdir)/contrib/contrib-global.mk
|
||||
endif
|
||||
|
||||
# Force this dependency to be known even without dependency info built:
|
||||
daitch_mokotoff.o: daitch_mokotoff.h
|
||||
|
||||
daitch_mokotoff.h: daitch_mokotoff_header.pl
|
||||
$(PERL) $< $@
|
||||
|
||||
# daitch_mokotoff.h is included in tarballs, so it has to be made by
|
||||
# "distprep" and not cleaned except by "maintainer-clean".
|
||||
distprep: daitch_mokotoff.h
|
||||
|
||||
maintainer-clean:
|
||||
rm -f daitch_mokotoff.h
|
||||
|
577
contrib/fuzzystrmatch/daitch_mokotoff.c
Normal file
577
contrib/fuzzystrmatch/daitch_mokotoff.c
Normal file
@ -0,0 +1,577 @@
|
||||
/*
|
||||
* Daitch-Mokotoff Soundex
|
||||
*
|
||||
* Copyright (c) 2023, PostgreSQL Global Development Group
|
||||
*
|
||||
* This module was originally sponsored by Finance Norway /
|
||||
* Trafikkforsikringsforeningen, and implemented by Dag Lem <dag@nimrod.no>
|
||||
*
|
||||
* The implementation of the Daitch-Mokotoff Soundex System aims at correctness
|
||||
* and high performance, and can be summarized as follows:
|
||||
*
|
||||
* - The processing of each phoneme is initiated by an O(1) table lookup.
|
||||
* - For phonemes containing more than one character, a coding tree is traversed
|
||||
* to process the complete phoneme.
|
||||
* - The (alternate) soundex codes are produced digit by digit in-place in
|
||||
* another tree structure.
|
||||
*
|
||||
* References:
|
||||
*
|
||||
* https://www.avotaynu.com/soundex.htm
|
||||
* https://www.jewishgen.org/InfoFiles/Soundex.html
|
||||
* https://familypedia.fandom.com/wiki/Daitch-Mokotoff_Soundex
|
||||
* https://stevemorse.org/census/soundex.html (dmlat.php, dmsoundex.php)
|
||||
* https://github.com/apache/commons-codec/ (dmrules.txt, DaitchMokotoffSoundex.java)
|
||||
* https://metacpan.org/pod/Text::Phonetic (DaitchMokotoff.pm)
|
||||
*
|
||||
* A few notes on other implementations:
|
||||
*
|
||||
* - All other known implementations have the same unofficial rules for "UE",
|
||||
* these are also adopted by this implementation (0, 1, NC).
|
||||
* - The only other known implementation which is capable of generating all
|
||||
* correct soundex codes in all cases is the JOS Soundex Calculator at
|
||||
* https://www.jewishgen.org/jos/jossound.htm
|
||||
* - "J" is considered (only) a vowel in dmlat.php
|
||||
* - The official rules for "RS" are commented out in dmlat.php
|
||||
* - Identical code digits for adjacent letters are not collapsed correctly in
|
||||
* dmsoundex.php when double digit codes are involved. E.g. "BESST" yields
|
||||
* 744300 instead of 743000 as for "BEST".
|
||||
* - "J" is considered (only) a consonant in DaitchMokotoffSoundex.java
|
||||
* - "Y" is not considered a vowel in DaitchMokotoffSoundex.java
|
||||
*/
|
||||
|
||||
#include "postgres.h"
|
||||
|
||||
#include "catalog/pg_type.h"
|
||||
#include "mb/pg_wchar.h"
|
||||
#include "utils/array.h"
|
||||
#include "utils/builtins.h"
|
||||
#include "utils/memutils.h"
|
||||
|
||||
|
||||
/*
|
||||
* The soundex coding chart table is adapted from
|
||||
* https://www.jewishgen.org/InfoFiles/Soundex.html
|
||||
* See daitch_mokotoff_header.pl for details.
|
||||
*/
|
||||
|
||||
/* Generated coding chart table */
|
||||
#include "daitch_mokotoff.h"
|
||||
|
||||
/* Number of digits in a complete soundex code. */
#define DM_CODE_DIGITS 6

typedef struct dm_node dm_node;

/*
 * Node in soundex code tree.
 *
 * Each node holds the soundex code generated so far on the path leading to
 * it, plus the bookkeeping needed to decide how the next letter extends it.
 */
struct dm_node
{
	/* Length of generated soundex code */
	int			soundex_length;
	/* Soundex code (not NUL terminated) */
	char		soundex[DM_CODE_DIGITS];
	/* Candidate for complete soundex code */
	int			is_leaf;
	/* Letter number for last update of node */
	int			last_update;
	/* Last code digit, 0 - 9 */
	char		code_digit;

	/*
	 * One or two alternate code digits leading to this node. If there are two
	 * digits, one of them is always an 'X'. Repeated code digits and 'X' lead
	 * back to the same node.
	 */
	char		prev_code_digits[2];
	/* One or two alternate code digits moving forward. */
	char		next_code_digits[2];

	/* ORed together code index(es) used to reach current node. */
	int			prev_code_index;
	int			next_code_index;

	/* Possible nodes branching out from this node - digits 0-9. */
	dm_node    *children[10];
	/* Next node in linked list. Alternating index for each iteration. */
	dm_node    *next[2];
};
|
||||
|
||||
/* Template for new node in soundex code tree. */
|
||||
static const dm_node start_node = {
|
||||
.soundex_length = 0,
|
||||
.soundex = "000000", /* Six digits */
|
||||
.is_leaf = 0,
|
||||
.last_update = 0,
|
||||
.code_digit = '\0',
|
||||
.prev_code_digits = {'\0', '\0'},
|
||||
.next_code_digits = {'\0', '\0'},
|
||||
.prev_code_index = 0,
|
||||
.next_code_index = 0,
|
||||
.children = {NULL},
|
||||
.next = {NULL}
|
||||
};
|
||||
|
||||
/* Dummy soundex codes at end of input. */
|
||||
static const dm_codes end_codes[2] =
|
||||
{
|
||||
{
|
||||
"X", "X", "X"
|
||||
}
|
||||
};
|
||||
|
||||
/*
 * Mapping from ISO8859-1 to upper-case ASCII, covering the range 0x60..0xFF.
 *
 * The table is indexed by (code point - 0x60) and must therefore contain
 * exactly 0x100 - 0x60 = 160 characters. It is written as one string literal
 * per code point range, with the length of each range stated, so that the
 * runs of blanks can be verified and cannot silently change length.
 *
 * Code points without a letter counterpart map to a character outside the
 * range 'A'..']', which makes read_valid_char() skip them.
 */
static const char iso8859_1_to_ascii_upper[] =
	/* 0x60..0x7E (31): `abcdefghijklmnopqrstuvwxyz{|}~ */
	"`ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~"
	/* 0x7F..0xA0 (34): DEL, C1 control codes, no-break space */
	"          " "          " "          " "    "
	/* 0xA1 (1): inverted exclamation mark */
	"!"
	/* 0xA2..0xBE (29): symbols, punctuation and fractions */
	"          " "          " "         "
	/* 0xBF (1): inverted question mark */
	"?"
	/* 0xC0..0xDF (32): ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß */
	"AAAAAAECEEEEIIIIDNOOOOO*OUUUUYDS"
	/* 0xE0..0xFF (32): àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ */
	"AAAAAAECEEEEIIIIDNOOOOO/OUUUUYDY";
|
||||
|
||||
/* Internal C implementation */
|
||||
static bool daitch_mokotoff_coding(const char *word, ArrayBuildState *soundex);
|
||||
|
||||
|
||||
PG_FUNCTION_INFO_V1(daitch_mokotoff);
|
||||
|
||||
Datum
|
||||
daitch_mokotoff(PG_FUNCTION_ARGS)
|
||||
{
|
||||
text *arg = PG_GETARG_TEXT_PP(0);
|
||||
Datum retval;
|
||||
char *string;
|
||||
ArrayBuildState *soundex;
|
||||
MemoryContext old_ctx,
|
||||
tmp_ctx;
|
||||
|
||||
/* Work in a temporary context to simplify cleanup. */
|
||||
tmp_ctx = AllocSetContextCreate(CurrentMemoryContext,
|
||||
"daitch_mokotoff temporary context",
|
||||
ALLOCSET_DEFAULT_SIZES);
|
||||
old_ctx = MemoryContextSwitchTo(tmp_ctx);
|
||||
|
||||
/* We must convert the string to UTF-8 if it isn't already. */
|
||||
string = pg_server_to_any(text_to_cstring(arg), VARSIZE_ANY_EXHDR(arg),
|
||||
PG_UTF8);
|
||||
|
||||
/* The result is built in this ArrayBuildState. */
|
||||
soundex = initArrayResult(TEXTOID, tmp_ctx, false);
|
||||
|
||||
if (!daitch_mokotoff_coding(string, soundex))
|
||||
{
|
||||
/* No encodable characters in input */
|
||||
MemoryContextSwitchTo(old_ctx);
|
||||
MemoryContextDelete(tmp_ctx);
|
||||
PG_RETURN_NULL();
|
||||
}
|
||||
|
||||
retval = makeArrayResult(soundex, old_ctx);
|
||||
|
||||
MemoryContextSwitchTo(old_ctx);
|
||||
MemoryContextDelete(tmp_ctx);
|
||||
|
||||
PG_RETURN_DATUM(retval);
|
||||
}
|
||||
|
||||
|
||||
/* Initialize soundex code tree node for next code digit. */
|
||||
static void
|
||||
initialize_node(dm_node *node, int last_update)
|
||||
{
|
||||
if (node->last_update < last_update)
|
||||
{
|
||||
node->prev_code_digits[0] = node->next_code_digits[0];
|
||||
node->prev_code_digits[1] = node->next_code_digits[1];
|
||||
node->next_code_digits[0] = '\0';
|
||||
node->next_code_digits[1] = '\0';
|
||||
node->prev_code_index = node->next_code_index;
|
||||
node->next_code_index = 0;
|
||||
node->is_leaf = 0;
|
||||
node->last_update = last_update;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Update soundex code tree node with next code digit. */
|
||||
static void
|
||||
add_next_code_digit(dm_node *node, int code_index, char code_digit)
|
||||
{
|
||||
/* OR in index 1 or 2. */
|
||||
node->next_code_index |= code_index;
|
||||
|
||||
if (!node->next_code_digits[0])
|
||||
node->next_code_digits[0] = code_digit;
|
||||
else if (node->next_code_digits[0] != code_digit)
|
||||
node->next_code_digits[1] = code_digit;
|
||||
}
|
||||
|
||||
|
||||
/* Mark soundex code tree node as leaf. */
|
||||
static void
|
||||
set_leaf(dm_node *first_node[2], dm_node *last_node[2],
|
||||
dm_node *node, int ix_node)
|
||||
{
|
||||
if (!node->is_leaf)
|
||||
{
|
||||
node->is_leaf = 1;
|
||||
|
||||
if (first_node[ix_node] == NULL)
|
||||
first_node[ix_node] = node;
|
||||
else
|
||||
last_node[ix_node]->next[ix_node] = node;
|
||||
|
||||
last_node[ix_node] = node;
|
||||
node->next[ix_node] = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Find next node corresponding to code digit, or create a new node. */
|
||||
static dm_node *
|
||||
find_or_create_child_node(dm_node *parent, char code_digit,
|
||||
ArrayBuildState *soundex)
|
||||
{
|
||||
int i = code_digit - '0';
|
||||
dm_node **nodes = parent->children;
|
||||
dm_node *node = nodes[i];
|
||||
|
||||
if (node)
|
||||
{
|
||||
/* Found existing child node. Skip completed nodes. */
|
||||
return node->soundex_length < DM_CODE_DIGITS ? node : NULL;
|
||||
}
|
||||
|
||||
/* Create new child node. */
|
||||
node = palloc_object(dm_node);
|
||||
nodes[i] = node;
|
||||
|
||||
*node = start_node;
|
||||
memcpy(node->soundex, parent->soundex, sizeof(parent->soundex));
|
||||
node->soundex_length = parent->soundex_length;
|
||||
node->soundex[node->soundex_length++] = code_digit;
|
||||
node->code_digit = code_digit;
|
||||
node->next_code_index = node->prev_code_index;
|
||||
|
||||
if (node->soundex_length < DM_CODE_DIGITS)
|
||||
{
|
||||
return node;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Append completed soundex code to output array. */
|
||||
text *out = cstring_to_text_with_len(node->soundex,
|
||||
DM_CODE_DIGITS);
|
||||
|
||||
accumArrayResult(soundex,
|
||||
PointerGetDatum(out),
|
||||
false,
|
||||
TEXTOID,
|
||||
CurrentMemoryContext);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Update node for next code digit(s). */
|
||||
static void
|
||||
update_node(dm_node *first_node[2], dm_node *last_node[2],
|
||||
dm_node *node, int ix_node,
|
||||
int letter_no, int prev_code_index, int next_code_index,
|
||||
const char *next_code_digits, int digit_no,
|
||||
ArrayBuildState *soundex)
|
||||
{
|
||||
int i;
|
||||
char next_code_digit = next_code_digits[digit_no];
|
||||
int num_dirty_nodes = 0;
|
||||
dm_node *dirty_nodes[2];
|
||||
|
||||
initialize_node(node, letter_no);
|
||||
|
||||
if (node->prev_code_index && !(node->prev_code_index & prev_code_index))
|
||||
{
|
||||
/*
|
||||
* If the sound (vowel / consonant) of this letter encoding doesn't
|
||||
* correspond to the coding index of the previous letter, we skip this
|
||||
* letter encoding. Note that currently, only "J" can be either a
|
||||
* vowel or a consonant.
|
||||
*/
|
||||
return;
|
||||
}
|
||||
|
||||
if (next_code_digit == 'X' ||
|
||||
(digit_no == 0 &&
|
||||
(node->prev_code_digits[0] == next_code_digit ||
|
||||
node->prev_code_digits[1] == next_code_digit)))
|
||||
{
|
||||
/* The code digit is the same as one of the previous (i.e. not added). */
|
||||
dirty_nodes[num_dirty_nodes++] = node;
|
||||
}
|
||||
|
||||
if (next_code_digit != 'X' &&
|
||||
(digit_no > 0 ||
|
||||
node->prev_code_digits[0] != next_code_digit ||
|
||||
node->prev_code_digits[1]))
|
||||
{
|
||||
/* The code digit is different from one of the previous (i.e. added). */
|
||||
node = find_or_create_child_node(node, next_code_digit, soundex);
|
||||
if (node)
|
||||
{
|
||||
initialize_node(node, letter_no);
|
||||
dirty_nodes[num_dirty_nodes++] = node;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < num_dirty_nodes; i++)
|
||||
{
|
||||
/* Add code digit leading to the current node. */
|
||||
add_next_code_digit(dirty_nodes[i], next_code_index, next_code_digit);
|
||||
|
||||
if (next_code_digits[++digit_no])
|
||||
{
|
||||
update_node(first_node, last_node, dirty_nodes[i], ix_node,
|
||||
letter_no, prev_code_index, next_code_index,
|
||||
next_code_digits, digit_no,
|
||||
soundex);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Add incomplete leaf node to linked list. */
|
||||
set_leaf(first_node, last_node, dirty_nodes[i], ix_node);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Update soundex tree leaf nodes. */
|
||||
static void
|
||||
update_leaves(dm_node *first_node[2], int *ix_node, int letter_no,
|
||||
const dm_codes *codes, const dm_codes *next_codes,
|
||||
ArrayBuildState *soundex)
|
||||
{
|
||||
int i,
|
||||
j,
|
||||
code_index;
|
||||
dm_node *node,
|
||||
*last_node[2];
|
||||
const dm_code *code,
|
||||
*next_code;
|
||||
int ix_node_next = (*ix_node + 1) & 1; /* Alternating index: 0, 1 */
|
||||
|
||||
/* Initialize for new linked list of leaves. */
|
||||
first_node[ix_node_next] = NULL;
|
||||
last_node[ix_node_next] = NULL;
|
||||
|
||||
/* Process all nodes. */
|
||||
for (node = first_node[*ix_node]; node; node = node->next[*ix_node])
|
||||
{
|
||||
/* One or two alternate code sequences. */
|
||||
for (i = 0; i < 2 && (code = codes[i]) && code[0][0]; i++)
|
||||
{
|
||||
/* Coding for previous letter - before vowel: 1, all other: 2 */
|
||||
int prev_code_index = (code[0][0] > '1') + 1;
|
||||
|
||||
/* One or two alternate next code sequences. */
|
||||
for (j = 0; j < 2 && (next_code = next_codes[j]) && next_code[0][0]; j++)
|
||||
{
|
||||
/* Determine which code to use. */
|
||||
if (letter_no == 0)
|
||||
{
|
||||
/* This is the first letter. */
|
||||
code_index = 0;
|
||||
}
|
||||
else if (next_code[0][0] <= '1')
|
||||
{
|
||||
/* The next letter is a vowel. */
|
||||
code_index = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* All other cases. */
|
||||
code_index = 2;
|
||||
}
|
||||
|
||||
/* One or two sequential code digits. */
|
||||
update_node(first_node, last_node, node, ix_node_next,
|
||||
letter_no, prev_code_index, code_index,
|
||||
code[code_index], 0,
|
||||
soundex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*ix_node = ix_node_next;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Return next character, converted from UTF-8 to uppercase ASCII.
|
||||
* *ix is the current string index and is incremented by the character length.
|
||||
*/
|
||||
static char
|
||||
read_char(const unsigned char *str, int *ix)
|
||||
{
|
||||
/* Substitute character for skipped code points. */
|
||||
const char na = '\x1a';
|
||||
pg_wchar c;
|
||||
|
||||
/* Decode UTF-8 character to ISO 10646 code point. */
|
||||
str += *ix;
|
||||
c = utf8_to_unicode(str);
|
||||
|
||||
/* Advance *ix, but (for safety) not if we've reached end of string. */
|
||||
if (c)
|
||||
*ix += pg_utf_mblen(str);
|
||||
|
||||
/* Convert. */
|
||||
if (c >= (unsigned char) '[' && c <= (unsigned char) ']')
|
||||
{
|
||||
/* ASCII characters [, \, and ] are reserved for Ą, Ę, and Ţ/Ț. */
|
||||
return na;
|
||||
}
|
||||
else if (c < 0x60)
|
||||
{
|
||||
/* Other non-lowercase ASCII characters can be used as-is. */
|
||||
return (char) c;
|
||||
}
|
||||
else if (c < 0x100)
|
||||
{
|
||||
/* ISO-8859-1 code point; convert to upper-case ASCII via table. */
|
||||
return iso8859_1_to_ascii_upper[c - 0x60];
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Conversion of non-ASCII characters in the coding chart. */
|
||||
switch (c)
|
||||
{
|
||||
case 0x0104:
|
||||
case 0x0105:
|
||||
/* Ą/ą */
|
||||
return '[';
|
||||
case 0x0118:
|
||||
case 0x0119:
|
||||
/* Ę/ę */
|
||||
return '\\';
|
||||
case 0x0162:
|
||||
case 0x0163:
|
||||
case 0x021A:
|
||||
case 0x021B:
|
||||
/* Ţ/ţ or Ț/ț */
|
||||
return ']';
|
||||
default:
|
||||
return na;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
 * Read next ASCII character, skipping any characters not in [A-\]].
 *
 * Returns '\0' when the end of the string is reached. *ix is advanced past
 * all characters read.
 */
static char
read_valid_char(const char *str, int *ix)
{
	for (;;)
	{
		char		c = read_char((const unsigned char *) str, ix);

		/* Stop at end of string, or at the first character in range. */
		if (c == '\0' || (c >= 'A' && c <= ']'))
			return c;
	}
}
|
||||
|
||||
|
||||
/* Return sound coding for "letter" (letter sequence) */
|
||||
static const dm_codes *
|
||||
read_letter(const char *str, int *ix)
|
||||
{
|
||||
char c,
|
||||
cmp;
|
||||
int i,
|
||||
j;
|
||||
const dm_letter *letters;
|
||||
const dm_codes *codes;
|
||||
|
||||
/* First letter in sequence. */
|
||||
if ((c = read_valid_char(str, ix)) == '\0')
|
||||
return NULL;
|
||||
|
||||
letters = &letter_[c - 'A'];
|
||||
codes = letters->codes;
|
||||
i = *ix;
|
||||
|
||||
/* Any subsequent letters in sequence. */
|
||||
while ((letters = letters->letters) && (c = read_valid_char(str, &i)))
|
||||
{
|
||||
for (j = 0; (cmp = letters[j].letter); j++)
|
||||
{
|
||||
if (cmp == c)
|
||||
{
|
||||
/* Letter found. */
|
||||
letters = &letters[j];
|
||||
if (letters->codes)
|
||||
{
|
||||
/* Coding for letter sequence found. */
|
||||
codes = letters->codes;
|
||||
*ix = i;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!cmp)
|
||||
{
|
||||
/* The sequence of letters has no coding. */
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return codes;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Generate all Daitch-Mokotoff soundex codes for word,
|
||||
* adding them to the "soundex" ArrayBuildState.
|
||||
* Returns false if string has no encodable characters, else true.
|
||||
*/
|
||||
static bool
|
||||
daitch_mokotoff_coding(const char *word, ArrayBuildState *soundex)
|
||||
{
|
||||
int i = 0;
|
||||
int letter_no = 0;
|
||||
int ix_node = 0;
|
||||
const dm_codes *codes,
|
||||
*next_codes;
|
||||
dm_node *first_node[2],
|
||||
*node;
|
||||
|
||||
/* First letter. */
|
||||
if (!(codes = read_letter(word, &i)))
|
||||
{
|
||||
/* No encodable character in input. */
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Starting point. */
|
||||
first_node[ix_node] = palloc_object(dm_node);
|
||||
*first_node[ix_node] = start_node;
|
||||
|
||||
/*
|
||||
* Loop until either the word input is exhausted, or all generated soundex
|
||||
* codes are completed to six digits.
|
||||
*/
|
||||
while (codes && first_node[ix_node])
|
||||
{
|
||||
next_codes = read_letter(word, &i);
|
||||
|
||||
/* Update leaf nodes. */
|
||||
update_leaves(first_node, &ix_node, letter_no,
|
||||
codes, next_codes ? next_codes : end_codes,
|
||||
soundex);
|
||||
|
||||
codes = next_codes;
|
||||
letter_no++;
|
||||
}
|
||||
|
||||
/* Append all remaining (incomplete) soundex codes to output array. */
|
||||
for (node = first_node[ix_node]; node; node = node->next[ix_node])
|
||||
{
|
||||
text *out = cstring_to_text_with_len(node->soundex,
|
||||
DM_CODE_DIGITS);
|
||||
|
||||
accumArrayResult(soundex,
|
||||
PointerGetDatum(out),
|
||||
false,
|
||||
TEXTOID,
|
||||
CurrentMemoryContext);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
223
contrib/fuzzystrmatch/daitch_mokotoff_header.pl
Executable file
223
contrib/fuzzystrmatch/daitch_mokotoff_header.pl
Executable file
@ -0,0 +1,223 @@
|
||||
#!/usr/bin/perl
|
||||
#
|
||||
# Generation of types and lookup tables for Daitch-Mokotoff soundex.
|
||||
#
|
||||
# Copyright (c) 2023, PostgreSQL Global Development Group
|
||||
#
|
||||
# This module was originally sponsored by Finance Norway /
|
||||
# Trafikkforsikringsforeningen, and implemented by Dag Lem <dag@nimrod.no>
|
||||
#
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
use utf8;
|
||||
use open IO => ':utf8', ':std';
|
||||
use Data::Dumper;
|
||||
|
||||
die "Usage: $0 OUTPUT_FILE\n" if @ARGV != 1;
my $output_file = $ARGV[0];

# Open the output file
open my $OUTPUT, '>', $output_file
  or die "Could not open output file $output_file: $!\n";

# Parse code table and generate tree for letter transitions.
#
# %codes maps the C identifier of each distinct code table to the text of its
# C initializer. $table is the root of the tree: every tree node is a pair
# [ hash of successor characters, identifier of code table (or undef) ].
my %codes;
my $table = [ {}, [ [ "", "", "" ] ] ];
while (my $line = <DATA>)
{
	chomp $line;

	# A line holds a comma separated list of letter sequences, followed by
	# one or two alternative code triplets separated by "|".
	my ($sequences, $chart) = split(/\s+/, $line);
	my @alternatives = map { [ split(/,/) ] } split(/\|/, $chart);

	# The identifier is derived from the codes, so that letter sequences
	# with identical codes share one table.
	my $key =
	  "codes_" . join("_or_", map { join("_", @$_) } @alternatives);

	my @initializers;
	for my $triplet (@alternatives)
	{
		my $quoted = join(", ", map { "\"$_\"" } @$triplet);
		push(@initializers, "\t{\n\t\t$quoted\n\t}");
	}
	$codes{$key} = join(",\n", @initializers);

	for my $sequence (split(/,/, $sequences))
	{
		# Link each character to the next in the letter combination.
		my @chars = split(//, $sequence);
		my $final = pop(@chars);
		my $level = $table->[0];
		for my $char (@chars)
		{
			my $entry = ($level->{$char} //= [ {}, undef ]);
			$entry->[0] //= {};
			$level = $entry->[0];
		}

		# The sound code for the letter combination is stored at the last character.
		$level->{$final}[1] = $key;
	}
}
close(DATA);
|
||||
|
||||
print $OUTPUT <<EOF;
|
||||
/*
|
||||
* Constants and lookup tables for Daitch-Mokotoff Soundex
|
||||
*
|
||||
* Copyright (c) 2023, PostgreSQL Global Development Group
|
||||
*
|
||||
* This file is generated by daitch_mokotoff_header.pl
|
||||
*/
|
||||
|
||||
/* Coding chart table: Soundex codes */
|
||||
typedef char dm_code[2 + 1]; /* One or two sequential code digits + NUL */
|
||||
typedef dm_code dm_codes[3]; /* Start of name, before a vowel, any other */
|
||||
|
||||
/* Coding chart table: Letter in input sequence */
|
||||
struct dm_letter
|
||||
{
|
||||
char letter; /* Present letter in sequence */
|
||||
const struct dm_letter *letters; /* List of possible successive letters */
|
||||
const dm_codes *codes; /* Code sequence(s) for complete sequence */
|
||||
};
|
||||
|
||||
typedef struct dm_letter dm_letter;
|
||||
|
||||
/* Codes for letter sequence at start of name, before a vowel, and any other. */
|
||||
EOF
|
||||
|
||||
# Emit one two-element dm_codes array per distinct code table, in sorted
# order of identifier.
for my $name (sort keys %codes)
{
	printf $OUTPUT "static const dm_codes %s[2] =\n{\n%s\n};\n",
	  $name, $codes{$name};
}
|
||||
|
||||
print $OUTPUT <<EOF;
|
||||
|
||||
/* Coding for alternative following letters in sequence. */
|
||||
EOF
|
||||
|
||||
# Emit the C array "letter_<prefix>", holding one dm_letter entry for each
# character which can follow the letter sequence <prefix>, terminated by an
# entry with letter '\0'.
#
# The arrays for longer sequences are emitted first, by recursion, since the
# entries refer to them by name.
sub hash2code
{
	my ($node, $prefix) = @_;

	my $successors = $node->[0];
	my @entries;

	for my $char (sort keys %$successors)
	{
		my $child = $successors->{$char};
		my $sequence = "$prefix$char";

		my $children = "NULL";
		if (defined $child->[0])
		{
			hash2code($child, $sequence);
			$children = "letter_$sequence";
		}

		my $codes = $child->[1] // "NULL";
		push(@entries, "\t{\n\t\t'$char', $children, $codes\n\t}");
	}

	print $OUTPUT "static const dm_letter letter_$prefix\[\] =\n{\n";
	print $OUTPUT "$_,\n" for @entries;
	print $OUTPUT "\t{\n\t\t'\\0'\n\t}\n";
	print $OUTPUT "};\n";
}
|
||||
|
||||
# Emit the letter tables, starting from the root of the tree.
hash2code($table, '');

# Buffered output is flushed on close, so this is where a write error (e.g.
# disk full) shows up. Fail loudly rather than leave a truncated header.
close($OUTPUT)
  or die "Could not close output file $output_file: $!\n";
|
||||
|
||||
# Table adapted from https://www.jewishgen.org/InfoFiles/Soundex.html
|
||||
#
|
||||
# The conversion from the coding chart to the table should be self
|
||||
# explanatory, but note the differences stated below.
|
||||
#
|
||||
# X = NC (not coded)
|
||||
#
|
||||
# The non-ASCII letters in the coding chart are coded with substitute
|
||||
# lowercase ASCII letters, which sort after the uppercase ASCII letters:
|
||||
#
|
||||
# Ą => a (use '[' for table lookup)
|
||||
# Ę => e (use '\\' for table lookup)
|
||||
# Ţ => t (use ']' for table lookup)
|
||||
#
|
||||
# The rule for "UE" does not correspond to the coding chart, however
|
||||
# it is used by all other known implementations, including the one at
|
||||
# https://www.jewishgen.org/jos/jossound.htm (try e.g. "bouey").
|
||||
#
|
||||
# Note that the implementation assumes that vowels are assigned code
|
||||
# 0 or 1. "J" can be either a vowel or a consonant.
|
||||
#
|
||||
|
||||
__DATA__
|
||||
AI,AJ,AY 0,1,X
|
||||
AU 0,7,X
|
||||
a X,X,6|X,X,X
|
||||
A 0,X,X
|
||||
B 7,7,7
|
||||
CHS 5,54,54
|
||||
CH 5,5,5|4,4,4
|
||||
CK 5,5,5|45,45,45
|
||||
CZ,CS,CSZ,CZS 4,4,4
|
||||
C 5,5,5|4,4,4
|
||||
DRZ,DRS 4,4,4
|
||||
DS,DSH,DSZ 4,4,4
|
||||
DZ,DZH,DZS 4,4,4
|
||||
D,DT 3,3,3
|
||||
EI,EJ,EY 0,1,X
|
||||
EU 1,1,X
|
||||
e X,X,6|X,X,X
|
||||
E 0,X,X
|
||||
FB 7,7,7
|
||||
F 7,7,7
|
||||
G 5,5,5
|
||||
H 5,5,X
|
||||
IA,IE,IO,IU 1,X,X
|
||||
I 0,X,X
|
||||
J 1,X,X|4,4,4
|
||||
KS 5,54,54
|
||||
KH 5,5,5
|
||||
K 5,5,5
|
||||
L 8,8,8
|
||||
MN 66,66,66
|
||||
M 6,6,6
|
||||
NM 66,66,66
|
||||
N 6,6,6
|
||||
OI,OJ,OY 0,1,X
|
||||
O 0,X,X
|
||||
P,PF,PH 7,7,7
|
||||
Q 5,5,5
|
||||
RZ,RS 94,94,94|4,4,4
|
||||
R 9,9,9
|
||||
SCHTSCH,SCHTSH,SCHTCH 2,4,4
|
||||
SCH 4,4,4
|
||||
SHTCH,SHCH,SHTSH 2,4,4
|
||||
SHT,SCHT,SCHD 2,43,43
|
||||
SH 4,4,4
|
||||
STCH,STSCH,SC 2,4,4
|
||||
STRZ,STRS,STSH 2,4,4
|
||||
ST 2,43,43
|
||||
SZCZ,SZCS 2,4,4
|
||||
SZT,SHD,SZD,SD 2,43,43
|
||||
SZ 4,4,4
|
||||
S 4,4,4
|
||||
TCH,TTCH,TTSCH 4,4,4
|
||||
TH 3,3,3
|
||||
TRZ,TRS 4,4,4
|
||||
TSCH,TSH 4,4,4
|
||||
TS,TTS,TTSZ,TC 4,4,4
|
||||
TZ,TTZ,TZS,TSZ 4,4,4
|
||||
t 3,3,3|4,4,4
|
||||
T 3,3,3
|
||||
UI,UJ,UY,UE 0,1,X
|
||||
U 0,X,X
|
||||
V 7,7,7
|
||||
W 7,7,7
|
||||
X 5,54,54
|
||||
Y 1,X,X
|
||||
ZDZ,ZDZH,ZHDZH 2,4,4
|
||||
ZD,ZHD 2,43,43
|
||||
ZH,ZS,ZSCH,ZSH 4,4,4
|
||||
Z 4,4,4
|
@ -65,3 +65,174 @@ SELECT dmetaphone_alt('gumbo');
|
||||
KMP
|
||||
(1 row)
|
||||
|
||||
-- Wovels
|
||||
SELECT daitch_mokotoff('Augsburg');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{054795}
|
||||
(1 row)
|
||||
|
||||
SELECT daitch_mokotoff('Breuer');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{791900}
|
||||
(1 row)
|
||||
|
||||
SELECT daitch_mokotoff('Freud');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{793000}
|
||||
(1 row)
|
||||
|
||||
-- The letter "H"
|
||||
SELECT daitch_mokotoff('Halberstadt');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{587943,587433}
|
||||
(1 row)
|
||||
|
||||
SELECT daitch_mokotoff('Mannheim');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{665600}
|
||||
(1 row)
|
||||
|
||||
-- Adjacent sounds
|
||||
SELECT daitch_mokotoff('Chernowitz');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{596740,496740}
|
||||
(1 row)
|
||||
|
||||
-- Adjacent letters with identical adjacent code digits
|
||||
SELECT daitch_mokotoff('Cherkassy');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{595400,495400}
|
||||
(1 row)
|
||||
|
||||
SELECT daitch_mokotoff('Kleinman');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{586660}
|
||||
(1 row)
|
||||
|
||||
-- More than one word
|
||||
SELECT daitch_mokotoff('Nowy Targ');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{673950}
|
||||
(1 row)
|
||||
|
||||
-- Padded with "0"
|
||||
SELECT daitch_mokotoff('Berlin');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{798600}
|
||||
(1 row)
|
||||
|
||||
-- Other examples from https://www.avotaynu.com/soundex.htm
|
||||
SELECT daitch_mokotoff('Ceniow');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{567000,467000}
|
||||
(1 row)
|
||||
|
||||
SELECT daitch_mokotoff('Tsenyuv');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{467000}
|
||||
(1 row)
|
||||
|
||||
SELECT daitch_mokotoff('Holubica');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{587500,587400}
|
||||
(1 row)
|
||||
|
||||
SELECT daitch_mokotoff('Golubitsa');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{587400}
|
||||
(1 row)
|
||||
|
||||
SELECT daitch_mokotoff('Przemysl');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{794648,746480}
|
||||
(1 row)
|
||||
|
||||
SELECT daitch_mokotoff('Pshemeshil');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{746480}
|
||||
(1 row)
|
||||
|
||||
SELECT daitch_mokotoff('Rosochowaciec');
|
||||
daitch_mokotoff
|
||||
-----------------------------------------------------------
|
||||
{945755,945754,945745,945744,944755,944754,944745,944744}
|
||||
(1 row)
|
||||
|
||||
SELECT daitch_mokotoff('Rosokhovatsets');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{945744}
|
||||
(1 row)
|
||||
|
||||
-- Ignored characters
|
||||
SELECT daitch_mokotoff('''OBrien');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{079600}
|
||||
(1 row)
|
||||
|
||||
SELECT daitch_mokotoff('O''Brien');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{079600}
|
||||
(1 row)
|
||||
|
||||
-- "Difficult" cases, likely to cause trouble for other implementations.
|
||||
SELECT daitch_mokotoff('CJC');
|
||||
daitch_mokotoff
|
||||
---------------------------------------------
|
||||
{550000,540000,545000,450000,400000,440000}
|
||||
(1 row)
|
||||
|
||||
SELECT daitch_mokotoff('BESST');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{743000}
|
||||
(1 row)
|
||||
|
||||
SELECT daitch_mokotoff('BOUEY');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{710000}
|
||||
(1 row)
|
||||
|
||||
SELECT daitch_mokotoff('HANNMANN');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{566600}
|
||||
(1 row)
|
||||
|
||||
SELECT daitch_mokotoff('MCCOYJR');
|
||||
daitch_mokotoff
|
||||
-----------------------------------------------------------
|
||||
{651900,654900,654190,654490,645190,645490,641900,644900}
|
||||
(1 row)
|
||||
|
||||
SELECT daitch_mokotoff('ACCURSO');
|
||||
daitch_mokotoff
|
||||
-----------------------------------------------------------
|
||||
{059400,054000,054940,054400,045940,045400,049400,044000}
|
||||
(1 row)
|
||||
|
||||
SELECT daitch_mokotoff('BIERSCHBACH');
|
||||
daitch_mokotoff
|
||||
-----------------------------------------------------------
|
||||
{794575,794574,794750,794740,745750,745740,747500,747400}
|
||||
(1 row)
|
||||
|
||||
|
61
contrib/fuzzystrmatch/expected/fuzzystrmatch_utf8.out
Normal file
61
contrib/fuzzystrmatch/expected/fuzzystrmatch_utf8.out
Normal file
@ -0,0 +1,61 @@
|
||||
/*
|
||||
* This test must be run in a database with UTF-8 encoding,
|
||||
* because other encodings don't support all the characters used.
|
||||
*/
|
||||
SELECT getdatabaseencoding() <> 'UTF8'
|
||||
AS skip_test \gset
|
||||
\if :skip_test
|
||||
\quit
|
||||
\endif
|
||||
set client_encoding = utf8;
|
||||
-- CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
|
||||
-- Accents
|
||||
SELECT daitch_mokotoff('Müller');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{689000}
|
||||
(1 row)
|
||||
|
||||
SELECT daitch_mokotoff('Schäfer');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{479000}
|
||||
(1 row)
|
||||
|
||||
SELECT daitch_mokotoff('Straßburg');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{294795}
|
||||
(1 row)
|
||||
|
||||
SELECT daitch_mokotoff('Éregon');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{095600}
|
||||
(1 row)
|
||||
|
||||
-- Special characters added at https://www.jewishgen.org/InfoFiles/Soundex.html
|
||||
SELECT daitch_mokotoff('gąszczu');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{564000,540000}
|
||||
(1 row)
|
||||
|
||||
SELECT daitch_mokotoff('brzęczy');
|
||||
daitch_mokotoff
|
||||
-------------------------------
|
||||
{794640,794400,746400,744000}
|
||||
(1 row)
|
||||
|
||||
SELECT daitch_mokotoff('ţamas');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{364000,464000}
|
||||
(1 row)
|
||||
|
||||
SELECT daitch_mokotoff('țamas');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{364000,464000}
|
||||
(1 row)
|
||||
|
8
contrib/fuzzystrmatch/expected/fuzzystrmatch_utf8_1.out
Normal file
8
contrib/fuzzystrmatch/expected/fuzzystrmatch_utf8_1.out
Normal file
@ -0,0 +1,8 @@
|
||||
/*
|
||||
* This test must be run in a database with UTF-8 encoding,
|
||||
* because other encodings don't support all the characters used.
|
||||
*/
|
||||
SELECT getdatabaseencoding() <> 'UTF8'
|
||||
AS skip_test \gset
|
||||
\if :skip_test
|
||||
\quit
|
8
contrib/fuzzystrmatch/fuzzystrmatch--1.1--1.2.sql
Normal file
8
contrib/fuzzystrmatch/fuzzystrmatch--1.1--1.2.sql
Normal file
@ -0,0 +1,8 @@
|
||||
/* contrib/fuzzystrmatch/fuzzystrmatch--1.1--1.2.sql */
|
||||
|
||||
-- complain if script is sourced in psql, rather than via ALTER EXTENSION
|
||||
\echo Use "ALTER EXTENSION fuzzystrmatch UPDATE TO '1.2'" to load this file. \quit
|
||||
|
||||
CREATE FUNCTION daitch_mokotoff(text) RETURNS text[]
|
||||
AS 'MODULE_PATHNAME', 'daitch_mokotoff'
|
||||
LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
|
@ -1,6 +1,6 @@
|
||||
# fuzzystrmatch extension
|
||||
comment = 'determine similarities and distance between strings'
|
||||
default_version = '1.1'
|
||||
default_version = '1.2'
|
||||
module_pathname = '$libdir/fuzzystrmatch'
|
||||
relocatable = true
|
||||
trusted = true
|
||||
|
@ -1,10 +1,19 @@
|
||||
# Copyright (c) 2022-2023, PostgreSQL Global Development Group
|
||||
|
||||
fuzzystrmatch_sources = files(
|
||||
'fuzzystrmatch.c',
|
||||
'daitch_mokotoff.c',
|
||||
'dmetaphone.c',
|
||||
'fuzzystrmatch.c',
|
||||
)
|
||||
|
||||
daitch_mokotoff_h = custom_target('daitch_mokotoff',
|
||||
input: 'daitch_mokotoff_header.pl',
|
||||
output: 'daitch_mokotoff.h',
|
||||
command: [perl, '@INPUT@', '@OUTPUT@'],
|
||||
)
|
||||
generated_sources += daitch_mokotoff_h
|
||||
fuzzystrmatch_sources += daitch_mokotoff_h
|
||||
|
||||
if host_system == 'windows'
|
||||
fuzzystrmatch_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
|
||||
'--NAME', 'fuzzystrmatch',
|
||||
@ -13,6 +22,7 @@ endif
|
||||
|
||||
fuzzystrmatch = shared_module('fuzzystrmatch',
|
||||
fuzzystrmatch_sources,
|
||||
include_directories: include_directories('.'),
|
||||
kwargs: contrib_mod_args,
|
||||
)
|
||||
contrib_targets += fuzzystrmatch
|
||||
@ -21,6 +31,7 @@ install_data(
|
||||
'fuzzystrmatch.control',
|
||||
'fuzzystrmatch--1.0--1.1.sql',
|
||||
'fuzzystrmatch--1.1.sql',
|
||||
'fuzzystrmatch--1.1--1.2.sql',
|
||||
kwargs: contrib_data_args,
|
||||
)
|
||||
|
||||
@ -31,6 +42,7 @@ tests += {
|
||||
'regress': {
|
||||
'sql': [
|
||||
'fuzzystrmatch',
|
||||
'fuzzystrmatch_utf8',
|
||||
],
|
||||
},
|
||||
}
|
||||
|
@ -19,3 +19,48 @@ SELECT metaphone('GUMBO', 4);
|
||||
|
||||
SELECT dmetaphone('gumbo');
|
||||
SELECT dmetaphone_alt('gumbo');
|
||||
|
||||
-- Wovels
|
||||
SELECT daitch_mokotoff('Augsburg');
|
||||
SELECT daitch_mokotoff('Breuer');
|
||||
SELECT daitch_mokotoff('Freud');
|
||||
|
||||
-- The letter "H"
|
||||
SELECT daitch_mokotoff('Halberstadt');
|
||||
SELECT daitch_mokotoff('Mannheim');
|
||||
|
||||
-- Adjacent sounds
|
||||
SELECT daitch_mokotoff('Chernowitz');
|
||||
|
||||
-- Adjacent letters with identical adjacent code digits
|
||||
SELECT daitch_mokotoff('Cherkassy');
|
||||
SELECT daitch_mokotoff('Kleinman');
|
||||
|
||||
-- More than one word
|
||||
SELECT daitch_mokotoff('Nowy Targ');
|
||||
|
||||
-- Padded with "0"
|
||||
SELECT daitch_mokotoff('Berlin');
|
||||
|
||||
-- Other examples from https://www.avotaynu.com/soundex.htm
|
||||
SELECT daitch_mokotoff('Ceniow');
|
||||
SELECT daitch_mokotoff('Tsenyuv');
|
||||
SELECT daitch_mokotoff('Holubica');
|
||||
SELECT daitch_mokotoff('Golubitsa');
|
||||
SELECT daitch_mokotoff('Przemysl');
|
||||
SELECT daitch_mokotoff('Pshemeshil');
|
||||
SELECT daitch_mokotoff('Rosochowaciec');
|
||||
SELECT daitch_mokotoff('Rosokhovatsets');
|
||||
|
||||
-- Ignored characters
|
||||
SELECT daitch_mokotoff('''OBrien');
|
||||
SELECT daitch_mokotoff('O''Brien');
|
||||
|
||||
-- "Difficult" cases, likely to cause trouble for other implementations.
|
||||
SELECT daitch_mokotoff('CJC');
|
||||
SELECT daitch_mokotoff('BESST');
|
||||
SELECT daitch_mokotoff('BOUEY');
|
||||
SELECT daitch_mokotoff('HANNMANN');
|
||||
SELECT daitch_mokotoff('MCCOYJR');
|
||||
SELECT daitch_mokotoff('ACCURSO');
|
||||
SELECT daitch_mokotoff('BIERSCHBACH');
|
||||
|
26
contrib/fuzzystrmatch/sql/fuzzystrmatch_utf8.sql
Normal file
26
contrib/fuzzystrmatch/sql/fuzzystrmatch_utf8.sql
Normal file
@ -0,0 +1,26 @@
|
||||
/*
|
||||
* This test must be run in a database with UTF-8 encoding,
|
||||
* because other encodings don't support all the characters used.
|
||||
*/
|
||||
|
||||
SELECT getdatabaseencoding() <> 'UTF8'
|
||||
AS skip_test \gset
|
||||
\if :skip_test
|
||||
\quit
|
||||
\endif
|
||||
|
||||
set client_encoding = utf8;
|
||||
|
||||
-- CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
|
||||
|
||||
-- Accents
|
||||
SELECT daitch_mokotoff('Müller');
|
||||
SELECT daitch_mokotoff('Schäfer');
|
||||
SELECT daitch_mokotoff('Straßburg');
|
||||
SELECT daitch_mokotoff('Éregon');
|
||||
|
||||
-- Special characters added at https://www.jewishgen.org/InfoFiles/Soundex.html
|
||||
SELECT daitch_mokotoff('gąszczu');
|
||||
SELECT daitch_mokotoff('brzęczy');
|
||||
SELECT daitch_mokotoff('ţamas');
|
||||
SELECT daitch_mokotoff('țamas');
|
@ -17,6 +17,8 @@
|
||||
At present, the <function>soundex</function>, <function>metaphone</function>,
|
||||
<function>dmetaphone</function>, and <function>dmetaphone_alt</function> functions do
|
||||
not work well with multibyte encodings (such as UTF-8).
|
||||
Use <function>daitch_mokotoff</function>
|
||||
or <function>levenshtein</function> with such data.
|
||||
</para>
|
||||
</caution>
|
||||
|
||||
@ -88,6 +90,159 @@ SELECT * FROM s WHERE difference(s.nm, 'john') > 2;
|
||||
</programlisting>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="fuzzystrmatch-daitch-mokotoff">
|
||||
<title>Daitch-Mokotoff Soundex</title>
|
||||
|
||||
<para>
|
||||
Like the original Soundex system, Daitch-Mokotoff Soundex matches
|
||||
similar-sounding names by converting them to the same code.
|
||||
However, Daitch-Mokotoff Soundex is significantly more useful for
|
||||
non-English names than the original system.
|
||||
Major improvements over the original system include:
|
||||
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<para>
|
||||
The code is based on the first six meaningful letters rather than four.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
A letter or combination of letters maps into ten possible codes rather
|
||||
than seven.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Where two consecutive letters have a single sound, they are coded as a
|
||||
single number.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
When a letter or combination of letters may have different sounds,
|
||||
multiple codes are emitted to cover all possibilities.
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
<indexterm>
|
||||
<primary>daitch_mokotoff</primary>
|
||||
</indexterm>
|
||||
|
||||
<para>
|
||||
This function generates the Daitch-Mokotoff soundex codes for its input:
|
||||
</para>
|
||||
|
||||
<synopsis>
|
||||
daitch_mokotoff(<parameter>source</parameter> text) returns text[]
|
||||
</synopsis>
|
||||
|
||||
<para>
|
||||
The result may contain one or more codes depending on how many plausible
|
||||
pronunciations there are, so it is represented as an array.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Since a Daitch-Mokotoff soundex code consists of only 6 digits,
|
||||
<parameter>source</parameter> should preferably be a single word or name.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Here are some examples:
|
||||
</para>
|
||||
|
||||
<programlisting>
|
||||
SELECT daitch_mokotoff('George');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{595000}
|
||||
|
||||
SELECT daitch_mokotoff('John');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{160000,460000}
|
||||
|
||||
SELECT daitch_mokotoff('Bierschbach');
|
||||
daitch_mokotoff
|
||||
-----------------------------------------------------------
|
||||
{794575,794574,794750,794740,745750,745740,747500,747400}
|
||||
|
||||
SELECT daitch_mokotoff('Schwartzenegger');
|
||||
daitch_mokotoff
|
||||
-----------------
|
||||
{479465}
|
||||
</programlisting>
|
||||
|
||||
<para>
|
||||
For matching of single names, returned text arrays can be matched
|
||||
directly using the <literal>&amp;&amp;</literal> operator: any overlap
|
||||
can be considered a match. A GIN index may
|
||||
be used for efficiency; see <xref linkend="gin"/> and this example:
|
||||
</para>
|
||||
|
||||
<programlisting>
|
||||
CREATE TABLE s (nm text);
|
||||
CREATE INDEX ix_s_dm ON s USING gin (daitch_mokotoff(nm)) WITH (fastupdate = off);
|
||||
|
||||
INSERT INTO s (nm) VALUES
|
||||
('Schwartzenegger'),
|
||||
('John'),
|
||||
('James'),
|
||||
('Steinman'),
|
||||
('Steinmetz');
|
||||
|
||||
SELECT * FROM s WHERE daitch_mokotoff(nm) &amp;&amp; daitch_mokotoff('Swartzenegger');
|
||||
SELECT * FROM s WHERE daitch_mokotoff(nm) &amp;&amp; daitch_mokotoff('Jane');
|
||||
SELECT * FROM s WHERE daitch_mokotoff(nm) &amp;&amp; daitch_mokotoff('Jens');
|
||||
</programlisting>
|
||||
|
||||
<para>
|
||||
For indexing and matching of any number of names in any order, Full Text
|
||||
Search features can be used. See <xref linkend="textsearch"/> and this
|
||||
example:
|
||||
</para>
|
||||
|
||||
<programlisting>
|
||||
CREATE FUNCTION soundex_tsvector(v_name text) RETURNS tsvector
|
||||
BEGIN ATOMIC
|
||||
SELECT to_tsvector('simple',
|
||||
string_agg(array_to_string(daitch_mokotoff(n), ' '), ' '))
|
||||
FROM regexp_split_to_table(v_name, '\s+') AS n;
|
||||
END;
|
||||
|
||||
CREATE FUNCTION soundex_tsquery(v_name text) RETURNS tsquery
|
||||
BEGIN ATOMIC
|
||||
SELECT string_agg('(' || array_to_string(daitch_mokotoff(n), '|') || ')', '&')::tsquery
|
||||
FROM regexp_split_to_table(v_name, '\s+') AS n;
|
||||
END;
|
||||
|
||||
CREATE TABLE s (nm text);
|
||||
CREATE INDEX ix_s_txt ON s USING gin (soundex_tsvector(nm)) WITH (fastupdate = off);
|
||||
|
||||
INSERT INTO s (nm) VALUES
|
||||
('John Doe'),
|
||||
('Jane Roe'),
|
||||
('Public John Q.'),
|
||||
('George Best'),
|
||||
('John Yamson');
|
||||
|
||||
SELECT * FROM s WHERE soundex_tsvector(nm) @@ soundex_tsquery('john');
|
||||
SELECT * FROM s WHERE soundex_tsvector(nm) @@ soundex_tsquery('jane doe');
|
||||
SELECT * FROM s WHERE soundex_tsvector(nm) @@ soundex_tsquery('john public');
|
||||
SELECT * FROM s WHERE soundex_tsvector(nm) @@ soundex_tsquery('besst, giorgio');
|
||||
SELECT * FROM s WHERE soundex_tsvector(nm) @@ soundex_tsquery('Jameson John');
|
||||
</programlisting>
|
||||
|
||||
<para>
|
||||
If it is desired to avoid recalculation of soundex codes during index
|
||||
rechecks, an index on a separate column can be used instead of an index on
|
||||
an expression. A stored generated column can be used for this; see
|
||||
<xref linkend="ddl-generated-columns"/>.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="fuzzystrmatch-levenshtein">
|
||||
<title>Levenshtein</title>
|
||||
|
||||
@ -104,10 +259,10 @@ SELECT * FROM s WHERE difference(s.nm, 'john') > 2;
|
||||
</indexterm>
|
||||
|
||||
<synopsis>
|
||||
levenshtein(text source, text target, int ins_cost, int del_cost, int sub_cost) returns int
|
||||
levenshtein(text source, text target) returns int
|
||||
levenshtein_less_equal(text source, text target, int ins_cost, int del_cost, int sub_cost, int max_d) returns int
|
||||
levenshtein_less_equal(text source, text target, int max_d) returns int
|
||||
levenshtein(source text, target text, ins_cost int, del_cost int, sub_cost int) returns int
|
||||
levenshtein(source text, target text) returns int
|
||||
levenshtein_less_equal(source text, target text, ins_cost int, del_cost int, sub_cost int, max_d int) returns int
|
||||
levenshtein_less_equal(source text, target text, max_d int) returns int
|
||||
</synopsis>
|
||||
|
||||
<para>
|
||||
@ -177,7 +332,7 @@ test=# SELECT levenshtein_less_equal('extensive', 'exhaustive', 4);
|
||||
</indexterm>
|
||||
|
||||
<synopsis>
|
||||
metaphone(text source, int max_output_length) returns text
|
||||
metaphone(source text, max_output_length int) returns text
|
||||
</synopsis>
|
||||
|
||||
<para>
|
||||
@ -220,8 +375,8 @@ test=# SELECT metaphone('GUMBO', 4);
|
||||
</indexterm>
|
||||
|
||||
<synopsis>
|
||||
dmetaphone(text source) returns text
|
||||
dmetaphone_alt(text source) returns text
|
||||
dmetaphone(source text) returns text
|
||||
dmetaphone_alt(source text) returns text
|
||||
</synopsis>
|
||||
|
||||
<para>
|
||||
|
Loading…
x
Reference in New Issue
Block a user