mirror of
https://git.postgresql.org/git/postgresql.git
synced 2024-12-03 08:00:21 +08:00
29275b1d17
Reported-by: Michael Paquier Discussion: https://postgr.es/m/ZZKTDPxBBMt3C0J9@paquier.xyz Backpatch-through: 12
220 lines
4.9 KiB
Perl
Executable File
220 lines
4.9 KiB
Perl
Executable File
#!/usr/bin/perl
|
|
#
|
|
# Generation of types and lookup tables for Daitch-Mokotoff soundex.
|
|
#
|
|
# Copyright (c) 2023-2024, PostgreSQL Global Development Group
|
|
#
|
|
# This module was originally sponsored by Finance Norway /
|
|
# Trafikkforsikringsforeningen, and implemented by Dag Lem <dag@nimrod.no>
|
|
#
|
|
|
|
use strict;
|
|
use warnings FATAL => 'all';
|
|
|
|
die "Usage: $0 OUTPUT_FILE\n" if @ARGV != 1;
|
|
my $output_file = $ARGV[0];
|
|
|
|
# Open the output file
|
|
open my $OUTPUT, '>', $output_file
|
|
or die "Could not open output file $output_file: $!\n";
|
|
|
|
# Parse code table and generate tree for letter transitions.
|
|
my %codes;
|
|
my $table = [ {}, [ [ "", "", "" ] ] ];
|
|
while (<DATA>)
|
|
{
|
|
chomp;
|
|
my ($letters, $codes) = split(/\s+/);
|
|
my @codes = map { [ split(/,/) ] } split(/\|/, $codes);
|
|
|
|
my $key = "codes_" . join("_or_", map { join("_", @$_) } @codes);
|
|
my $val = join(
|
|
",\n",
|
|
map {
|
|
"\t{\n\t\t"
|
|
. join(", ", map { "\"$_\"" } @$_) . "\n\t}"
|
|
} @codes);
|
|
$codes{$key} = $val;
|
|
|
|
for my $letter (split(/,/, $letters))
|
|
{
|
|
my $ref = $table->[0];
|
|
# Link each character to the next in the letter combination.
|
|
my @c = split(//, $letter);
|
|
my $last_c = pop(@c);
|
|
for my $c (@c)
|
|
{
|
|
$ref->{$c} //= [ {}, undef ];
|
|
$ref->{$c}[0] //= {};
|
|
$ref = $ref->{$c}[0];
|
|
}
|
|
# The sound code for the letter combination is stored at the last character.
|
|
$ref->{$last_c}[1] = $key;
|
|
}
|
|
}
|
|
close(DATA);
|
|
|
|
print $OUTPUT <<EOF;
|
|
/*
|
|
* Constants and lookup tables for Daitch-Mokotoff Soundex
|
|
*
|
|
* Copyright (c) 2023-2024, PostgreSQL Global Development Group
|
|
*
|
|
* This file is generated by daitch_mokotoff_header.pl
|
|
*/
|
|
|
|
/* Coding chart table: Soundex codes */
|
|
typedef char dm_code[2 + 1]; /* One or two sequential code digits + NUL */
|
|
typedef dm_code dm_codes[3]; /* Start of name, before a vowel, any other */
|
|
|
|
/* Coding chart table: Letter in input sequence */
|
|
struct dm_letter
|
|
{
|
|
char letter; /* Present letter in sequence */
|
|
const struct dm_letter *letters; /* List of possible successive letters */
|
|
const dm_codes *codes; /* Code sequence(s) for complete sequence */
|
|
};
|
|
|
|
typedef struct dm_letter dm_letter;
|
|
|
|
/* Codes for letter sequence at start of name, before a vowel, and any other. */
|
|
EOF
|
|
|
|
for my $key (sort keys %codes)
|
|
{
|
|
print $OUTPUT "static const dm_codes $key\[2\] =\n{\n"
|
|
. $codes{$key}
|
|
. "\n};\n";
|
|
}
|
|
|
|
print $OUTPUT <<EOF;
|
|
|
|
/* Coding for alternative following letters in sequence. */
|
|
EOF
|
|
|
|
sub hash2code
|
|
{
|
|
my ($ref, $letter) = @_;
|
|
|
|
my @letters = ();
|
|
|
|
my $h = $ref->[0];
|
|
for my $key (sort keys %$h)
|
|
{
|
|
$ref = $h->{$key};
|
|
my $children = "NULL";
|
|
if (defined $ref->[0])
|
|
{
|
|
$children = "letter_$letter$key";
|
|
hash2code($ref, "$letter$key");
|
|
}
|
|
my $codes = $ref->[1] // "NULL";
|
|
push(@letters, "\t{\n\t\t'$key', $children, $codes\n\t}");
|
|
}
|
|
|
|
print $OUTPUT "static const dm_letter letter_$letter\[\] =\n{\n";
|
|
for (@letters)
|
|
{
|
|
print $OUTPUT "$_,\n";
|
|
}
|
|
print $OUTPUT "\t{\n\t\t'\\0'\n\t}\n";
|
|
print $OUTPUT "};\n";
|
|
}
|
|
|
|
hash2code($table, '');
|
|
|
|
close $OUTPUT;
|
|
|
|
# Table adapted from https://www.jewishgen.org/InfoFiles/Soundex.html
|
|
#
|
|
# The conversion from the coding chart to the table should be self
|
|
# explanatory, but note the differences stated below.
|
|
#
|
|
# X = NC (not coded)
|
|
#
|
|
# The non-ASCII letters in the coding chart are coded with substitute
|
|
# lowercase ASCII letters, which sort after the uppercase ASCII letters:
|
|
#
|
|
# Ą => a (use '[' for table lookup)
|
|
# Ę => e (use '\\' for table lookup)
|
|
# Ţ => t (use ']' for table lookup)
|
|
#
|
|
# The rule for "UE" does not correspond to the coding chart, however
|
|
# it is used by all other known implementations, including the one at
|
|
# https://www.jewishgen.org/jos/jossound.htm (try e.g. "bouey").
|
|
#
|
|
# Note that the implementation assumes that vowels are assigned code
|
|
# 0 or 1. "J" can be either a vowel or a consonant.
|
|
#
|
|
|
|
__DATA__
|
|
AI,AJ,AY 0,1,X
|
|
AU 0,7,X
|
|
a X,X,6|X,X,X
|
|
A 0,X,X
|
|
B 7,7,7
|
|
CHS 5,54,54
|
|
CH 5,5,5|4,4,4
|
|
CK 5,5,5|45,45,45
|
|
CZ,CS,CSZ,CZS 4,4,4
|
|
C 5,5,5|4,4,4
|
|
DRZ,DRS 4,4,4
|
|
DS,DSH,DSZ 4,4,4
|
|
DZ,DZH,DZS 4,4,4
|
|
D,DT 3,3,3
|
|
EI,EJ,EY 0,1,X
|
|
EU 1,1,X
|
|
e X,X,6|X,X,X
|
|
E 0,X,X
|
|
FB 7,7,7
|
|
F 7,7,7
|
|
G 5,5,5
|
|
H 5,5,X
|
|
IA,IE,IO,IU 1,X,X
|
|
I 0,X,X
|
|
J 1,X,X|4,4,4
|
|
KS 5,54,54
|
|
KH 5,5,5
|
|
K 5,5,5
|
|
L 8,8,8
|
|
MN 66,66,66
|
|
M 6,6,6
|
|
NM 66,66,66
|
|
N 6,6,6
|
|
OI,OJ,OY 0,1,X
|
|
O 0,X,X
|
|
P,PF,PH 7,7,7
|
|
Q 5,5,5
|
|
RZ,RS 94,94,94|4,4,4
|
|
R 9,9,9
|
|
SCHTSCH,SCHTSH,SCHTCH 2,4,4
|
|
SCH 4,4,4
|
|
SHTCH,SHCH,SHTSH 2,4,4
|
|
SHT,SCHT,SCHD 2,43,43
|
|
SH 4,4,4
|
|
STCH,STSCH,SC 2,4,4
|
|
STRZ,STRS,STSH 2,4,4
|
|
ST 2,43,43
|
|
SZCZ,SZCS 2,4,4
|
|
SZT,SHD,SZD,SD 2,43,43
|
|
SZ 4,4,4
|
|
S 4,4,4
|
|
TCH,TTCH,TTSCH 4,4,4
|
|
TH 3,3,3
|
|
TRZ,TRS 4,4,4
|
|
TSCH,TSH 4,4,4
|
|
TS,TTS,TTSZ,TC 4,4,4
|
|
TZ,TTZ,TZS,TSZ 4,4,4
|
|
t 3,3,3|4,4,4
|
|
T 3,3,3
|
|
UI,UJ,UY,UE 0,1,X
|
|
U 0,X,X
|
|
V 7,7,7
|
|
W 7,7,7
|
|
X 5,54,54
|
|
Y 1,X,X
|
|
ZDZ,ZDZH,ZHDZH 2,4,4
|
|
ZD,ZHD 2,43,43
|
|
ZH,ZS,ZSCH,ZSH 4,4,4
|
|
Z 4,4,4
|