Use perfect hash for NFC and NFKC Unicode Normalization quick check

This makes the normalization quick check about 30% faster for NFC and
50% faster for NFKC than the binary search used previously.  The hash
lookup reuses the existing array of bit fields, previously used for the
binary search, to get the quick check property.  The perfect hash
functions are generated as part of "make update-unicode" in
src/common/unicode/.

Author: John Naylor
Reviewed-by: Mark Dilger, Michael Paquier
Discussion: https://postgr.es/m/CACPNZCt4fbJ0_bGrN5QPt34N4whv=mszM0LMVQdoa2rC9UMRXA@mail.gmail.com
This commit is contained in:
Michael Paquier 2020-10-11 19:09:01 +09:00
parent 85d08b8b72
commit 80f8eb79e2
5 changed files with 1681 additions and 24 deletions

View File

@ -9,6 +9,10 @@
use strict; use strict;
use warnings; use warnings;
use FindBin;
use lib "$FindBin::RealBin/../../tools/";
use PerfectHash;
my %data; my %data;
print print
@ -18,13 +22,25 @@ print <<EOS;
#include "common/unicode_norm.h" #include "common/unicode_norm.h"
/* /*
* We use a bit field here to save space. * Normalization quick check entry for codepoint. We use a bit field
* here to save space.
*/ */
typedef struct typedef struct
{ {
unsigned int codepoint:21; unsigned int codepoint:21;
signed int quickcheck:4; /* really UnicodeNormalizationQC */ signed int quickcheck:4; /* really UnicodeNormalizationQC */
} pg_unicode_normprops; } pg_unicode_normprops;
/* Typedef for hash function on quick check table */
typedef int (*qc_hash_func) (const void *key);
/* Information for quick check lookup with perfect hash function */
typedef struct
{
const pg_unicode_normprops *normprops;
qc_hash_func hash;
int num_normprops;
} pg_unicode_norminfo;
EOS EOS
foreach my $line (<ARGV>) foreach my $line (<ARGV>)
@ -66,6 +82,7 @@ foreach my $prop (sort keys %data)
"static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n"; "static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n";
my %subdata = %{ $data{$prop} }; my %subdata = %{ $data{$prop} };
my @cp_packed;
foreach my $cp (sort { $a <=> $b } keys %subdata) foreach my $cp (sort { $a <=> $b } keys %subdata)
{ {
my $qc; my $qc;
@ -82,7 +99,27 @@ foreach my $prop (sort keys %data)
die; die;
} }
printf "\t{0x%04X, %s},\n", $cp, $qc; printf "\t{0x%04X, %s},\n", $cp, $qc;
# Save the bytes as a string in network order.
push @cp_packed, pack('N', $cp);
} }
print "};\n"; print "};\n";
# Emit the definition of the perfect hash function.
my $funcname = $prop . '_hash_func';
my $f = PerfectHash::generate_hash_function(\@cp_packed, $funcname,
fixed_key_length => 4);
printf "\n/* Perfect hash function for %s */", $prop;
print "\nstatic $f\n";
# Emit the structure that wraps the hash lookup information into
# one variable.
printf "/* Hash lookup information for %s */", $prop;
printf "\nstatic const pg_unicode_norminfo ";
printf "UnicodeNormInfo_%s = {\n", $prop;
printf "\tUnicodeNormProps_%s,\n", $prop;
printf "\t%s,\n", $funcname;
printf "\t%d\n", scalar @cp_packed;
printf "};\n";
} }

View File

@ -465,15 +465,32 @@ get_canonical_class(pg_wchar ch)
return entry->comb_class; return entry->comb_class;
} }
static int static const pg_unicode_normprops *
qc_compare(const void *p1, const void *p2) qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
{ {
uint32 v1, int h;
v2; uint32 hashkey;
v1 = ((const pg_unicode_normprops *) p1)->codepoint; /*
v2 = ((const pg_unicode_normprops *) p2)->codepoint; * Compute the hash function. The hash key is the codepoint with the bytes
return (v1 - v2); * in network order.
*/
hashkey = htonl(ch);
h = norminfo->hash(&hashkey);
/* An out-of-range result implies no match */
if (h < 0 || h >= norminfo->num_normprops)
return NULL;
/*
* Since it's a perfect hash, we need only match to the specific codepoint
* it identifies.
*/
if (ch != norminfo->normprops[h].codepoint)
return NULL;
/* Success! */
return &norminfo->normprops[h];
} }
/* /*
@ -482,26 +499,15 @@ qc_compare(const void *p1, const void *p2)
static UnicodeNormalizationQC static UnicodeNormalizationQC
qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch) qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
{ {
pg_unicode_normprops key; const pg_unicode_normprops *found = NULL;
pg_unicode_normprops *found = NULL;
key.codepoint = ch;
switch (form) switch (form)
{ {
case UNICODE_NFC: case UNICODE_NFC:
found = bsearch(&key, found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
UnicodeNormProps_NFC_QC,
lengthof(UnicodeNormProps_NFC_QC),
sizeof(pg_unicode_normprops),
qc_compare);
break; break;
case UNICODE_NFKC: case UNICODE_NFKC:
found = bsearch(&key, found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
UnicodeNormProps_NFKC_QC,
lengthof(UnicodeNormProps_NFKC_QC),
sizeof(pg_unicode_normprops),
qc_compare);
break; break;
default: default:
Assert(false); Assert(false);

File diff suppressed because it is too large Load Diff

View File

@ -18,6 +18,11 @@ src/backend/utils/fmgrprotos\.h$
# they match pgindent style, they'd look worse not better, so exclude them. # they match pgindent style, they'd look worse not better, so exclude them.
kwlist_d\.h$ kwlist_d\.h$
# #
# This is generated by the scripts from src/common/unicode/. It uses
# hash functions generated by PerfectHash.pm whose format looks worse with
# pgindent.
src/include/common/unicode_normprops_table\.h$
#
# Exclude ecpg test files to avoid breaking the ecpg regression tests # Exclude ecpg test files to avoid breaking the ecpg regression tests
# (but include files at the top level of the ecpg/test/ directory). # (but include files at the top level of the ecpg/test/ directory).
src/interfaces/ecpg/test/.*/ src/interfaces/ecpg/test/.*/

View File

@ -3191,6 +3191,7 @@ pg_tz
pg_tz_cache pg_tz_cache
pg_tzenum pg_tzenum
pg_unicode_decomposition pg_unicode_decomposition
pg_unicode_norminfo
pg_unicode_normprops pg_unicode_normprops
pg_utf_to_local_combined pg_utf_to_local_combined
pg_uuid_t pg_uuid_t