mirror of
https://git.postgresql.org/git/postgresql.git
synced 2025-01-06 15:24:56 +08:00
Use perfect hash for NFC and NFKC Unicode Normalization quick check
This makes the normalization quick check about 30% faster for NFC and 50% faster for NFKC than the binary search used previously. The hash lookup reuses the existing array of bit fields, previously searched with a binary search, to get the quick check property; the perfect hash function itself is generated as part of "make update-unicode" in src/common/unicode/.
Author: John Naylor
Reviewed-by: Mark Dilger, Michael Paquier
Discussion: https://postgr.es/m/CACPNZCt4fbJ0_bGrN5QPt34N4whv=mszM0LMVQdoa2rC9UMRXA@mail.gmail.com
This commit is contained in:
parent
85d08b8b72
commit
80f8eb79e2
@ -9,6 +9,10 @@
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
use FindBin;
|
||||
use lib "$FindBin::RealBin/../../tools/";
|
||||
use PerfectHash;
|
||||
|
||||
my %data;
|
||||
|
||||
print
|
||||
@ -18,13 +22,25 @@ print <<EOS;
|
||||
#include "common/unicode_norm.h"
|
||||
|
||||
/*
|
||||
* We use a bit field here to save space.
|
||||
* Normalization quick check entry for codepoint. We use a bit field
|
||||
* here to save space.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
unsigned int codepoint:21;
|
||||
signed int quickcheck:4; /* really UnicodeNormalizationQC */
|
||||
} pg_unicode_normprops;
|
||||
} pg_unicode_normprops;
|
||||
|
||||
/* Typedef for hash function on quick check table */
|
||||
typedef int (*qc_hash_func) (const void *key);
|
||||
|
||||
/* Information for quick check lookup with perfect hash function */
|
||||
typedef struct
|
||||
{
|
||||
const pg_unicode_normprops *normprops;
|
||||
qc_hash_func hash;
|
||||
int num_normprops;
|
||||
} pg_unicode_norminfo;
|
||||
EOS
|
||||
|
||||
foreach my $line (<ARGV>)
|
||||
@ -66,6 +82,7 @@ foreach my $prop (sort keys %data)
|
||||
"static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n";
|
||||
|
||||
my %subdata = %{ $data{$prop} };
|
||||
my @cp_packed;
|
||||
foreach my $cp (sort { $a <=> $b } keys %subdata)
|
||||
{
|
||||
my $qc;
|
||||
@ -82,7 +99,27 @@ foreach my $prop (sort keys %data)
|
||||
die;
|
||||
}
|
||||
printf "\t{0x%04X, %s},\n", $cp, $qc;
|
||||
|
||||
# Save the bytes as a string in network order.
|
||||
push @cp_packed, pack('N', $cp);
|
||||
}
|
||||
|
||||
print "};\n";
|
||||
|
||||
# Emit the definition of the perfect hash function.
|
||||
my $funcname = $prop . '_hash_func';
|
||||
my $f = PerfectHash::generate_hash_function(\@cp_packed, $funcname,
|
||||
fixed_key_length => 4);
|
||||
printf "\n/* Perfect hash function for %s */", $prop;
|
||||
print "\nstatic $f\n";
|
||||
|
||||
# Emit the structure that wraps the hash lookup information into
|
||||
# one variable.
|
||||
printf "/* Hash lookup information for %s */", $prop;
|
||||
printf "\nstatic const pg_unicode_norminfo ";
|
||||
printf "UnicodeNormInfo_%s = {\n", $prop;
|
||||
printf "\tUnicodeNormProps_%s,\n", $prop;
|
||||
printf "\t%s,\n", $funcname;
|
||||
printf "\t%d\n", scalar @cp_packed;
|
||||
printf "};\n";
|
||||
}
|
||||
|
@ -465,15 +465,32 @@ get_canonical_class(pg_wchar ch)
|
||||
return entry->comb_class;
|
||||
}
|
||||
|
||||
static int
|
||||
qc_compare(const void *p1, const void *p2)
|
||||
static const pg_unicode_normprops *
|
||||
qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
|
||||
{
|
||||
uint32 v1,
|
||||
v2;
|
||||
int h;
|
||||
uint32 hashkey;
|
||||
|
||||
v1 = ((const pg_unicode_normprops *) p1)->codepoint;
|
||||
v2 = ((const pg_unicode_normprops *) p2)->codepoint;
|
||||
return (v1 - v2);
|
||||
/*
|
||||
* Compute the hash function. The hash key is the codepoint with the bytes
|
||||
* in network order.
|
||||
*/
|
||||
hashkey = htonl(ch);
|
||||
h = norminfo->hash(&hashkey);
|
||||
|
||||
/* An out-of-range result implies no match */
|
||||
if (h < 0 || h >= norminfo->num_normprops)
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* Since it's a perfect hash, we need only match to the specific codepoint
|
||||
* it identifies.
|
||||
*/
|
||||
if (ch != norminfo->normprops[h].codepoint)
|
||||
return NULL;
|
||||
|
||||
/* Success! */
|
||||
return &norminfo->normprops[h];
|
||||
}
|
||||
|
||||
/*
|
||||
@ -482,26 +499,15 @@ qc_compare(const void *p1, const void *p2)
|
||||
static UnicodeNormalizationQC
|
||||
qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
|
||||
{
|
||||
pg_unicode_normprops key;
|
||||
pg_unicode_normprops *found = NULL;
|
||||
|
||||
key.codepoint = ch;
|
||||
const pg_unicode_normprops *found = NULL;
|
||||
|
||||
switch (form)
|
||||
{
|
||||
case UNICODE_NFC:
|
||||
found = bsearch(&key,
|
||||
UnicodeNormProps_NFC_QC,
|
||||
lengthof(UnicodeNormProps_NFC_QC),
|
||||
sizeof(pg_unicode_normprops),
|
||||
qc_compare);
|
||||
found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
|
||||
break;
|
||||
case UNICODE_NFKC:
|
||||
found = bsearch(&key,
|
||||
UnicodeNormProps_NFKC_QC,
|
||||
lengthof(UnicodeNormProps_NFKC_QC),
|
||||
sizeof(pg_unicode_normprops),
|
||||
qc_compare);
|
||||
found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
|
||||
break;
|
||||
default:
|
||||
Assert(false);
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -18,6 +18,11 @@ src/backend/utils/fmgrprotos\.h$
|
||||
# they match pgindent style, they'd look worse not better, so exclude them.
|
||||
kwlist_d\.h$
|
||||
#
|
||||
# This is generated by the scripts from src/common/unicode/. It uses
|
||||
# hash functions generated by PerfectHash.pm whose format looks worse with
|
||||
# pgindent.
|
||||
src/include/common/unicode_normprops_table\.h$
|
||||
#
|
||||
# Exclude ecpg test files to avoid breaking the ecpg regression tests
|
||||
# (but include files at the top level of the ecpg/test/ directory).
|
||||
src/interfaces/ecpg/test/.*/
|
||||
|
@ -3191,6 +3191,7 @@ pg_tz
|
||||
pg_tz_cache
|
||||
pg_tzenum
|
||||
pg_unicode_decomposition
|
||||
pg_unicode_norminfo
|
||||
pg_unicode_normprops
|
||||
pg_utf_to_local_combined
|
||||
pg_uuid_t
|
||||
|
Loading…
Reference in New Issue
Block a user