Use perfect hash for NFC and NFKC Unicode Normalization quick check

This makes the normalization quick check about 30% faster for NFC and 50% faster for NFKC than the binary search used previously. The hash lookup reuses the existing array of bit fields used for the binary search to get the quick check property and is generated as part of "make update-unicode" in src/common/unicode/.

Author: John Naylor
Reviewed-by: Mark Dilger, Michael Paquier
Discussion: https://postgr.es/m/CACPNZCt4fbJ0_bGrN5QPt34N4whv=mszM0LMVQdoa2rC9UMRXA@mail.gmail.com
2025-01-06 15:24:56 +08:00 · 2020-10-11 19:09:01 +09:00 · 2020-10-11 19:09:01 +09:00 · 80f8eb79e2
commit 80f8eb79e2
parent 85d08b8b72
5 changed files with 1681 additions and 24 deletions
--- a/src/common/unicode/generate-unicode_normprops_table.pl
+++ b/src/common/unicode/generate-unicode_normprops_table.pl
@ -9,6 +9,10 @@
 use strict;
 use warnings;

+use FindBin;
+use lib "$FindBin::RealBin/../../tools/";
+use PerfectHash;
+
 my %data;

 print
@ -18,13 +22,25 @@ print <<EOS;
 #include "common/unicode_norm.h"

 /*
- * We use a bit field here to save space.
+ * Normalization quick check entry for codepoint.  We use a bit field
+ * here to save space.
 */
 typedef struct
 {
 	unsigned int codepoint:21;
 	signed int	quickcheck:4;	/* really UnicodeNormalizationQC */
-}			pg_unicode_normprops;
+} pg_unicode_normprops;
+
+/* Typedef for hash function on quick check table */
+typedef int (*qc_hash_func) (const void *key);
+
+/* Information for quick check lookup with perfect hash function */
+typedef struct
+{
+	const pg_unicode_normprops *normprops;
+	qc_hash_func	hash;
+	int		num_normprops;
+} pg_unicode_norminfo;
 EOS

 foreach my $line (<ARGV>)
@ -66,6 +82,7 @@ foreach my $prop (sort keys %data)
 	  "static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n";

 	my %subdata = %{ $data{$prop} };
+	my @cp_packed;
 	foreach my $cp (sort { $a <=> $b } keys %subdata)
 	{
 		my $qc;
@ -82,7 +99,27 @@ foreach my $prop (sort keys %data)
 			die;
 		}
 		printf "\t{0x%04X, %s},\n", $cp, $qc;
+
+		# Save the bytes as a string in network order.
+		push @cp_packed, pack('N', $cp);
 	}

 	print "};\n";
+
+	# Emit the definition of the perfect hash function.
+	my $funcname = $prop . '_hash_func';
+	my $f        = PerfectHash::generate_hash_function(\@cp_packed, $funcname,
+		fixed_key_length => 4);
+	printf "\n/* Perfect hash function for %s */", $prop;
+	print "\nstatic $f\n";
+
+	# Emit the structure that wraps the hash lookup information into
+	# one variable.
+	printf "/* Hash lookup information for %s */", $prop;
+	printf "\nstatic const pg_unicode_norminfo ";
+	printf "UnicodeNormInfo_%s = {\n", $prop;
+	printf "\tUnicodeNormProps_%s,\n", $prop;
+	printf "\t%s,\n",                  $funcname;
+	printf "\t%d\n",                   scalar @cp_packed;
+	printf "};\n";
 }
--- a/src/common/unicode_norm.c
+++ b/src/common/unicode_norm.c
@ -465,15 +465,32 @@ get_canonical_class(pg_wchar ch)
 		return entry->comb_class;
 }

-static int
-qc_compare(const void *p1, const void *p2)
+static const pg_unicode_normprops *
+qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
 {
-	uint32		v1,
-				v2;
+	int			h;
+	uint32		hashkey;

-	v1 = ((const pg_unicode_normprops *) p1)->codepoint;
-	v2 = ((const pg_unicode_normprops *) p2)->codepoint;
-	return (v1 - v2);
+	/*
+	 * Compute the hash function. The hash key is the codepoint with the bytes
+	 * in network order.
+	 */
+	hashkey = htonl(ch);
+	h = norminfo->hash(&hashkey);
+
+	/* An out-of-range result implies no match */
+	if (h < 0 || h >= norminfo->num_normprops)
+		return NULL;
+
+	/*
+	 * Since it's a perfect hash, we need only match to the specific codepoint
+	 * it identifies.
+	 */
+	if (ch != norminfo->normprops[h].codepoint)
+		return NULL;
+
+	/* Success! */
+	return &norminfo->normprops[h];
 }

 /*
@ -482,26 +499,15 @@ qc_compare(const void *p1, const void *p2)
 static UnicodeNormalizationQC
 qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
 {
-	pg_unicode_normprops key;
-	pg_unicode_normprops *found = NULL;
-
-	key.codepoint = ch;
+	const pg_unicode_normprops *found = NULL;

 	switch (form)
 	{
 		case UNICODE_NFC:
-			found = bsearch(&key,
-							UnicodeNormProps_NFC_QC,
-							lengthof(UnicodeNormProps_NFC_QC),
-							sizeof(pg_unicode_normprops),
-							qc_compare);
+			found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
 			break;
 		case UNICODE_NFKC:
-			found = bsearch(&key,
-							UnicodeNormProps_NFKC_QC,
-							lengthof(UnicodeNormProps_NFKC_QC),
-							sizeof(pg_unicode_normprops),
-							qc_compare);
+			found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
 			break;
 		default:
 			Assert(false);
--- a/src/include/common/unicode_normprops_table.h
+++ b/src/include/common/unicode_normprops_table.h
--- a/src/tools/pgindent/exclude_file_patterns
+++ b/src/tools/pgindent/exclude_file_patterns
@ -18,6 +18,11 @@ src/backend/utils/fmgrprotos\.h$
 # they match pgindent style, they'd look worse not better, so exclude them.
 kwlist_d\.h$
 #
+# This is generated by the scripts from src/common/unicode/.  It uses
+# hash functions generated by PerfectHash.pm whose format looks worse with
+# pgindent.
+src/include/common/unicode_normprops_table\.h$
+#
 # Exclude ecpg test files to avoid breaking the ecpg regression tests
 # (but include files at the top level of the ecpg/test/ directory).
 src/interfaces/ecpg/test/.*/
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@ -3191,6 +3191,7 @@ pg_tz
 pg_tz_cache
 pg_tzenum
 pg_unicode_decomposition
+pg_unicode_norminfo
 pg_unicode_normprops
 pg_utf_to_local_combined
 pg_uuid_t