diff --git a/src/common/unicode/.gitignore b/src/common/unicode/.gitignore
index 512862e538c..46243f701df 100644
--- a/src/common/unicode/.gitignore
+++ b/src/common/unicode/.gitignore
@@ -4,5 +4,6 @@
 # Downloaded files
 /CompositionExclusions.txt
 /DerivedNormalizationProps.txt
+/EastAsianWidth.txt
 /NormalizationTest.txt
 /UnicodeData.txt
diff --git a/src/common/unicode/Makefile b/src/common/unicode/Makefile
index eb14add28ad..a3683dd86b9 100644
--- a/src/common/unicode/Makefile
+++ b/src/common/unicode/Makefile
@@ -18,14 +18,14 @@ LIBS += $(PTHREAD_LIBS)
 # By default, do nothing.
 all:
 
-update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
+update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
 	mv $^ ../../../src/include/common/
 	$(MAKE) normalization-check
 
 # These files are part of the Unicode Character Database. Download
 # them on demand. The dependency on Makefile.global is for
 # UNICODE_VERSION.
-UnicodeData.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
+UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
 	$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
 
 # Generation of conversion tables used for string normalization with
@@ -38,6 +38,9 @@ unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt Composition
 unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt
 	$(PERL) $^ >$@
 
+unicode_east_asian_fw_table.h: generate-unicode_east_asian_fw_table.pl EastAsianWidth.txt
+	$(PERL) $^ >$@
+
 unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizationProps.txt
 	$(PERL) $^ >$@
 
@@ -64,6 +67,6 @@ clean:
 	rm -f $(OBJS) norm_test norm_test.o
 
 distclean: clean
-	rm -f UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
+	rm -f UnicodeData.txt EastAsianWidth.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
 
 maintainer-clean: distclean
diff --git a/src/common/unicode/generate-unicode_east_asian_fw_table.pl b/src/common/unicode/generate-unicode_east_asian_fw_table.pl
new file mode 100644
index 00000000000..45f7a4b7fe7
--- /dev/null
+++ b/src/common/unicode/generate-unicode_east_asian_fw_table.pl
@@ -0,0 +1,76 @@
+#!/usr/bin/perl
+#
+# Generate a sorted list of non-overlapping intervals of East Asian Wide (W)
+# and East Asian Fullwidth (F) characters, using Unicode data files as input.
+# Pass EastAsianWidth.txt as argument. The output is on stdout.
+#
+# Copyright (c) 2019-2021, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+my $range_start = undef;
+my ($first, $last);
+my $prev_last;
+
+print
+  "/* generated by src/common/unicode/generate-unicode_east_asian_fw_table.pl, do not edit */\n\n";
+
+print "static const struct mbinterval east_asian_fw[] = {\n";
+
+foreach my $line (<ARGV>)
+{
+	chomp $line;
+	$line =~ s/\s*#.*$//;
+	next if $line eq '';
+	my ($codepoint, $width) = split ';', $line;
+
+	if ($codepoint =~ /\.\./)
+	{
+		($first, $last) = split /\.\./, $codepoint;
+	}
+	else
+	{
+		$first = $last = $codepoint;
+	}
+
+	($first, $last) = map(hex, ($first, $last));
+
+	if ($width eq 'F' || $width eq 'W')
+	{
+		# fullwidth/wide characters
+		if (!defined($range_start))
+		{
+			# save for start of range if one hasn't been started yet
+			$range_start = $first;
+		}
+		elsif ($first != $prev_last + 1)
+		{
+			# ranges aren't contiguous; emit the last and start a new one
+			printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
+			$range_start = $first;
+		}
+	}
+	else
+	{
+		# not wide characters, print out previous range if any
+		if (defined($range_start))
+		{
+			printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
+			$range_start = undef;
+		}
+	}
+}
+continue
+{
+	$prev_last = $last;
+}
+
+# don't forget any ranges at the very end of the database (though there are none
+# as of Unicode 13.0)
+if (defined($range_start))
+{
+	printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
+}
+
+print "};\n";
diff --git a/src/common/wchar.c b/src/common/wchar.c
index 0636b8765ba..a6bffd06428 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -583,8 +583,8 @@ pg_utf_mblen(const unsigned char *s)
 
 struct mbinterval
 {
-	unsigned short first;
-	unsigned short last;
+	unsigned int first;
+	unsigned int last;
 };
 
 /* auxiliary function for binary search in interval table */
@@ -623,12 +623,6 @@ mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
  *		category code Mn or Me in the Unicode database) have a
  *		column width of 0.
  *
- *	  - Other format characters (general category code Cf in the Unicode
- *		database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
- *
- *	  - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
- *		have a column width of 0.
- *
  *	  - Spacing characters in the East Asian Wide (W) or East Asian
  *		FullWidth (F) category as defined in Unicode Technical
  *		Report #11 have a column width of 2.
@@ -645,6 +639,7 @@ static int
 ucs_wcwidth(pg_wchar ucs)
 {
 #include "common/unicode_combining_table.h"
+#include "common/unicode_east_asian_fw_table.h"
 
 	/* test for 8-bit control characters */
 	if (ucs == 0)
@@ -653,27 +648,25 @@ ucs_wcwidth(pg_wchar ucs)
 	if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
 		return -1;
 
-	/* binary search in table of non-spacing characters */
+	/*
+	 * binary search in table of non-spacing characters
+	 *
+	 * XXX: In the official Unicode sources, it is possible for a character to
+	 * be described as both non-spacing and wide at the same time. As of
+	 * Unicode 13.0, treating the non-spacing property as the determining
+	 * factor for display width leads to the correct behavior, so do that
+	 * search first.
+	 */
 	if (mbbisearch(ucs, combining,
 				   sizeof(combining) / sizeof(struct mbinterval) - 1))
 		return 0;
 
-	/*
-	 * if we arrive here, ucs is not a combining or C0/C1 control character
-	 */
+	/* binary search in table of wide characters */
+	if (mbbisearch(ucs, east_asian_fw,
+				   sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
+		return 2;
 
-	return 1 +
-		(ucs >= 0x1100 &&
-		 (ucs <= 0x115f ||		/* Hangul Jamo init. consonants */
-		  (ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a &&
-		   ucs != 0x303f) ||	/* CJK ... Yi */
-		  (ucs >= 0xac00 && ucs <= 0xd7a3) ||	/* Hangul Syllables */
-		  (ucs >= 0xf900 && ucs <= 0xfaff) ||	/* CJK Compatibility
-												 * Ideographs */
-		  (ucs >= 0xfe30 && ucs <= 0xfe6f) ||	/* CJK Compatibility Forms */
-		  (ucs >= 0xff00 && ucs <= 0xff5f) ||	/* Fullwidth Forms */
-		  (ucs >= 0xffe0 && ucs <= 0xffe6) ||
-		  (ucs >= 0x20000 && ucs <= 0x2ffff)));
+	return 1;
 }
 
 /*
diff --git a/src/include/common/unicode_east_asian_fw_table.h b/src/include/common/unicode_east_asian_fw_table.h
new file mode 100644
index 00000000000..b27f95b5dcc
--- /dev/null
+++ b/src/include/common/unicode_east_asian_fw_table.h
@@ -0,0 +1,120 @@
+/* generated by src/common/unicode/generate-unicode_east_asian_fw_table.pl, do not edit */
+
+static const struct mbinterval east_asian_fw[] = {
+	{0x1100, 0x115F},
+	{0x231A, 0x231B},
+	{0x2329, 0x232A},
+	{0x23E9, 0x23EC},
+	{0x23F0, 0x23F0},
+	{0x23F3, 0x23F3},
+	{0x25FD, 0x25FE},
+	{0x2614, 0x2615},
+	{0x2648, 0x2653},
+	{0x267F, 0x267F},
+	{0x2693, 0x2693},
+	{0x26A1, 0x26A1},
+	{0x26AA, 0x26AB},
+	{0x26BD, 0x26BE},
+	{0x26C4, 0x26C5},
+	{0x26CE, 0x26CE},
+	{0x26D4, 0x26D4},
+	{0x26EA, 0x26EA},
+	{0x26F2, 0x26F3},
+	{0x26F5, 0x26F5},
+	{0x26FA, 0x26FA},
+	{0x26FD, 0x26FD},
+	{0x2705, 0x2705},
+	{0x270A, 0x270B},
+	{0x2728, 0x2728},
+	{0x274C, 0x274C},
+	{0x274E, 0x274E},
+	{0x2753, 0x2755},
+	{0x2757, 0x2757},
+	{0x2795, 0x2797},
+	{0x27B0, 0x27B0},
+	{0x27BF, 0x27BF},
+	{0x2B1B, 0x2B1C},
+	{0x2B50, 0x2B50},
+	{0x2B55, 0x2B55},
+	{0x2E80, 0x2E99},
+	{0x2E9B, 0x2EF3},
+	{0x2F00, 0x2FD5},
+	{0x2FF0, 0x2FFB},
+	{0x3000, 0x303E},
+	{0x3041, 0x3096},
+	{0x3099, 0x30FF},
+	{0x3105, 0x312F},
+	{0x3131, 0x318E},
+	{0x3190, 0x31E3},
+	{0x31F0, 0x321E},
+	{0x3220, 0x3247},
+	{0x3250, 0x4DBF},
+	{0x4E00, 0xA48C},
+	{0xA490, 0xA4C6},
+	{0xA960, 0xA97C},
+	{0xAC00, 0xD7A3},
+	{0xF900, 0xFAFF},
+	{0xFE10, 0xFE19},
+	{0xFE30, 0xFE52},
+	{0xFE54, 0xFE66},
+	{0xFE68, 0xFE6B},
+	{0xFF01, 0xFF60},
+	{0xFFE0, 0xFFE6},
+	{0x16FE0, 0x16FE4},
+	{0x16FF0, 0x16FF1},
+	{0x17000, 0x187F7},
+	{0x18800, 0x18CD5},
+	{0x18D00, 0x18D08},
+	{0x1B000, 0x1B11E},
+	{0x1B150, 0x1B152},
+	{0x1B164, 0x1B167},
+	{0x1B170, 0x1B2FB},
+	{0x1F004, 0x1F004},
+	{0x1F0CF, 0x1F0CF},
+	{0x1F18E, 0x1F18E},
+	{0x1F191, 0x1F19A},
+	{0x1F200, 0x1F202},
+	{0x1F210, 0x1F23B},
+	{0x1F240, 0x1F248},
+	{0x1F250, 0x1F251},
+	{0x1F260, 0x1F265},
+	{0x1F300, 0x1F320},
+	{0x1F32D, 0x1F335},
+	{0x1F337, 0x1F37C},
+	{0x1F37E, 0x1F393},
+	{0x1F3A0, 0x1F3CA},
+	{0x1F3CF, 0x1F3D3},
+	{0x1F3E0, 0x1F3F0},
+	{0x1F3F4, 0x1F3F4},
+	{0x1F3F8, 0x1F43E},
+	{0x1F440, 0x1F440},
+	{0x1F442, 0x1F4FC},
+	{0x1F4FF, 0x1F53D},
+	{0x1F54B, 0x1F54E},
+	{0x1F550, 0x1F567},
+	{0x1F57A, 0x1F57A},
+	{0x1F595, 0x1F596},
+	{0x1F5A4, 0x1F5A4},
+	{0x1F5FB, 0x1F64F},
+	{0x1F680, 0x1F6C5},
+	{0x1F6CC, 0x1F6CC},
+	{0x1F6D0, 0x1F6D2},
+	{0x1F6D5, 0x1F6D7},
+	{0x1F6EB, 0x1F6EC},
+	{0x1F6F4, 0x1F6FC},
+	{0x1F7E0, 0x1F7EB},
+	{0x1F90C, 0x1F93A},
+	{0x1F93C, 0x1F945},
+	{0x1F947, 0x1F978},
+	{0x1F97A, 0x1F9CB},
+	{0x1F9CD, 0x1F9FF},
+	{0x1FA70, 0x1FA74},
+	{0x1FA78, 0x1FA7A},
+	{0x1FA80, 0x1FA86},
+	{0x1FA90, 0x1FAA8},
+	{0x1FAB0, 0x1FAB6},
+	{0x1FAC0, 0x1FAC2},
+	{0x1FAD0, 0x1FAD6},
+	{0x20000, 0x2FFFD},
+	{0x30000, 0x3FFFD},
+};