Use ICU by default at initdb time.

If the ICU locale is not specified, initialize the default collator
and retrieve the locale name from that.

Discussion: https://postgr.es/m/510d284759f6e943ce15096167760b2edcb2e700.camel@j-davis.com
Reviewed-by: Peter Eisentraut
This commit is contained in:
Jeff Davis 2023-03-09 10:52:41 -08:00
parent a7e584a7d6
commit 27b62377b4
18 changed files with 147 additions and 42 deletions

View File

@ -1,9 +1,16 @@
/*
* This test must be run in a database with UTF-8 encoding
* and a Unicode-aware locale.
*
* Also disable this file for ICU, because the test for the
* Turkish dotted I is not correct for many ICU locales. citext always
* uses the default collation, so it's not easy to restrict the test
* to the "tr-TR-x-icu" collation where it will succeed.
*/
SELECT getdatabaseencoding() <> 'UTF8' OR
current_setting('lc_ctype') = 'C'
current_setting('lc_ctype') = 'C' OR
(SELECT datlocprovider='i' FROM pg_database
WHERE datname=current_database())
AS skip_test \gset
\if :skip_test
\quit

View File

@ -1,9 +1,16 @@
/*
* This test must be run in a database with UTF-8 encoding
* and a Unicode-aware locale.
*
* Also disable this file for ICU, because the test for the
* Turkish dotted I is not correct for many ICU locales. citext always
* uses the default collation, so it's not easy to restrict the test
* to the "tr-TR-x-icu" collation where it will succeed.
*/
SELECT getdatabaseencoding() <> 'UTF8' OR
current_setting('lc_ctype') = 'C'
current_setting('lc_ctype') = 'C' OR
(SELECT datlocprovider='i' FROM pg_database
WHERE datname=current_database())
AS skip_test \gset
\if :skip_test
\quit

View File

@ -1,10 +1,17 @@
/*
* This test must be run in a database with UTF-8 encoding
* and a Unicode-aware locale.
*
* Also disable this file for ICU, because the test for the
* Turkish dotted I is not correct for many ICU locales. citext always
* uses the default collation, so it's not easy to restrict the test
* to the "tr-TR-x-icu" collation where it will succeed.
*/
SELECT getdatabaseencoding() <> 'UTF8' OR
current_setting('lc_ctype') = 'C'
current_setting('lc_ctype') = 'C' OR
(SELECT datlocprovider='i' FROM pg_database
WHERE datname=current_database())
AS skip_test \gset
\if :skip_test
\quit

View File

@ -1,3 +1,12 @@
-- unaccent is broken if the default collation is provided by ICU and
-- LC_CTYPE=C
SELECT current_setting('lc_ctype') = 'C' AND
(SELECT datlocprovider='i' FROM pg_database
WHERE datname=current_database())
AS skip_test \gset
\if :skip_test
\quit
\endif
CREATE EXTENSION unaccent;
-- must have a UTF8 database
SELECT getdatabaseencoding();

View File

@ -0,0 +1,8 @@
-- unaccent is broken if the default collation is provided by ICU and
-- LC_CTYPE=C
SELECT current_setting('lc_ctype') = 'C' AND
(SELECT datlocprovider='i' FROM pg_database
WHERE datname=current_database())
AS skip_test \gset
\if :skip_test
\quit

View File

@ -1,3 +1,14 @@
-- unaccent is broken if the default collation is provided by ICU and
-- LC_CTYPE=C
SELECT current_setting('lc_ctype') = 'C' AND
(SELECT datlocprovider='i' FROM pg_database
WHERE datname=current_database())
AS skip_test \gset
\if :skip_test
\quit
\endif
CREATE EXTENSION unaccent;
-- must have a UTF8 database

View File

@ -89,10 +89,28 @@ PostgreSQL documentation
and character set encoding. These can also be set separately for each
database when it is created. <command>initdb</command> determines those
settings for the template databases, which will serve as the default for
all other databases. By default, <command>initdb</command> uses the
locale provider <literal>libc</literal>, takes the locale settings from
the environment, and determines the encoding from the locale settings.
This is almost always sufficient, unless there are special requirements.
all other databases.
</para>
<para>
By default, <command>initdb</command> uses the ICU library to provide
locale services if the server was built with ICU support; otherwise it uses
the <literal>libc</literal> locale provider (see <xref
linkend="locale-providers"/>). To choose the specific ICU locale ID to
apply, use the option <option>--icu-locale</option>. Note that for
implementation reasons and to support legacy code,
<command>initdb</command> will still select and initialize libc locale
settings when the ICU locale provider is used.
</para>
<para>
Alternatively, <command>initdb</command> can use the locale provider
<literal>libc</literal>. To select this option, specify
<literal>--locale-provider=libc</literal>, or build the server without ICU
support. The <literal>libc</literal> locale provider takes the locale
settings from the environment, and determines the encoding from the locale
settings. This is almost always sufficient, unless there are special
requirements.
</para>
<para>
@ -103,17 +121,6 @@ PostgreSQL documentation
categories can give nonsensical results, so this should be used with care.
</para>
<para>
Alternatively, the ICU library can be used to provide locale services.
(Again, this only sets the default for subsequently created databases.) To
select this option, specify <literal>--locale-provider=icu</literal>.
To choose the specific ICU locale ID to apply, use the option
<option>--icu-locale</option>. Note that
for implementation reasons and to support legacy code,
<command>initdb</command> will still select and initialize libc locale
settings when the ICU locale provider is used.
</para>
<para>
When <command>initdb</command> runs, it will print out the locale settings
it has chosen. If you have complex requirements or specified multiple
@ -234,7 +241,13 @@ PostgreSQL documentation
<term><option>--icu-locale=<replaceable>locale</replaceable></option></term>
<listitem>
<para>
Specifies the ICU locale ID, if the ICU locale provider is used.
Specifies the ICU locale when the ICU provider is used. Locale support
is described in <xref linkend="locale"/>.
</para>
<para>
If this option is not specified, the locale is inherited from the
environment in which <command>initdb</command> runs. The environment's
locale is matched to a similar ICU locale name, if possible.
</para>
</listitem>
</varlistentry>
@ -307,10 +320,12 @@ PostgreSQL documentation
<term><option>--locale-provider={<literal>libc</literal>|<literal>icu</literal>}</option></term>
<listitem>
<para>
This option sets the locale provider for databases created in the
new cluster. It can be overridden in the <command>CREATE
This option sets the locale provider for databases created in the new
cluster. It can be overridden in the <command>CREATE
DATABASE</command> command when new databases are subsequently
created. The default is <literal>libc</literal>.
created. The default is <literal>icu</literal> if the server was
built with ICU support; otherwise the default is
<literal>libc</literal> (see <xref linkend="locale-providers"/>).
</para>
</listitem>
</varlistentry>

View File

@ -16,7 +16,7 @@ subdir = src/bin/initdb
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
override CPPFLAGS := -I$(libpq_srcdir) -I$(top_srcdir)/src/timezone $(CPPFLAGS)
override CPPFLAGS := -I$(libpq_srcdir) -I$(top_srcdir)/src/timezone $(ICU_CFLAGS) $(CPPFLAGS)
# Note: it's important that we link to encnames.o from libpgcommon, not
# from libpq, else we have risks of version skew if we run with a libpq
@ -24,7 +24,7 @@ override CPPFLAGS := -I$(libpq_srcdir) -I$(top_srcdir)/src/timezone $(CPPFLAGS)
# should ensure that that happens.
#
# We need libpq only because fe_utils does.
LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils $(libpq_pgport)
LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils $(libpq_pgport) $(ICU_LIBS)
# use system timezone data?
ifneq (,$(with_system_tzdata))

View File

@ -53,6 +53,9 @@
#include <netdb.h>
#include <sys/socket.h>
#include <sys/stat.h>
#ifdef USE_ICU
#include <unicode/ucol.h>
#endif
#include <unistd.h>
#include <signal.h>
#include <time.h>
@ -133,7 +136,11 @@ static char *lc_monetary = NULL;
static char *lc_numeric = NULL;
static char *lc_time = NULL;
static char *lc_messages = NULL;
#ifdef USE_ICU
static char locale_provider = COLLPROVIDER_ICU;
#else
static char locale_provider = COLLPROVIDER_LIBC;
#endif
static char *icu_locale = NULL;
static char *icu_rules = NULL;
static const char *default_text_search_config = NULL;
@ -2028,6 +2035,50 @@ check_icu_locale_encoding(int user_enc)
return true;
}
/*
 * Check that ICU accepts the locale name; or if not specified, retrieve the
 * default ICU locale.
 *
 * Takes no arguments and returns nothing; it operates on the file-level
 * icu_locale setting.  When icu_locale is set, a collator is opened for it
 * purely as a validity check.  When icu_locale is NULL, the default collator
 * is opened instead and icu_locale is set to a copy of that collator's
 * valid-locale name.
 *
 * Every ICU failure is reported through pg_fatal(), including the ICU error
 * name so the cause can be diagnosed.  In builds without USE_ICU the body is
 * compiled out and this function does nothing.
 */
static void
check_icu_locale(void)
{
#ifdef USE_ICU
	UCollator  *collator;
	UErrorCode	status;

	/* icu_locale may be NULL here, which requests the default collator */
	status = U_ZERO_ERROR;
	collator = ucol_open(icu_locale, &status);
	if (U_FAILURE(status))
	{
		if (icu_locale)
			pg_fatal("could not open collator for locale \"%s\": %s",
					 icu_locale, u_errorName(status));
		else
			pg_fatal("could not open collator for default locale: %s",
					 u_errorName(status));
	}

	/* if not specified, get locale from default collator */
	if (icu_locale == NULL)
	{
		const char *default_locale;

		status = U_ZERO_ERROR;
		default_locale = ucol_getLocaleByType(collator, ULOC_VALID_LOCALE,
											  &status);
		if (U_FAILURE(status))
		{
			/* release the collator before reporting the failure */
			ucol_close(collator);
			pg_fatal("could not determine default ICU locale: %s",
					 u_errorName(status));
		}

		/* keep our own copy; the collator is closed below */
		icu_locale = pg_strdup(default_locale);
	}

	ucol_close(collator);
#endif
}
/*
* set up the locale variables
*
@ -2081,8 +2132,7 @@ setlocales(void)
if (locale_provider == COLLPROVIDER_ICU)
{
if (!icu_locale)
pg_fatal("ICU locale must be specified");
check_icu_locale();
/*
* In supported builds, the ICU locale ID will be checked by the

View File

@ -97,11 +97,6 @@ SKIP:
if ($ENV{with_icu} eq 'yes')
{
command_fails_like(
[ 'initdb', '--no-sync', '--locale-provider=icu', "$tempdir/data2" ],
qr/initdb: error: ICU locale must be specified/,
'locale provider ICU requires --icu-locale');
command_ok(
[
'initdb', '--no-sync',
@ -116,7 +111,7 @@ if ($ENV{with_icu} eq 'yes')
'--locale-provider=icu', '--icu-locale=@colNumeric=lower',
"$tempdir/dataX"
],
qr/FATAL: could not open collator for locale/,
qr/error: could not open collator for locale/,
'fails for invalid ICU locale');
command_fails_like(

View File

@ -1758,7 +1758,7 @@ my %tests = (
create_sql =>
"CREATE DATABASE dump_test2 LOCALE = 'C' TEMPLATE = template0;",
regexp => qr/^
\QCREATE DATABASE dump_test2 \E.*\QLOCALE = 'C';\E
\QCREATE DATABASE dump_test2 \E.*\QLOCALE = 'C'\E
/xm,
like => { pg_dumpall_dbprivs => 1, },
},

View File

@ -13,7 +13,7 @@ program_version_ok('createdb');
program_options_handling_ok('createdb');
my $node = PostgreSQL::Test::Cluster->new('main');
$node->init;
$node->init(extra => ['--locale-provider=libc']);
$node->start;
$node->issues_sql_like(

View File

@ -14,9 +14,6 @@ override CPPFLAGS := \
'-DSHELLPROG="$(SHELL)"' \
$(CPPFLAGS)
# default encoding for regression tests
ENCODING = SQL_ASCII
ifneq ($(build_os),mingw32)
abs_builddir := $(shell pwd)
else

View File

@ -55,7 +55,7 @@ exec sql end declare section;
exec sql connect to 'unix:postgresql://localhost/ecpg2_regression' as main user :user USING "connectpw";
exec sql disconnect main;
exec sql connect to unix:postgresql://localhost/ecpg2_regression?connect_timeout=180&client_encoding=latin1 as main user regress_ecpg_user1/connectpw;
exec sql connect to unix:postgresql://localhost/ecpg2_regression?connect_timeout=180&client_encoding=sql_ascii as main user regress_ecpg_user1/connectpw;
exec sql disconnect main;
exec sql connect to "unix:postgresql://200.46.204.71/ecpg2_regression" as main user regress_ecpg_user1/connectpw;

View File

@ -117,7 +117,7 @@ main(void)
#line 56 "test5.pgc"
{ ECPGconnect(__LINE__, 0, "unix:postgresql://localhost/ecpg2_regression?connect_timeout=180 & client_encoding=latin1" , "regress_ecpg_user1" , "connectpw" , "main", 0); }
{ ECPGconnect(__LINE__, 0, "unix:postgresql://localhost/ecpg2_regression?connect_timeout=180 & client_encoding=sql_ascii" , "regress_ecpg_user1" , "connectpw" , "main", 0); }
#line 58 "test5.pgc"
{ ECPGdisconnect(__LINE__, "main");}

View File

@ -50,7 +50,7 @@
[NO_PID]: sqlca: code: 0, state: 00000
[NO_PID]: ecpg_finish: connection main closed
[NO_PID]: sqlca: code: 0, state: 00000
[NO_PID]: ECPGconnect: opening database ecpg2_regression on <DEFAULT> port <DEFAULT> with options connect_timeout=180 & client_encoding=latin1 for user regress_ecpg_user1
[NO_PID]: ECPGconnect: opening database ecpg2_regression on <DEFAULT> port <DEFAULT> with options connect_timeout=180 & client_encoding=sql_ascii for user regress_ecpg_user1
[NO_PID]: sqlca: code: 0, state: 00000
[NO_PID]: ecpg_finish: connection main closed
[NO_PID]: sqlca: code: 0, state: 00000

View File

@ -69,7 +69,6 @@ ecpg_test_files = files(
ecpg_regress_args = [
'--dbname=ecpg1_regression,ecpg2_regression',
'--create-role=regress_ecpg_user1,regress_ecpg_user2',
'--encoding=SQL_ASCII',
]
tests += {

View File

@ -12,7 +12,7 @@ if ($ENV{with_icu} ne 'yes')
}
my $node1 = PostgreSQL::Test::Cluster->new('node1');
$node1->init;
$node1->init(extra => ['--locale-provider=libc']);
$node1->start;
$node1->safe_psql('postgres',