Fix ILIKE to honor collation when working in single-byte encodings.

The original collation patch only fixed the multi-byte code path. This change also ensures that ILIKE's idea of the case-folding rules is exactly the same as str_tolower's.
2025-01-30 19:00:29 +08:00 · 2011-04-09 17:12:39 -04:00 · 2011-04-09 17:12:39 -04:00 · 00f11f419c
commit 00f11f419c
parent f89e4dfa75
2 changed files with 72 additions and 20 deletions
--- a/src/backend/utils/adt/like.c
+++ b/src/backend/utils/adt/like.c
@ -19,8 +19,10 @@

 #include <ctype.h>

+#include "catalog/pg_collation.h"
 #include "mb/pg_wchar.h"
 #include "utils/builtins.h"
+#include "utils/pg_locale.h"


 #define LIKE_TRUE						1
@ -28,15 +30,19 @@
 #define LIKE_ABORT						(-1)


-static int	SB_MatchText(char *t, int tlen, char *p, int plen);
+static int	SB_MatchText(char *t, int tlen, char *p, int plen,
+						 pg_locale_t locale, bool locale_is_c);
 static text *SB_do_like_escape(text *, text *);

-static int	MB_MatchText(char *t, int tlen, char *p, int plen);
+static int	MB_MatchText(char *t, int tlen, char *p, int plen,
+						 pg_locale_t locale, bool locale_is_c);
 static text *MB_do_like_escape(text *, text *);

-static int	UTF8_MatchText(char *t, int tlen, char *p, int plen);
+static int	UTF8_MatchText(char *t, int tlen, char *p, int plen,
+						   pg_locale_t locale, bool locale_is_c);

-static int	SB_IMatchText(char *t, int tlen, char *p, int plen);
+static int	SB_IMatchText(char *t, int tlen, char *p, int plen,
+						  pg_locale_t locale, bool locale_is_c);

 static int	GenericMatchText(char *s, int slen, char *p, int plen);
 static int	Generic_Text_IC_like(text *str, text *pat, Oid collation);
@ -78,6 +84,24 @@ wchareq(char *p1, char *p2)
 * comparison.	This should be revisited when we install better locale support.
 */

+/*
+ * We do handle case-insensitive matching for single-byte encodings using
+ * fold-on-the-fly processing, however.
+ */
+static char
+SB_lower_char(unsigned char c, pg_locale_t locale, bool locale_is_c)
+{
+	if (locale_is_c)
+		return pg_ascii_tolower(c);
+#ifdef HAVE_LOCALE_T
+	else if (locale)
+		return tolower_l(c, locale);
+#endif
+	else
+		return pg_tolower(c);
+}
+
+
 #define NextByte(p, plen)	((p)++, (plen)--)

 /* Set up to compile like_match.c for multibyte characters */
@ -107,7 +131,7 @@ wchareq(char *p1, char *p2)
 #include "like_match.c"

 /* setup to compile like_match.c for single byte case insensitive matches */
-#define MATCH_LOWER
+#define MATCH_LOWER(t) SB_lower_char((unsigned char) (t), locale, locale_is_c)
 #define NextChar(p, plen) NextByte((p), (plen))
 #define MatchText SB_IMatchText

@ -121,15 +145,16 @@ wchareq(char *p1, char *p2)

 #include "like_match.c"

+/* Generic for all cases not requiring inline case-folding */
 static inline int
 GenericMatchText(char *s, int slen, char *p, int plen)
 {
 	if (pg_database_encoding_max_length() == 1)
-		return SB_MatchText(s, slen, p, plen);
+		return SB_MatchText(s, slen, p, plen, 0, true);
 	else if (GetDatabaseEncoding() == PG_UTF8)
-		return UTF8_MatchText(s, slen, p, plen);
+		return UTF8_MatchText(s, slen, p, plen, 0, true);
 	else
-		return MB_MatchText(s, slen, p, plen);
+		return MB_MatchText(s, slen, p, plen, 0, true);
 }

 static inline int
@ -142,8 +167,8 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation)

 	/*
 	 * For efficiency reasons, in the single byte case we don't call lower()
-	 * on the pattern and text, but instead call to_lower on each character.
-	 * In the multi-byte case we don't have much choice :-(
+	 * on the pattern and text, but instead call SB_lower_char on each
+	 * character.  In the multi-byte case we don't have much choice :-(
 	 */

 	if (pg_database_encoding_max_length() > 1)
@ -156,17 +181,42 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation)
 		s = VARDATA(str);
 		slen = (VARSIZE(str) - VARHDRSZ);
 		if (GetDatabaseEncoding() == PG_UTF8)
-			return UTF8_MatchText(s, slen, p, plen);
+			return UTF8_MatchText(s, slen, p, plen, 0, true);
 		else
-			return MB_MatchText(s, slen, p, plen);
+			return MB_MatchText(s, slen, p, plen, 0, true);
 	}
 	else
 	{
+		/*
+		 * Here we need to prepare locale information for SB_lower_char.
+		 * This should match the methods used in str_tolower().
+		 */
+		pg_locale_t	locale = 0;
+		bool	locale_is_c = false;
+
+		if (lc_ctype_is_c(collation))
+			locale_is_c = true;
+		else if (collation != DEFAULT_COLLATION_OID)
+		{
+			if (!OidIsValid(collation))
+			{
+				/*
+				 * This typically means that the parser could not resolve a
+				 * conflict of implicit collations, so report it that way.
+				 */
+				ereport(ERROR,
+						(errcode(ERRCODE_INDETERMINATE_COLLATION),
+						 errmsg("could not determine which collation to use for ILIKE"),
+						 errhint("Use the COLLATE clause to set the collation explicitly.")));
+			}
+			locale = pg_newlocale_from_collation(collation);
+		}
+
 		p = VARDATA_ANY(pat);
 		plen = VARSIZE_ANY_EXHDR(pat);
 		s = VARDATA_ANY(str);
 		slen = VARSIZE_ANY_EXHDR(str);
-		return SB_IMatchText(s, slen, p, plen);
+		return SB_IMatchText(s, slen, p, plen, locale, locale_is_c);
 	}
 }

@ -274,7 +324,7 @@ bytealike(PG_FUNCTION_ARGS)
 	p = VARDATA_ANY(pat);
 	plen = VARSIZE_ANY_EXHDR(pat);

-	result = (SB_MatchText(s, slen, p, plen) == LIKE_TRUE);
+	result = (SB_MatchText(s, slen, p, plen, 0, true) == LIKE_TRUE);

 	PG_RETURN_BOOL(result);
 }
@ -295,7 +345,7 @@ byteanlike(PG_FUNCTION_ARGS)
 	p = VARDATA_ANY(pat);
 	plen = VARSIZE_ANY_EXHDR(pat);

-	result = (SB_MatchText(s, slen, p, plen) != LIKE_TRUE);
+	result = (SB_MatchText(s, slen, p, plen, 0, true) != LIKE_TRUE);

 	PG_RETURN_BOOL(result);
 }
--- a/src/backend/utils/adt/like_match.c
+++ b/src/backend/utils/adt/like_match.c
@ -5,7 +5,7 @@
 *
 * This file is included by like.c four times, to provide matching code for
 * (1) single-byte encodings, (2) UTF8, (3) other multi-byte encodings,
- * and (4) case insensitive matches in single byte encodings.
+ * and (4) case insensitive matches in single-byte encodings.
 * (UTF8 is a special case because we can use a much more efficient version
 * of NextChar than can be used for general multi-byte encodings.)
 *
@ -14,7 +14,7 @@
 * NextChar
 * MatchText - to name of function wanted
 * do_like_escape - name of function if wanted - needs CHAREQ and CopyAdvChar
- * MATCH_LOWER - define for case (4), using to_lower on single-byte chars
+ * MATCH_LOWER - define for case (4) to specify case folding for 1-byte chars
 *
 * Copyright (c) 1996-2011, PostgreSQL Global Development Group
 *
@ -70,13 +70,14 @@
 */

 #ifdef MATCH_LOWER
-#define GETCHAR(t) ((char) tolower((unsigned char) (t)))
+#define GETCHAR(t) MATCH_LOWER(t)
 #else
 #define GETCHAR(t) (t)
 #endif

 static int
-MatchText(char *t, int tlen, char *p, int plen)
+MatchText(char *t, int tlen, char *p, int plen,
+		  pg_locale_t locale, bool locale_is_c)
 {
 	/* Fast path for match-everything pattern */
 	if (plen == 1 && *p == '%')
@ -170,7 +171,8 @@ MatchText(char *t, int tlen, char *p, int plen)
 			{
 				if (GETCHAR(*t) == firstpat)
 				{
-					int			matched = MatchText(t, tlen, p, plen);
+					int			matched = MatchText(t, tlen, p, plen,
+													locale, locale_is_c);

 					if (matched != LIKE_FALSE)
 						return matched; /* TRUE or ABORT */