diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 90ceb77bbb..9ba7c5fc03 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.307 2009/03/31 22:12:46 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.308 2009/04/19 21:08:54 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -2718,7 +2718,7 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals) char *start_ptr; char *end_ptr; int input_len; - bool saw_high_bit = false; + bool saw_non_ascii = false; /* Make sure space remains in fieldvals[] */ if (fieldno >= maxfields) @@ -2783,8 +2783,8 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals) } } c = val & 0377; - if (IS_HIGHBIT_SET(c)) - saw_high_bit = true; + if (c == '\0' || IS_HIGHBIT_SET(c)) + saw_non_ascii = true; } break; case 'x': @@ -2808,8 +2808,8 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals) } } c = val & 0xff; - if (IS_HIGHBIT_SET(c)) - saw_high_bit = true; + if (c == '\0' || IS_HIGHBIT_SET(c)) + saw_non_ascii = true; } } break; @@ -2847,11 +2847,11 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals) *output_ptr++ = '\0'; /* - * If we de-escaped a char with the high bit set, make sure we still + * If we de-escaped a non-7-bit-ASCII char, make sure we still * have valid data for the db encoding. Avoid calling strlen here for * the sake of efficiency. */ - if (saw_high_bit) + if (saw_non_ascii) { char *fld = fieldvals[fieldno]; diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index a3d4d857c8..8551cd2753 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -24,7 +24,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.150 2009/04/14 22:18:47 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.151 2009/04/19 21:08:54 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -60,7 +60,7 @@ bool escape_string_warning = true; bool standard_conforming_strings = false; static bool warn_on_first_escape; -static bool saw_high_bit = false; +static bool saw_non_ascii = false; /* * literalbuf is used to accumulate literal values when multiple rules @@ -453,7 +453,7 @@ other . {xqstart} { warn_on_first_escape = true; - saw_high_bit = false; + saw_non_ascii = false; SET_YYLLOC(); if (standard_conforming_strings) BEGIN(xq); @@ -463,7 +463,7 @@ other . } {xestart} { warn_on_first_escape = false; - saw_high_bit = false; + saw_non_ascii = false; SET_YYLLOC(); BEGIN(xe); startlit(); @@ -477,10 +477,11 @@ other . {quotefail} { yyless(1); BEGIN(INITIAL); - /* check that the data remains valid if it might have been + /* + * check that the data remains valid if it might have been * made invalid by unescaping any chars. */ - if (saw_high_bit) + if (saw_non_ascii) pg_verifymbstr(literalbuf, literallen, false); yylval.str = litbufdup(); return SCONST; @@ -526,16 +527,16 @@ other . check_escape_warning(); addlitchar(c); - if (IS_HIGHBIT_SET(c)) - saw_high_bit = true; + if (c == '\0' || IS_HIGHBIT_SET(c)) + saw_non_ascii = true; } {xehexesc} { unsigned char c = strtoul(yytext+2, NULL, 16); check_escape_warning(); addlitchar(c); - if (IS_HIGHBIT_SET(c)) - saw_high_bit = true; + if (c == '\0' || IS_HIGHBIT_SET(c)) + saw_non_ascii = true; } {quotecontinue} { /* ignore */ @@ -1083,6 +1084,11 @@ litbuf_udeescape(unsigned char escape) } *out = '\0'; + /* + * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII + * codes; but it's probably not worth the trouble, since this isn't + * likely to be a performance-critical path. + */ pg_verifymbstr(new, out - new, false); return new; } @@ -1090,14 +1096,6 @@ litbuf_udeescape(unsigned char escape) static unsigned char unescape_single_char(unsigned char c) { - /* Normally we wouldn't expect to see \n where n has its high bit set - * but we set the flag to check the string if we do get it, so - * that this doesn't become a way of getting around the coding validity - * checks. - */ - if (IS_HIGHBIT_SET(c)) - saw_high_bit = true; - switch (c) { case 'b': @@ -1111,6 +1109,10 @@ unescape_single_char(unsigned char c) case 't': return '\t'; default: + /* check for backslash followed by non-7-bit-ASCII */ + if (c == '\0' || IS_HIGHBIT_SET(c)) + saw_non_ascii = true; + return c; } }