From 87289ff35ca372f8c88d26cc9bffa942dd8d34a8 Mon Sep 17 00:00:00 2001 From: Bruce Momjian Date: Wed, 28 Dec 2005 03:25:32 +0000 Subject: [PATCH] Add regression tests for CSV and \., and add automatic quoting of a single column dump that has a \. value, so the load works properly. I also added documentation describing this issue. --- doc/src/sgml/ref/copy.sgml | 33 ++++++++++++++++-------- src/backend/commands/copy.c | 40 +++++++++++++++++++---------- src/test/regress/expected/copy2.out | 3 +++ src/test/regress/sql/copy2.sql | 10 ++++++++ 4 files changed, 61 insertions(+), 25 deletions(-) diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml index 2d8dc945716..becaecf5b82 100644 --- a/doc/src/sgml/ref/copy.sgml +++ b/doc/src/sgml/ref/copy.sgml @@ -1,5 +1,5 @@ @@ -511,17 +511,28 @@ COPY tablename [ ( + + Because backslash is not a special character in the CSV + format, \., the end-of-data marker, could also appear + as a data value. To avoid any misinterpretation, a \. + data value appearing as a lone entry on a line is automatically + quoted on output, and on input, if quoted, is not interpreted as the + end-of-data marker. If you are loading a single-column table that + might have a column value of \., you might need to quote + that value in the input file. + + - - In CSV mode, all characters are significant. A quoted value - surrounded by white space, or any characters other than - DELIMITER, will include those characters. This can cause - errors if you import data from a system that pads CSV - lines with white space out to some fixed width. If such a situation - arises you might need to preprocess the CSV file to remove - the trailing white space, before importing the data into - PostgreSQL. - + + In CSV mode, all characters are significant. A quoted value + surrounded by white space, or any characters other than + DELIMITER, will include those characters. This can cause + errors if you import data from a system that pads CSV + lines with white space out to some fixed width. If such a situation + arises you might need to preprocess the CSV file to remove + the trailing white space, before importing the data into + PostgreSQL. + diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index ae1d40e2ef3..f97aafc2034 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.256 2005/12/27 18:10:48 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.257 2005/12/28 03:25:32 momjian Exp $ * *------------------------------------------------------------------------- */ @@ -244,7 +244,7 @@ static Datum CopyReadBinaryAttribute(CopyState cstate, bool *isnull); static void CopyAttributeOutText(CopyState cstate, char *server_string); static void CopyAttributeOutCSV(CopyState cstate, char *server_string, - bool use_quote); + bool use_quote, bool single_attr); static List *CopyGetAttnums(Relation rel, List *attnamelist); static char *limit_printout_length(const char *str); @@ -1284,7 +1284,8 @@ CopyTo(CopyState cstate) colname = NameStr(attr[attnum - 1]->attname); - CopyAttributeOutCSV(cstate, colname, false); + CopyAttributeOutCSV(cstate, colname, false, + list_length(cstate->attnumlist) == 1); } CopySendEndOfRow(cstate); @@ -1359,7 +1360,8 @@ CopyTo(CopyState cstate) value)); if (cstate->csv_mode) CopyAttributeOutCSV(cstate, string, - force_quote[attnum - 1]); + force_quote[attnum - 1], + list_length(cstate->attnumlist) == 1); else CopyAttributeOutText(cstate, string); } @@ -2968,7 +2970,7 @@ CopyAttributeOutText(CopyState cstate, char *server_string) */ static void CopyAttributeOutCSV(CopyState cstate, char *server_string, - bool use_quote) + bool use_quote, bool single_attr) { char *string; char c; @@ -2993,17 +2995,27 @@ CopyAttributeOutCSV(CopyState cstate, char *server_string, */ if (!use_quote) { - for (tstring = string; (c = *tstring) != '\0'; tstring += mblen) - { - if (c == delimc || c == quotec || c == '\n' || c == '\r') + /* + * Because '\.' can be a data value, quote it if it appears + * alone on a line so it is not interpreted as the end-of-data + * marker. + */ + if (single_attr && strcmp(string, "\\.") == 0) + use_quote = true; + else + { + for (tstring = string; (c = *tstring) != '\0'; tstring += mblen) { - use_quote = true; - break; + if (c == delimc || c == quotec || c == '\n' || c == '\r') + { + use_quote = true; + break; + } + if (cstate->encoding_embeds_ascii && IS_HIGHBIT_SET(c)) + mblen = pg_encoding_mblen(cstate->client_encoding, tstring); + else + mblen = 1; } - if (cstate->encoding_embeds_ascii && IS_HIGHBIT_SET(c)) - mblen = pg_encoding_mblen(cstate->client_encoding, tstring); - else - mblen = 1; } } diff --git a/src/test/regress/expected/copy2.out b/src/test/regress/expected/copy2.out index 78f20605702..524e88cbae6 100644 --- a/src/test/regress/expected/copy2.out +++ b/src/test/regress/expected/copy2.out @@ -194,6 +194,9 @@ COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\'; --test that we read consecutive LFs properly CREATE TEMP TABLE testnl (a int, b text, c int); COPY testnl FROM stdin CSV; +-- test end of copy marker +CREATE TEMP TABLE testeoc (a text); +COPY testeoc FROM stdin CSV; DROP TABLE x, y; DROP FUNCTION fn_x_before(); DROP FUNCTION fn_x_after(); diff --git a/src/test/regress/sql/copy2.sql b/src/test/regress/sql/copy2.sql index add8214d19d..d962d2e048e 100644 --- a/src/test/regress/sql/copy2.sql +++ b/src/test/regress/sql/copy2.sql @@ -139,6 +139,16 @@ COPY testnl FROM stdin CSV; inside",2 \. +-- test end of copy marker +CREATE TEMP TABLE testeoc (a text); + +COPY testeoc FROM stdin CSV; +a\. +\.b +c\.d +"\." +\. + DROP TABLE x, y; DROP FUNCTION fn_x_before();