Handle carriage returns and line feeds in COPY CSV mode.

author Bruce Momjian <bruce@momjian.us>

Sat, 12 Mar 2005 05:41:34 +0000 (05:41 +0000)

committer Bruce Momjian <bruce@momjian.us>

Sat, 12 Mar 2005 05:41:34 +0000 (05:41 +0000)
author Bruce Momjian <bruce@momjian.us>
Sat, 12 Mar 2005 05:41:34 +0000 (05:41 +0000)
committer Bruce Momjian <bruce@momjian.us>
Sat, 12 Mar 2005 05:41:34 +0000 (05:41 +0000)
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c

index 9f99bdd9e482868a29f5d92a572b8c40990689e4..ed815098aba7577ce2b29286bf108f8e40bc3a12 100644 (file)
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.236 2004/12/31 21:59:41 pgsql Exp $
+ *       $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.237 2005/03/12 05:41:34 momjian Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -98,7 +98,6 @@ static bool fe_eof;                           /* true if detected end of copy data */
  static EolType eol_type;               /* EOL type of input */
  static int     client_encoding;        /* remote side's character encoding */
  static int     server_encoding;        /* local encoding */
-static bool embedded_line_warning;
  
  /* these are just for error messages, see copy_in_error_callback */
  static bool copy_binary;               /* is it a binary copy? */
@@ -139,7 +138,7 @@ static void CopyTo(Relation rel, List *attnumlist, bool binary, bool oids,
  static void CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
   char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
                  List *force_notnull_atts);
-static bool CopyReadLine(void);
+static bool CopyReadLine(char * quote, char * escape);
  static char *CopyReadAttribute(const char *delim, const char *null_print,
                                   CopyReadResult *result, bool *isnull);
  static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
@@ -1191,7 +1190,6 @@ CopyTo(Relation rel, List *attnumlist, bool binary, bool oids,
         attr = tupDesc->attrs;
         num_phys_attrs = tupDesc->natts;
         attr_count = list_length(attnumlist);
-       embedded_line_warning = false;
  
         /*
          * Get info about the columns we need to process.
@@ -1718,7 +1716,8 @@ CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
                         ListCell   *cur;
  
                         /* Actually read the line into memory here */
-                       done = CopyReadLine();
+                       done = csv_mode ? 
+                               CopyReadLine(quote, escape) : CopyReadLine(NULL, NULL);
  
                         /*
                          * EOF at start of line means we're done.  If we see EOF after
@@ -2006,7 +2005,7 @@ CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
   * by newline.
   */
  static bool
-CopyReadLine(void)
+CopyReadLine(char * quote, char * escape)
  {
         bool            result;
         bool            change_encoding = (client_encoding != server_encoding);
@@ -2015,6 +2014,19 @@ CopyReadLine(void)
         int                     j;
         unsigned char s[2];
         char       *cvt;
+       bool        in_quote = false, last_was_esc = false, csv_mode = false;
+       char        quotec = '\0', escapec = '\0';
+
+       if (quote)
+       {
+               csv_mode = true;
+               quotec = quote[0];
+               escapec = escape[0];
+               /* ignore special escape processing if it's the same as quotec */
+               if (quotec == escapec)
+                       escapec = '\0';
+       }
+
  
         s[1] = 0;
  
@@ -2031,11 +2043,20 @@ CopyReadLine(void)
  
         /*
          * In this loop we only care for detecting newlines (\r and/or \n) and
-        * the end-of-copy marker (\.).  For backwards compatibility we allow
+        * the end-of-copy marker (\.).  
+        *
+        * In Text mode, for backwards compatibility we allow
          * backslashes to escape newline characters.  Backslashes other than
          * the end marker get put into the line_buf, since CopyReadAttribute
-        * does its own escape processing.      These four characters, and only
-        * these four, are assumed the same in frontend and backend encodings.
+        * does its own escape processing.      
+        *
+        * In CSV mode, CR and NL inside a quoted field are just part of the
+        * data value and are put in line_buf. We keep just enough state
+        * to know if we are currently in a quoted field or not.
+        *
+        * The four characters \r, \n, backslash and '.', and only these
+        * four, are assumed the same in frontend and backend encodings.
+        *
          * We do not assume that second and later bytes of a frontend
          * multibyte character couldn't look like ASCII characters.
          */
@@ -2047,13 +2068,49 @@ CopyReadLine(void)
                         result = true;
                         break;
                 }
-               if (c == '\r')
+
+               if (csv_mode)
+               {
+                       /*  
+                        * Dealing with quotes and escapes here is mildly tricky. If the
+                        * quote char is also the escape char, there's no problem - we  
+                        * just use the char as a toggle. If they are different, we need
+                        * to ensure that we only take account of an escape inside a quoted
+                        * field and immediately preceding a quote char, and not the
+                        * second in an escape-escape sequence.
+                        */ 
+
+                       if (in_quote && c == escapec)
+                               last_was_esc = ! last_was_esc;
+                       if (c == quotec && ! last_was_esc)
+                               in_quote = ! in_quote;
+                       if (c != escapec)
+                               last_was_esc = false;
+
+                       /*
+                        * updating the line count for embedded CR and/or LF chars is 
+                        * necessarily a little fragile - this test is probably about 
+                        * the best we can do.
+                        */ 
+                       if (in_quote && c == (eol_type == EOL_CR ? '\r' : '\n')) 
+                               copy_lineno++; 
+               }
+
+               if (!in_quote && c == '\r')
                 {
                         if (eol_type == EOL_NL)
-                               ereport(ERROR,
-                                               (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-                                                errmsg("literal carriage return found in data"),
-                                 errhint("Use \"\\r\" to represent carriage return.")));
+                       {
+                               if (! csv_mode)
+                                       ereport(ERROR,
+                                                       (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                                                        errmsg("literal carriage return found in data"),
+                                                        errhint("Use \"\\r\" to represent carriage return.")));
+                               else
+                                       ereport(ERROR,
+                                                       (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                                                        errmsg("unquoted carriage return found in CSV data"),
+                                                        errhint("Use quoted CSV field to represent carriage return.")));
+                       }
                         /* Check for \r\n on first line, _and_ handle \r\n. */
                         if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL)
                         {
@@ -2068,10 +2125,19 @@ CopyReadLine(void)
                                 {
                                         /* found \r, but no \n */
                                         if (eol_type == EOL_CRNL)
-                                               ereport(ERROR,
-                                                               (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-                                                errmsg("literal carriage return found in data"),
-                                                                errhint("Use \"\\r\" to represent carriage return.")));
+                                       {
+                                               if (!csv_mode)
+                                                       ereport(ERROR,
+                                                                       (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                                                                        errmsg("literal carriage return found in data"),
+                                                                        errhint("Use \"\\r\" to represent carriage return.")));
+                                               else
+                                                       ereport(ERROR,
+                                                                       (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                                                                        errmsg("unquoted carriage return found in data"),
+                                                                        errhint("Use quoted CSV field to represent carriage return.")));
+
+                                       }
  
                                         /*
                                          * if we got here, it is the first line and we didn't
@@ -2083,26 +2149,47 @@ CopyReadLine(void)
                         }
                         break;
                 }
-               if (c == '\n')
+               if (!in_quote && c == '\n')
                 {
                         if (eol_type == EOL_CR || eol_type == EOL_CRNL)
-                               ereport(ERROR,
-                                               (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-                                                errmsg("literal newline found in data"),
-                                                errhint("Use \"\\n\" to represent newline.")));
+                       {
+                               if (!csv_mode)
+                                       ereport(ERROR,
+                                                       (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                                                        errmsg("literal newline found in data"),
+                                                        errhint("Use \"\\n\" to represent newline.")));
+                               else
+                                       ereport(ERROR,
+                                                       (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                                                        errmsg("unquoted newline found in data"),
+                                                        errhint("Use quoted CSV field to represent newline.")));
+                                       
+                       }
                         eol_type = EOL_NL;
                         break;
                 }
-               if (c == '\\')
+
+               if ((line_buf.len == 0 || !csv_mode) && c == '\\')
                 {
-                       c = CopyGetChar();
-                       if (c == EOF)
+                       int c2;
+                       
+                       if (csv_mode)
+                               c2 = CopyPeekChar();
+                       else
+                               c2 = c = CopyGetChar();
+
+                       if (c2 == EOF)
                         {
                                 result = true;
+                               if (csv_mode)
+                                       CopyDonePeek(c2, true);
                                 break;
                         }
-                       if (c == '.')
+                       if (c2 == '.')
                         {
+                               if (csv_mode)
+                                       CopyDonePeek(c2, true); /* allow keep calling GetChar() */
+
                                 if (eol_type == EOL_CRNL)
                                 {
                                         c = CopyGetChar();
@@ -2140,8 +2227,12 @@ CopyReadLine(void)
                                 result = true;  /* report EOF */
                                 break;
                         }
-                       /* not EOF mark, so emit \ and following char literally */
-                       appendStringInfoCharMacro(&line_buf, '\\');
+                       
+                       if (csv_mode)
+                               CopyDonePeek(c2, false); /* not a dot, so put it back */ 
+                       else
+                               /* not EOF mark, so emit \ and following char literally */
+                               appendStringInfoCharMacro(&line_buf, '\\');
                 }
  
                 appendStringInfoCharMacro(&line_buf, c);
@@ -2369,34 +2460,6 @@ CopyReadAttributeCSV(const char *delim, const char *null_print, char *quote,
  
         for (;;)
         {
-               /* handle multiline quoted fields */
-               if (in_quote && line_buf.cursor >= line_buf.len)
-               {
-                       bool            done;
-
-                       switch (eol_type)
-                       {
-                               case EOL_NL:
-                                       appendStringInfoString(&attribute_buf, "\n");
-                                       break;
-                               case EOL_CR:
-                                       appendStringInfoString(&attribute_buf, "\r");
-                                       break;
-                               case EOL_CRNL:
-                                       appendStringInfoString(&attribute_buf, "\r\n");
-                                       break;
-                               case EOL_UNKNOWN:
-                                       /* shouldn't happen - just keep going */
-                                       break;
-                       }
-
-                       copy_lineno++;
-                       done = CopyReadLine();
-                       if (done && line_buf.len == 0)
-                               break;
-                       start_cursor = line_buf.cursor;
-               }
-
                 end_cursor = line_buf.cursor;
                 if (line_buf.cursor >= line_buf.len)
                         break;
@@ -2629,25 +2692,6 @@ CopyAttributeOutCSV(char *server_string, char *delim, char *quote,
                  !use_quote && (c = *test_string) != '\0';
                  test_string += mblen)
         {
-               /*
-                * We don't know here what the surrounding line end characters
-                * might be. It might not even be under postgres' control. So
-                * we simple warn on ANY embedded line ending character.
-                *
-                * This warning will disappear when we make line parsing field-aware,
-                * so that we can reliably read in embedded line ending characters
-                * regardless of the file's line-end context.
-                *
-                */
-
-               if (!embedded_line_warning  && (c == '\n' || c == '\r') )
-               {
-                       embedded_line_warning = true;
-                       elog(WARNING,
-                                "CSV fields with embedded linefeed or carriage return "
-                                "characters might not be able to be reimported");
-               }
-
                 if (c == delimc || c == quotec || c == '\n' || c == '\r')
                         use_quote = true;
                 if (!same_encoding)
author	Bruce Momjian <bruce@momjian.us>
	Sat, 12 Mar 2005 05:41:34 +0000 (05:41 +0000)
committer	Bruce Momjian <bruce@momjian.us>
	Sat, 12 Mar 2005 05:41:34 +0000 (05:41 +0000)