Modify COPY TO to emit carriage returns and newlines as backslash escapes

author Tom Lane <tgl@sss.pgh.pa.us>

Tue, 12 Feb 2002 21:25:41 +0000 (21:25 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Tue, 12 Feb 2002 21:25:41 +0000 (21:25 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Tue, 12 Feb 2002 21:25:41 +0000 (21:25 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Tue, 12 Feb 2002 21:25:41 +0000 (21:25 +0000)
diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml

index 850af1f077584c434b6109dd7df3564423fce16e..b4a226876a9076f5f1adce09e4ec411b604fb72f 100644 (file)
--- a/doc/src/sgml/ref/copy.sgml
+++ b/doc/src/sgml/ref/copy.sgml
@@ -1,5 +1,5 @@
  <!--
-$Header: /cvsroot/pgsql/doc/src/sgml/ref/copy.sgml,v 1.27 2002/01/20 22:19:56 petere Exp $
+$Header: /cvsroot/pgsql/doc/src/sgml/ref/copy.sgml,v 1.28 2002/02/12 21:25:34 tgl Exp $
  PostgreSQL documentation
  -->
  
@@ -74,7 +74,7 @@ COPY [ BINARY ] <replaceable class="parameter">table</replaceable> [ WITH OIDS ]
        <term><replaceable class="parameter">filename</replaceable></term>
        <listitem>
         <para>
-       The absolute Unix file name of the input or output file.
+       The absolute Unix path name of the input or output file.
         </para>
        </listitem>
       </varlistentry>
@@ -225,7 +225,7 @@ ERROR: <replaceable>reason</replaceable>
      By default, a text copy uses a tab ("\t") character as a delimiter
      between fields.  The field delimiter may be changed to any other single
      character with the keyword phrase USING DELIMITERS.  Characters
-    in data fields which happen to match the delimiter character will
+    in data fields that happen to match the delimiter character will
      be backslash quoted.
     </para>
     
@@ -265,8 +265,8 @@ ERROR: <replaceable>reason</replaceable>
      by the <application>PostgreSQL</application> user (the user ID the
      server runs as), not the client.
      <command>COPY</command> naming a file is only allowed to database
-    superusers, since it allows writing on any file that the backend has
-    privileges to write on.
+    superusers, since it allows reading or writing any file that the backend
+    has privileges to access.
      
      <tip>
       <para>
@@ -297,57 +297,109 @@ ERROR: <replaceable>reason</replaceable>
    <title>File Formats</title>
    <refsect2>
     <refsect2info>
-    <date>2001-01-02</date>
+    <date>2002-02-12</date>
     </refsect2info>
     <title>Text Format</title>
     <para>
-    When <command>COPY TO</command> is used without the BINARY option,
-    the file generated will have each row (instance) on a single line, with each
-    column (attribute) separated by the delimiter character.  Embedded
-    delimiter characters will be preceded by a backslash character
-    ("\").  The attribute values themselves are strings generated by the
-    output function associated with each attribute type.  The output
-    function for a type should not try to generate the backslash
-    character; this will be handled by <command>COPY</command> itself.
+    When <command>COPY</command> is used without the BINARY option,
+    the file read or written is a text file with one line per table row.
+    Columns (attributes) in a row are separated by the delimiter character.
+    The attribute values themselves are strings generated by the
+    output function, or acceptable to the input function, of each
+    attribute's data type.  The specified null-value string is used in
+    place of attributes that are NULL.
     </para>
     <para>
-    The actual format for each instance is
-    <programlisting>
-&lt;attr1&gt;&lt;<replaceable class=parameter>separator</replaceable>&gt;&lt;attr2&gt;&lt;<replaceable class=parameter>separator</replaceable>&gt;...&lt;<replaceable class=parameter>separator</replaceable>&gt;&lt;attr<replaceable class="parameter">n</replaceable>&gt;&lt;newline&gt;
-    </programlisting>
-    Note that the end of each row is marked by a Unix-style newline
-    ("\n").  <command>COPY FROM</command> will not behave as desired
-    if given a file containing DOS- or Mac-style newlines.
+    If WITH OIDS is specified, the OID is read or written as the first column,
+    preceding the user data columns.  (An error is raised if WITH OIDS is
+    specified for a table that does not have OIDs.)
     </para>
     <para>
-    The OID is emitted as the first column if WITH OIDS is specified.
-    (An error is raised if WITH OIDS is specified for a table that does not
-    have OIDs.)
+    End of data can be represented by a single line containing just
+    backslash-period (<literal>\.</>).  An end-of-data marker is
+    not necessary when reading from a Unix file, since the end of file
+    serves perfectly well; but an end marker must be provided when copying
+    data to or from a client application.
     </para>
     <para>
-    If <command>COPY TO</command> is sending its output to standard
-    output instead of a file, after the last row it will send a backslash ("\")
-    and a period (".") followed by a newline.
-    Similarly, if <command>COPY FROM</command> is reading
-    from standard input, it will expect a backslash ("\") and a period
-    (".") followed by a newline, as the first three characters on a
-    line to denote end-of-file.  However, <command>COPY FROM</command>
-    will terminate correctly (followed by the backend itself) if the
-    input connection is closed before this special end-of-file pattern is
-    found.
+    Backslash characters (<literal>\</>) may be used in the
+    <command>COPY</command> data to quote data characters that might otherwise
+    be taken as row or column delimiters.  In particular, the following
+    characters <emphasis>must</> be preceded by a backslash if they appear
+    as part of an attribute value: backslash itself, newline, and the current
+    delimiter character.
     </para>
     <para>
-    The backslash character has other special meanings.  A literal backslash
-    character is represented as two
-    consecutive backslashes ("\\").  A literal tab character is represented
-    as a backslash and a tab.  (If you are using something other than tab
-    as the column delimiter, backslash that delimiter character to include
-    it in data.)  A literal newline character is
-    represented as a backslash and a newline.  When loading text data
-    not generated by <application>PostgreSQL</application>,
-    you will need to convert backslash
-    characters ("\") to double-backslashes ("\\") to ensure that they 
-    are loaded properly.
+    The following special backslash sequences are recognized by
+    <command>COPY FROM</command>:
+
+   <informaltable>
+    <tgroup cols="2">
+     <thead>
+      <row>
+       <entry>Sequence</entry>
+       <entry>Represents</entry>
+      </row>
+     </thead>
+
+     <tbody>
+      <row>
+       <entry><literal>\b</></entry>
+       <entry>Backspace (ASCII 8)</entry>
+      </row>
+      <row>
+       <entry><literal>\f</></entry>
+       <entry>Form feed (ASCII 12)</entry>
+      </row>
+      <row>
+       <entry><literal>\n</></entry>
+       <entry>Newline (ASCII 10)</entry>
+      </row>
+      <row>
+       <entry><literal>\r</></entry>
+       <entry>Carriage return (ASCII 13)</entry>
+      </row>
+      <row>
+       <entry><literal>\t</></entry>
+       <entry>Tab (ASCII 9)</entry>
+      </row>
+      <row>
+       <entry><literal>\v</></entry>
+       <entry>Vertical tab (ASCII 11)</entry>
+      </row>
+      <row>
+       <entry><literal>\</><replaceable>digits</></entry>
+       <entry>Backslash followed by one to three octal digits specifies
+       the character with that numeric code</entry>
+      </row>
+     </tbody>
+    </tgroup>
+   </informaltable>
+
+    Presently, <command>COPY TO</command> will never emit an octal-digits
+    backslash sequence, but it does use the other sequences listed above
+    for those control characters.
+   </para>
+   <para>
+    Never put a backslash before a data character <literal>N</> or period
+    (<literal>.</>). Such pairs will be mistaken for the default null string
+    or the end-of-data marker, respectively.  Any other backslashed character
+    that is not mentioned in the above table will be taken to represent itself.
+   </para>
+   <para>
+    It is strongly recommended that applications generating COPY data convert
+    data newlines and carriage returns to the <literal>\n</> and
+    <literal>\r</> sequences respectively.  At present
+    (<productname>PostgreSQL</productname> 7.2 and older versions) it is
+    possible to represent a data carriage return without any special quoting,
+    and to represent a data newline by a backslash and newline.  However,
+    these representations will not be accepted by default in future releases.
+   </para>
+   <para>
+    Note that the end of each row is marked by a Unix-style newline
+    ("\n").  Presently, <command>COPY FROM</command> will not behave as
+    desired if given a file containing DOS- or Mac-style newlines.
+    This is expected to change in future releases.
     </para>
    </refsect2>
  
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c

index b944279d7e1e21f5a0302a98fc70427cff72e87f..f42b865687c6fa0caa6f0904fe0e4be786e1a69e 100644 (file)
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/commands/copy.c,v 1.144 2001/12/04 21:19:57 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/commands/copy.c,v 1.145 2002/02/12 21:25:41 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -41,7 +41,7 @@
  #endif
  
  #define ISOCTAL(c) (((c) >= '0') && ((c) <= '7'))
-#define VALUE(c) ((c) - '0')
+#define OCTVALUE(c) ((c) - '0')
  
  
  /* non-export function prototypes */
@@ -83,13 +83,13 @@ static int  server_encoding;
   * Internal communications functions
   */
  static void CopySendData(void *databuf, int datasize, FILE *fp);
-static void CopySendString(char *str, FILE *fp);
+static void CopySendString(const char *str, FILE *fp);
  static void CopySendChar(char c, FILE *fp);
  static void CopyGetData(void *databuf, int datasize, FILE *fp);
  static int     CopyGetChar(FILE *fp);
  static int     CopyGetEof(FILE *fp);
  static int     CopyPeekChar(FILE *fp);
-static void CopyDonePeek(FILE *fp, int c, int pickup);
+static void CopyDonePeek(FILE *fp, int c, bool pickup);
  
  /*
   * CopySendData sends output data either to the file
@@ -118,9 +118,9 @@ CopySendData(void *databuf, int datasize, FILE *fp)
  }
  
  static void
-CopySendString(char *str, FILE *fp)
+CopySendString(const char *str, FILE *fp)
  {
-       CopySendData(str, strlen(str), fp);
+       CopySendData((void *) str, strlen(str), fp);
  }
  
  static void
@@ -178,10 +178,12 @@ CopyGetEof(FILE *fp)
  
  /*
   * CopyPeekChar reads a byte in "peekable" mode.
+ *
   * after each call to CopyPeekChar, a call to CopyDonePeek _must_
   * follow, unless EOF was returned.
- * CopyDonePeek will either take the peeked char off the steam
- * (if pickup is != 0) or leave it on the stream (if pickup == 0)
+ *
+ * CopyDonePeek will either take the peeked char off the stream
+ * (if pickup is true) or leave it on the stream (if pickup is false).
   */
  static int
  CopyPeekChar(FILE *fp)
@@ -199,15 +201,13 @@ CopyPeekChar(FILE *fp)
  }
  
  static void
-CopyDonePeek(FILE *fp, int c, int pickup)
+CopyDonePeek(FILE *fp, int c, bool pickup)
  {
         if (!fp)
         {
                 if (pickup)
                 {
-                       /*
-                        * We want to pick it up
-                        */
+                       /* We want to pick it up */
                         (void) pq_getbyte();
                 }
                 /* If we didn't want to pick it up, just leave it where it sits */
@@ -219,7 +219,7 @@ CopyDonePeek(FILE *fp, int c, int pickup)
                         /* We don't want to pick it up - so put it back in there */
                         ungetc(c, fp);
                 }
-               /* If we wanted to pick it up, it's already there */
+               /* If we wanted to pick it up, it's already done */
         }
  }
  
@@ -1078,31 +1078,30 @@ CopyReadAttribute(FILE *fp, bool *isnull, char *delim, int *newline, char *null_
                                         {
                                                 int                     val;
  
-                                               val = VALUE(c);
+                                               val = OCTVALUE(c);
                                                 c = CopyPeekChar(fp);
                                                 if (ISOCTAL(c))
                                                 {
-                                                       val = (val << 3) + VALUE(c);
-                                                       CopyDonePeek(fp, c, 1);         /* Pick up the
-                                                                                                                * character! */
+                                                       val = (val << 3) + OCTVALUE(c);
+                                                       CopyDonePeek(fp, c, true /*pick up*/);
                                                         c = CopyPeekChar(fp);
                                                         if (ISOCTAL(c))
                                                         {
-                                                               CopyDonePeek(fp, c, 1); /* pick up! */
-                                                               val = (val << 3) + VALUE(c);
+                                                               val = (val << 3) + OCTVALUE(c);
+                                                               CopyDonePeek(fp, c, true /*pick up*/);
                                                         }
                                                         else
                                                         {
                                                                 if (c == EOF)
                                                                         goto endOfFile;
-                                                               CopyDonePeek(fp, c, 0); /* Return to stream! */
+                                                               CopyDonePeek(fp, c, false /*put back*/);
                                                         }
                                                 }
                                                 else
                                                 {
                                                         if (c == EOF)
                                                                 goto endOfFile;
-                                                       CopyDonePeek(fp, c, 0);         /* Return to stream! */
+                                                       CopyDonePeek(fp, c, false /*put back*/);
                                                 }
                                                 c = val & 0377;
                                         }
@@ -1144,6 +1143,7 @@ CopyReadAttribute(FILE *fp, bool *isnull, char *delim, int *newline, char *null_
                 }
                 appendStringInfoCharMacro(&attribute_buf, c);
  #ifdef MULTIBYTE
+               /* XXX shouldn't this be done even when encoding is the same? */
                 if (client_encoding != server_encoding)
                 {
                         /* get additional bytes of the char, if any */
@@ -1190,15 +1190,18 @@ CopyAttributeOut(FILE *fp, char *server_string, char *delim)
  {
         char       *string;
         char            c;
+       char            delimc = delim[0];
  
  #ifdef MULTIBYTE
+       bool            same_encoding;
         char       *string_start;
         int                     mblen;
         int                     i;
  #endif
  
  #ifdef MULTIBYTE
-       if (client_encoding != server_encoding)
+       same_encoding = (server_encoding == client_encoding);
+       if (!same_encoding)
         {
                 string = (char *) pg_server_to_client((unsigned char *) server_string,
                                                                                           strlen(server_string));
@@ -1207,31 +1210,64 @@ CopyAttributeOut(FILE *fp, char *server_string, char *delim)
         else
         {
                 string = server_string;
-               string_start = NULL;    /* unused, but keep compiler quiet */
+               string_start = NULL;
         }
  #else
         string = server_string;
  #endif
  
  #ifdef MULTIBYTE
-       for (; (mblen = (server_encoding == client_encoding ? 1 : pg_encoding_mblen(client_encoding, string))) &&
-                ((c = *string) != '\0'); string += mblen)
+       for (; (c = *string) != '\0'; string += mblen)
  #else
         for (; (c = *string) != '\0'; string++)
  #endif
         {
-               if (c == delim[0] || c == '\n' || c == '\\')
-                       CopySendChar('\\', fp);
  #ifdef MULTIBYTE
-               for (i = 0; i < mblen; i++)
-                       CopySendChar(*(string + i), fp);
-#else
-               CopySendChar(c, fp);
+               mblen = 1;
  #endif
+               switch (c)
+               {
+                       case '\b':
+                               CopySendString("\\b", fp);
+                               break;
+                       case '\f':
+                               CopySendString("\\f", fp);
+                               break;
+                       case '\n':
+                               CopySendString("\\n", fp);
+                               break;
+                       case '\r':
+                               CopySendString("\\r", fp);
+                               break;
+                       case '\t':
+                               CopySendString("\\t", fp);
+                               break;
+                       case '\v':
+                               CopySendString("\\v", fp);
+                               break;
+                       case '\\':
+                               CopySendString("\\\\", fp);
+                               break;
+                       default:
+                               if (c == delimc)
+                                       CopySendChar('\\', fp);
+                               CopySendChar(c, fp);
+#ifdef MULTIBYTE
+                               /* XXX shouldn't this be done even when encoding is same? */
+                               if (!same_encoding)
+                               {
+                                       /* send additional bytes of the char, if any */
+                                       mblen = pg_encoding_mblen(client_encoding, string);
+                                       for (i = 1; i < mblen; i++)
+                                               CopySendChar(string[i], fp);
+                               }
+#endif
+                               break;
+               }
         }
  
  #ifdef MULTIBYTE
-       if (client_encoding != server_encoding)
+       if (string_start)
                 pfree(string_start);    /* pfree pg_server_to_client result */
  #endif
  }
author	Tom Lane <tgl@sss.pgh.pa.us>
	Tue, 12 Feb 2002 21:25:41 +0000 (21:25 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Tue, 12 Feb 2002 21:25:41 +0000 (21:25 +0000)
doc/src/sgml/ref/copy.sgml		patch | blob | history
src/backend/commands/copy.c		patch | blob | history