Add ENCODING option to COPY TO/FROM and file_fdw.

author Itagaki Takahiro <itagaki.takahiro@gmail.com>

Mon, 21 Feb 2011 05:08:04 +0000 (14:08 +0900)

committer Itagaki Takahiro <itagaki.takahiro@gmail.com>

Mon, 21 Feb 2011 05:32:40 +0000 (14:32 +0900)
author Itagaki Takahiro <itagaki.takahiro@gmail.com>
Mon, 21 Feb 2011 05:08:04 +0000 (14:08 +0900)
committer Itagaki Takahiro <itagaki.takahiro@gmail.com>
Mon, 21 Feb 2011 05:32:40 +0000 (14:32 +0900)
diff --git a/contrib/file_fdw/file_fdw.c b/contrib/file_fdw/file_fdw.c

index 265afb5d9bc2f26a7649d64083644a13de249d74..6a84a00e8d39558b07d7767d496fa5f6c334d24d 100644 (file)
--- a/contrib/file_fdw/file_fdw.c
+++ b/contrib/file_fdw/file_fdw.c
@@ -55,6 +55,7 @@ static struct FileFdwOption valid_options[] = {
         { "quote",                      ForeignTableRelationId },
         { "escape",                     ForeignTableRelationId },
         { "null",                       ForeignTableRelationId },
+       { "encoding",           ForeignTableRelationId },
  
         /*
          * force_quote is not supported by file_fdw because it's for COPY TO.
diff --git a/doc/src/sgml/file-fdw.sgml b/doc/src/sgml/file-fdw.sgml

index e2921667184901b10c5eee30a80a270f517cb3bf..003c415b43a3561a40d034bf68063b182022f271 100644 (file)
--- a/doc/src/sgml/file-fdw.sgml
+++ b/doc/src/sgml/file-fdw.sgml
@@ -97,6 +97,17 @@
     </listitem>
    </varlistentry>
  
+  <varlistentry>
+   <term><literal>encoding</literal></term>
+
+   <listitem>
+    <para>
+     Specifies the file's encoding.
+     This is the same as <command>COPY</>'s <literal>ENCODING</literal> option.
+    </para>
+   </listitem>
+  </varlistentry>
+
   </variablelist>
  
   <para>
diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml

index 38424ad04b9d821f474005a1e2bf91704ce4dfe4..6429a4ef0d7fbbc60ae23685bb37f76851e3e748 100644 (file)
--- a/doc/src/sgml/ref/copy.sgml
+++ b/doc/src/sgml/ref/copy.sgml
@@ -40,7 +40,8 @@ COPY { <replaceable class="parameter">table_name</replaceable> [ ( <replaceable
      QUOTE '<replaceable class="parameter">quote_character</replaceable>'
      ESCAPE '<replaceable class="parameter">escape_character</replaceable>'
      FORCE_QUOTE { ( <replaceable class="parameter">column</replaceable> [, ...] ) | * }
-    FORCE_NOT_NULL ( <replaceable class="parameter">column</replaceable> [, ...] )
+    FORCE_NOT_NULL ( <replaceable class="parameter">column</replaceable> [, ...] )
+    ENCODING '<replaceable class="parameter">encoding_name</replaceable>'
  </synopsis>
   </refsynopsisdiv>
  
@@ -282,6 +283,18 @@ COPY { <replaceable class="parameter">table_name</replaceable> [ ( <replaceable
      </listitem>
     </varlistentry>
  
+   <varlistentry>
+    <term><literal>ENCODING</></term>
+    <listitem>
+     <para>
+      Specifies that the file is encoded in the <replaceable
+      class="parameter">encoding_name</replaceable>.  If this option is
+      omitted, the current client encoding is used. See the Notes below
+      for more details.
+     </para>
+    </listitem>
+   </varlistentry>
+
    </variablelist>
   </refsect1>
  
@@ -377,8 +390,9 @@ COPY <replaceable class="parameter">count</replaceable>
     </para>
  
     <para>
-    Input data is interpreted according to the current client encoding,
-    and output data is encoded in the current client encoding, even
+    Input data is interpreted according to the <literal>ENCODING</literal>
+    option or the current client encoding, and output data is encoded
+    in <literal>ENCODING</literal> or the current client encoding, even
      if the data does not pass through the client but is read from or
      written to a file directly by the server.
     </para>
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c

index 294450ef660d6f752150915077653e49638a0801..cac11a6c64107a2f977d2d7e5dc0d29fce5e35c7 100644 (file)
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -95,8 +95,8 @@ typedef struct CopyStateData
                                                                  * dest == COPY_NEW_FE in COPY FROM */
         bool            fe_eof;                 /* true if detected end of copy data */
         EolType         eol_type;               /* EOL type of input */
-       int                     client_encoding;        /* remote side's character encoding */
-       bool            need_transcoding;               /* client encoding diff from server? */
+       int                     file_encoding;  /* file or remote side's character encoding */
+       bool            need_transcoding;               /* file encoding diff from server? */
         bool            encoding_embeds_ascii;  /* ASCII can be non-first byte? */
  
         /* parameters from the COPY command */
@@ -110,7 +110,7 @@ typedef struct CopyStateData
         bool            header_line;    /* CSV header line? */
         char       *null_print;         /* NULL marker string (server encoding!) */
         int                     null_print_len; /* length of same */
-       char       *null_print_client;          /* same converted to client encoding */
+       char       *null_print_client;          /* same converted to file encoding */
         char       *delim;                      /* column delimiter (must be 1 byte) */
         char       *quote;                      /* CSV quote char (must be 1 byte) */
         char       *escape;                     /* CSV escape char (must be 1 byte) */
@@ -845,6 +845,8 @@ ProcessCopyOptions(CopyState cstate,
         if (cstate == NULL)
                 cstate = (CopyStateData *) palloc0(sizeof(CopyStateData));
  
+       cstate->file_encoding = -1;
+
         /* Extract options from the statement node tree */
         foreach(option, options)
         {
@@ -948,6 +950,19 @@ ProcessCopyOptions(CopyState cstate,
                                                  errmsg("argument to option \"%s\" must be a list of column names",
                                                                 defel->defname)));
                 }
+               else if (strcmp(defel->defname, "encoding") == 0)
+               {
+                       if (cstate->file_encoding >= 0)
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_SYNTAX_ERROR),
+                                                errmsg("conflicting or redundant options")));
+                       cstate->file_encoding = pg_char_to_encoding(defGetString(defel));
+                       if (cstate->file_encoding < 0)
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                                errmsg("argument to option \"%s\" must be a valid encoding name",
+                                                               defel->defname)));
+               }
                 else
                         ereport(ERROR,
                                         (errcode(ERRCODE_SYNTAX_ERROR),
@@ -1278,17 +1293,20 @@ BeginCopy(bool is_from,
                 }
         }
  
+       /* Use client encoding when ENCODING option is not specified. */
+       if (cstate->file_encoding < 0)
+               cstate->file_encoding = pg_get_client_encoding();
+
         /*
-        * Set up encoding conversion info.  Even if the client and server
-        * encodings are the same, we must apply pg_client_to_server() to validate
+        * Set up encoding conversion info.  Even if the file and server
+        * encodings are the same, we must apply pg_any_to_server() to validate
          * data in multibyte encodings.
          */
-       cstate->client_encoding = pg_get_client_encoding();
         cstate->need_transcoding =
-               (cstate->client_encoding != GetDatabaseEncoding() ||
+               (cstate->file_encoding != GetDatabaseEncoding() ||
                  pg_database_encoding_max_length() > 1);
         /* See Multibyte encoding comment above */
-       cstate->encoding_embeds_ascii = PG_ENCODING_IS_CLIENT_ONLY(cstate->client_encoding);
+       cstate->encoding_embeds_ascii = PG_ENCODING_IS_CLIENT_ONLY(cstate->file_encoding);
  
         cstate->copy_dest = COPY_FILE;          /* default */
  
@@ -1526,12 +1544,13 @@ CopyTo(CopyState cstate)
         else
         {
                 /*
-                * For non-binary copy, we need to convert null_print to client
+                * For non-binary copy, we need to convert null_print to file
                  * encoding, because it will be sent directly with CopySendString.
                  */
                 if (cstate->need_transcoding)
-                       cstate->null_print_client = pg_server_to_client(cstate->null_print,
-                                                                                                        cstate->null_print_len);
+                       cstate->null_print_client = pg_server_to_any(cstate->null_print,
+                                                                                                                cstate->null_print_len,
+                                                                                                                cstate->file_encoding);
  
                 /* if a header has been requested send the line */
                 if (cstate->header_line)
@@ -2608,8 +2627,9 @@ CopyReadLine(CopyState cstate)
         {
                 char       *cvt;
  
-               cvt = pg_client_to_server(cstate->line_buf.data,
-                                                                 cstate->line_buf.len);
+               cvt = pg_any_to_server(cstate->line_buf.data,
+                                                          cstate->line_buf.len,
+                                                          cstate->file_encoding);
                 if (cvt != cstate->line_buf.data)
                 {
                         /* transfer converted data back to line_buf */
@@ -2854,7 +2874,7 @@ CopyReadLineText(CopyState cstate)
                         /* -----
                          * get next character
                          * Note: we do not change c so if it isn't \., we can fall
-                        * through and continue processing for client encoding.
+                        * through and continue processing for file encoding.
                          * -----
                          */
                         c2 = copy_raw_buf[raw_buf_ptr];
@@ -2968,7 +2988,7 @@ not_end_of_copy:
  
                         mblen_str[0] = c;
                         /* All our encodings only read the first byte to get the length */
-                       mblen = pg_encoding_mblen(cstate->client_encoding, mblen_str);
+                       mblen = pg_encoding_mblen(cstate->file_encoding, mblen_str);
                         IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(mblen - 1);
                         IF_NEED_REFILL_AND_EOF_BREAK(mblen - 1);
                         raw_buf_ptr += mblen - 1;
@@ -3467,7 +3487,7 @@ CopyAttributeOutText(CopyState cstate, char *string)
         char            delimc = cstate->delim[0];
  
         if (cstate->need_transcoding)
-               ptr = pg_server_to_client(string, strlen(string));
+               ptr = pg_server_to_any(string, strlen(string), cstate->file_encoding);
         else
                 ptr = string;
  
@@ -3540,7 +3560,7 @@ CopyAttributeOutText(CopyState cstate, char *string)
                                 start = ptr++;  /* we include char in next run */
                         }
                         else if (IS_HIGHBIT_SET(c))
-                               ptr += pg_encoding_mblen(cstate->client_encoding, ptr);
+                               ptr += pg_encoding_mblen(cstate->file_encoding, ptr);
                         else
                                 ptr++;
                 }
@@ -3627,7 +3647,7 @@ CopyAttributeOutCSV(CopyState cstate, char *string,
                 use_quote = true;
  
         if (cstate->need_transcoding)
-               ptr = pg_server_to_client(string, strlen(string));
+               ptr = pg_server_to_any(string, strlen(string), cstate->file_encoding);
         else
                 ptr = string;
  
@@ -3654,7 +3674,7 @@ CopyAttributeOutCSV(CopyState cstate, char *string,
                                         break;
                                 }
                                 if (IS_HIGHBIT_SET(c) && cstate->encoding_embeds_ascii)
-                                       tptr += pg_encoding_mblen(cstate->client_encoding, tptr);
+                                       tptr += pg_encoding_mblen(cstate->file_encoding, tptr);
                                 else
                                         tptr++;
                         }
@@ -3678,7 +3698,7 @@ CopyAttributeOutCSV(CopyState cstate, char *string,
                                 start = ptr;    /* we include char in next run */
                         }
                         if (IS_HIGHBIT_SET(c) && cstate->encoding_embeds_ascii)
-                               ptr += pg_encoding_mblen(cstate->client_encoding, ptr);
+                               ptr += pg_encoding_mblen(cstate->file_encoding, ptr);
                         else
                                 ptr++;
                 }
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y

index c6811a11bd141b44eb84716fa0e2108d9bc30b3e..cbfacec4495df3726b8f0623ec69507fcfebd82d 100644 (file)
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -2236,6 +2236,10 @@ copy_opt_item:
                                 {
                                         $$ = makeDefElem("force_not_null", (Node *)$4);
                                 }
+                       | ENCODING Sconst
+                               {
+                                       $$ = makeDefElem("encoding", (Node *)makeString($2));
+                               }
                 ;
  
  /* The following exist for backward compatibility with very old versions */
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c

index 5ee74f747d06e010ad109aa4a77ab894486a3519..b8a2728e4f5d269ef5fa7c622a6af18564b3e3a2 100644 (file)
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -496,6 +496,17 @@ pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
   */
  char *
  pg_client_to_server(const char *s, int len)
+{
+       Assert(ClientEncoding);
+
+       return pg_any_to_server(s, len, ClientEncoding->encoding);
+}
+
+/*
+ * convert any encoding to server encoding.
+ */
+char *
+pg_any_to_server(const char *s, int len, int encoding)
  {
         Assert(DatabaseEncoding);
         Assert(ClientEncoding);
@@ -503,8 +514,8 @@ pg_client_to_server(const char *s, int len)
         if (len <= 0)
                 return (char *) s;
  
-       if (ClientEncoding->encoding == DatabaseEncoding->encoding ||
-               ClientEncoding->encoding == PG_SQL_ASCII)
+       if (encoding == DatabaseEncoding->encoding ||
+               encoding == PG_SQL_ASCII)
         {
                 /*
                  * No conversion is needed, but we must still validate the data.
@@ -524,8 +535,8 @@ pg_client_to_server(const char *s, int len)
                  * to the parser but we have no way to convert it.      We compromise by
                  * rejecting the data if it contains any non-ASCII characters.
                  */
-               if (PG_VALID_BE_ENCODING(ClientEncoding->encoding))
-                       (void) pg_verify_mbstr(ClientEncoding->encoding, s, len, false);
+               if (PG_VALID_BE_ENCODING(encoding))
+                       (void) pg_verify_mbstr(encoding, s, len, false);
                 else
                 {
                         int                     i;
@@ -543,7 +554,11 @@ pg_client_to_server(const char *s, int len)
                 return (char *) s;
         }
  
-       return perform_default_encoding_conversion(s, len, true);
+       if (ClientEncoding->encoding == encoding)
+               return perform_default_encoding_conversion(s, len, true);
+       else
+               return (char *) pg_do_encoding_conversion(
+                       (unsigned char *) s, len, encoding, DatabaseEncoding->encoding);
  }
  
  /*
@@ -551,6 +566,17 @@ pg_client_to_server(const char *s, int len)
   */
  char *
  pg_server_to_client(const char *s, int len)
+{
+       Assert(ClientEncoding);
+
+       return pg_any_to_server(s, len, ClientEncoding->encoding);
+}
+
+/*
+ * convert server encoding to any encoding.
+ */
+char *
+pg_server_to_any(const char *s, int len, int encoding)
  {
         Assert(DatabaseEncoding);
         Assert(ClientEncoding);
@@ -558,12 +584,16 @@ pg_server_to_client(const char *s, int len)
         if (len <= 0)
                 return (char *) s;
  
-       if (ClientEncoding->encoding == DatabaseEncoding->encoding ||
-               ClientEncoding->encoding == PG_SQL_ASCII ||
+       if (encoding == DatabaseEncoding->encoding ||
+               encoding == PG_SQL_ASCII ||
                 DatabaseEncoding->encoding == PG_SQL_ASCII)
                 return (char *) s;              /* assume data is valid */
  
-       return perform_default_encoding_conversion(s, len, false);
+       if (ClientEncoding->encoding == encoding)
+               return perform_default_encoding_conversion(s, len, false);
+       else
+               return (char *) pg_do_encoding_conversion(
+                       (unsigned char *) s, len, DatabaseEncoding->encoding, encoding);
  }
  
  /*
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h

index 565b53b3e6ecaa89d362bd3f28b760c97115ce2a..85a7b2f87dd257ec5bd4e553299017b630811a55 100644 (file)
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -420,6 +420,8 @@ extern unsigned char *pg_do_encoding_conversion(unsigned char *src, int len,
  
  extern char *pg_client_to_server(const char *s, int len);
  extern char *pg_server_to_client(const char *s, int len);
+extern char *pg_any_to_server(const char *s, int len, int encoding);
+extern char *pg_server_to_any(const char *s, int len, int encoding);
  
  extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc);
  extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc);
diff --git a/src/test/regress/expected/copy2.out b/src/test/regress/expected/copy2.out

index 15cbe02977008dcf3d7f1f482bdd65b30793f551..8e2bc0c25049886e5b1761ef6cc0a0c87886673e 100644 (file)
--- a/src/test/regress/expected/copy2.out
+++ b/src/test/regress/expected/copy2.out
@@ -46,10 +46,10 @@ CONTEXT:  COPY x, line 1: "2001     231     \N      \N"
  COPY x from stdin;
  ERROR:  extra data after last expected column
  CONTEXT:  COPY x, line 1: "2002        232     40      50      60      70      80"
--- various COPY options: delimiters, oids, NULL string
+-- various COPY options: delimiters, oids, NULL string, encoding
  COPY x (b, c, d, e) from stdin with oids delimiter ',' null 'x';
  COPY x from stdin WITH DELIMITER AS ';' NULL AS '';
-COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X';
+COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X' ENCODING 'sql_ascii';
  -- check results of copy in
  SELECT * FROM x;
     a   | b  |     c      |   d    |          e           
@@ -187,7 +187,7 @@ COPY y TO stdout WITH CSV QUOTE '''' DELIMITER '|';
  Jackson, Sam|\h
  It is "perfect".|      
  ''|
-COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\';
+COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\' ENCODING 'sql_ascii';
  "Jackson, Sam","\\h"
  "It is \"perfect\"."," "
  "",
diff --git a/src/test/regress/sql/copy2.sql b/src/test/regress/sql/copy2.sql

index c2e8b037e74be329c0829e4c6c25c4a4106e23c7..6322c8fba43dbf77e868d3ef9ef283232d659640 100644 (file)
--- a/src/test/regress/sql/copy2.sql
+++ b/src/test/regress/sql/copy2.sql
@@ -72,7 +72,7 @@ COPY x from stdin;
  2002   232     40      50      60      70      80
  \.
  
--- various COPY options: delimiters, oids, NULL string
+-- various COPY options: delimiters, oids, NULL string, encoding
  COPY x (b, c, d, e) from stdin with oids delimiter ',' null 'x';
  500000,x,45,80,90
  500001,x,\x,\\x,\\\x
@@ -83,7 +83,7 @@ COPY x from stdin WITH DELIMITER AS ';' NULL AS '';
  3000;;c;;
  \.
  
-COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X';
+COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X' ENCODING 'sql_ascii';
  4000:\X:C:\X:\X
  4001:1:empty::
  4002:2:null:\X:\X
@@ -127,7 +127,7 @@ INSERT INTO y VALUES ('', NULL);
  
  COPY y TO stdout WITH CSV;
  COPY y TO stdout WITH CSV QUOTE '''' DELIMITER '|';
-COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\';
+COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\' ENCODING 'sql_ascii';
  COPY y TO stdout WITH CSV FORCE QUOTE *;
  
  -- Repeat above tests with new 9.0 option syntax
author	Itagaki Takahiro <itagaki.takahiro@gmail.com>
	Mon, 21 Feb 2011 05:08:04 +0000 (14:08 +0900)
committer	Itagaki Takahiro <itagaki.takahiro@gmail.com>
	Mon, 21 Feb 2011 05:32:40 +0000 (14:32 +0900)
contrib/file_fdw/file_fdw.c		patch \| blob \| history
doc/src/sgml/file-fdw.sgml		patch \| blob \| history
doc/src/sgml/ref/copy.sgml		patch \| blob \| history
src/backend/commands/copy.c		patch \| blob \| history
src/backend/parser/gram.y		patch \| blob \| history
src/backend/utils/mb/mbutils.c		patch \| blob \| history
src/include/mb/pg_wchar.h		patch \| blob \| history
src/test/regress/expected/copy2.out		patch \| blob \| history
src/test/regress/sql/copy2.sql		patch \| blob \| history