]> granicus.if.org Git - postgresql/commitdiff
Add ENCODING option to COPY TO/FROM and file_fdw.
author: Itagaki Takahiro <itagaki.takahiro@gmail.com>
Mon, 21 Feb 2011 05:08:04 +0000 (14:08 +0900)
committer: Itagaki Takahiro <itagaki.takahiro@gmail.com>
Mon, 21 Feb 2011 05:32:40 +0000 (14:32 +0900)
File encodings can be specified separately from client encoding.
If not specified, client encoding is used for backward compatibility.

Cases when the encoding doesn't match client encoding are slower
than matched cases because we don't have conversion procs for other
encodings. Performance improvement is left as future work.

Original patch by Hitoshi Harada, and modified by me.

contrib/file_fdw/file_fdw.c
doc/src/sgml/file-fdw.sgml
doc/src/sgml/ref/copy.sgml
src/backend/commands/copy.c
src/backend/parser/gram.y
src/backend/utils/mb/mbutils.c
src/include/mb/pg_wchar.h
src/test/regress/expected/copy2.out
src/test/regress/sql/copy2.sql

index 265afb5d9bc2f26a7649d64083644a13de249d74..6a84a00e8d39558b07d7767d496fa5f6c334d24d 100644 (file)
@@ -55,6 +55,7 @@ static struct FileFdwOption valid_options[] = {
        { "quote",                      ForeignTableRelationId },
        { "escape",                     ForeignTableRelationId },
        { "null",                       ForeignTableRelationId },
+       { "encoding",           ForeignTableRelationId },
 
        /*
         * force_quote is not supported by file_fdw because it's for COPY TO.
index e2921667184901b10c5eee30a80a270f517cb3bf..003c415b43a3561a40d034bf68063b182022f271 100644 (file)
    </listitem>
   </varlistentry>
 
+  <varlistentry>
+   <term><literal>encoding</literal></term>
+
+   <listitem>
+    <para>
+     Specifies the file's encoding.
+     This is the same as <command>COPY</>'s <literal>ENCODING</literal> option.
+    </para>
+   </listitem>
+  </varlistentry>
+
  </variablelist>
 
  <para>
index 38424ad04b9d821f474005a1e2bf91704ce4dfe4..6429a4ef0d7fbbc60ae23685bb37f76851e3e748 100644 (file)
@@ -40,7 +40,8 @@ COPY { <replaceable class="parameter">table_name</replaceable> [ ( <replaceable
     QUOTE '<replaceable class="parameter">quote_character</replaceable>'
     ESCAPE '<replaceable class="parameter">escape_character</replaceable>'
     FORCE_QUOTE { ( <replaceable class="parameter">column</replaceable> [, ...] ) | * }
-    FORCE_NOT_NULL ( <replaceable class="parameter">column</replaceable> [, ...] )
+    FORCE_NOT_NULL ( <replaceable class="parameter">column</replaceable> [, ...] ) |
+    ENCODING '<replaceable class="parameter">encoding_name</replaceable>'
 </synopsis>
  </refsynopsisdiv>
 
@@ -282,6 +283,18 @@ COPY { <replaceable class="parameter">table_name</replaceable> [ ( <replaceable
     </listitem>
    </varlistentry>
 
+   <varlistentry>
+    <term><literal>ENCODING</></term>
+    <listitem>
+     <para>
+      Specifies that the file is encoded in the <replaceable
+      class="parameter">encoding_name</replaceable>.  If this option is
+      omitted, the current client encoding is used. See the Notes below
+      for more details.
+     </para>
+    </listitem>
+   </varlistentry>
+
   </variablelist>
  </refsect1>
 
@@ -377,8 +390,9 @@ COPY <replaceable class="parameter">count</replaceable>
    </para>
 
    <para>
-    Input data is interpreted according to the current client encoding,
-    and output data is encoded in the current client encoding, even
+    Input data is interpreted according to <literal>ENCODING</literal>
+    option or the current client encoding, and output data is encoded
+    in <literal>ENCODING</literal> or the current client encoding, even
     if the data does not pass through the client but is read from or
     written to a file directly by the server.
    </para>
index 294450ef660d6f752150915077653e49638a0801..cac11a6c64107a2f977d2d7e5dc0d29fce5e35c7 100644 (file)
@@ -95,8 +95,8 @@ typedef struct CopyStateData
                                                                 * dest == COPY_NEW_FE in COPY FROM */
        bool            fe_eof;                 /* true if detected end of copy data */
        EolType         eol_type;               /* EOL type of input */
-       int                     client_encoding;        /* remote side's character encoding */
-       bool            need_transcoding;               /* client encoding diff from server? */
+       int                     file_encoding;  /* file or remote side's character encoding */
+       bool            need_transcoding;               /* file encoding diff from server? */
        bool            encoding_embeds_ascii;  /* ASCII can be non-first byte? */
 
        /* parameters from the COPY command */
@@ -110,7 +110,7 @@ typedef struct CopyStateData
        bool            header_line;    /* CSV header line? */
        char       *null_print;         /* NULL marker string (server encoding!) */
        int                     null_print_len; /* length of same */
-       char       *null_print_client;          /* same converted to client encoding */
+       char       *null_print_client;          /* same converted to file encoding */
        char       *delim;                      /* column delimiter (must be 1 byte) */
        char       *quote;                      /* CSV quote char (must be 1 byte) */
        char       *escape;                     /* CSV escape char (must be 1 byte) */
@@ -845,6 +845,8 @@ ProcessCopyOptions(CopyState cstate,
        if (cstate == NULL)
                cstate = (CopyStateData *) palloc0(sizeof(CopyStateData));
 
+       cstate->file_encoding = -1;
+
        /* Extract options from the statement node tree */
        foreach(option, options)
        {
@@ -948,6 +950,19 @@ ProcessCopyOptions(CopyState cstate,
                                                 errmsg("argument to option \"%s\" must be a list of column names",
                                                                defel->defname)));
                }
+               else if (strcmp(defel->defname, "encoding") == 0)
+               {
+                       if (cstate->file_encoding >= 0)
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_SYNTAX_ERROR),
+                                                errmsg("conflicting or redundant options")));
+                       cstate->file_encoding = pg_char_to_encoding(defGetString(defel));
+                       if (cstate->file_encoding < 0)
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                                errmsg("argument to option \"%s\" must be a valid encoding name",
+                                                               defel->defname)));
+               }
                else
                        ereport(ERROR,
                                        (errcode(ERRCODE_SYNTAX_ERROR),
@@ -1278,17 +1293,20 @@ BeginCopy(bool is_from,
                }
        }
 
+       /* Use client encoding when ENCODING option is not specified. */
+       if (cstate->file_encoding < 0)
+               cstate->file_encoding = pg_get_client_encoding();
+
        /*
-        * Set up encoding conversion info.  Even if the client and server
-        * encodings are the same, we must apply pg_client_to_server() to validate
+        * Set up encoding conversion info.  Even if the file and server
+        * encodings are the same, we must apply pg_any_to_server() to validate
         * data in multibyte encodings.
         */
-       cstate->client_encoding = pg_get_client_encoding();
        cstate->need_transcoding =
-               (cstate->client_encoding != GetDatabaseEncoding() ||
+               (cstate->file_encoding != GetDatabaseEncoding() ||
                 pg_database_encoding_max_length() > 1);
        /* See Multibyte encoding comment above */
-       cstate->encoding_embeds_ascii = PG_ENCODING_IS_CLIENT_ONLY(cstate->client_encoding);
+       cstate->encoding_embeds_ascii = PG_ENCODING_IS_CLIENT_ONLY(cstate->file_encoding);
 
        cstate->copy_dest = COPY_FILE;          /* default */
 
@@ -1526,12 +1544,13 @@ CopyTo(CopyState cstate)
        else
        {
                /*
-                * For non-binary copy, we need to convert null_print to client
+                * For non-binary copy, we need to convert null_print to file
                 * encoding, because it will be sent directly with CopySendString.
                 */
                if (cstate->need_transcoding)
-                       cstate->null_print_client = pg_server_to_client(cstate->null_print,
-                                                                                                        cstate->null_print_len);
+                       cstate->null_print_client = pg_server_to_any(cstate->null_print,
+                                                                                                                cstate->null_print_len,
+                                                                                                                cstate->file_encoding);
 
                /* if a header has been requested send the line */
                if (cstate->header_line)
@@ -2608,8 +2627,9 @@ CopyReadLine(CopyState cstate)
        {
                char       *cvt;
 
-               cvt = pg_client_to_server(cstate->line_buf.data,
-                                                                 cstate->line_buf.len);
+               cvt = pg_any_to_server(cstate->line_buf.data,
+                                                          cstate->line_buf.len,
+                                                          cstate->file_encoding);
                if (cvt != cstate->line_buf.data)
                {
                        /* transfer converted data back to line_buf */
@@ -2854,7 +2874,7 @@ CopyReadLineText(CopyState cstate)
                        /* -----
                         * get next character
                         * Note: we do not change c so if it isn't \., we can fall
-                        * through and continue processing for client encoding.
+                        * through and continue processing for file encoding.
                         * -----
                         */
                        c2 = copy_raw_buf[raw_buf_ptr];
@@ -2968,7 +2988,7 @@ not_end_of_copy:
 
                        mblen_str[0] = c;
                        /* All our encodings only read the first byte to get the length */
-                       mblen = pg_encoding_mblen(cstate->client_encoding, mblen_str);
+                       mblen = pg_encoding_mblen(cstate->file_encoding, mblen_str);
                        IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(mblen - 1);
                        IF_NEED_REFILL_AND_EOF_BREAK(mblen - 1);
                        raw_buf_ptr += mblen - 1;
@@ -3467,7 +3487,7 @@ CopyAttributeOutText(CopyState cstate, char *string)
        char            delimc = cstate->delim[0];
 
        if (cstate->need_transcoding)
-               ptr = pg_server_to_client(string, strlen(string));
+               ptr = pg_server_to_any(string, strlen(string), cstate->file_encoding);
        else
                ptr = string;
 
@@ -3540,7 +3560,7 @@ CopyAttributeOutText(CopyState cstate, char *string)
                                start = ptr++;  /* we include char in next run */
                        }
                        else if (IS_HIGHBIT_SET(c))
-                               ptr += pg_encoding_mblen(cstate->client_encoding, ptr);
+                               ptr += pg_encoding_mblen(cstate->file_encoding, ptr);
                        else
                                ptr++;
                }
@@ -3627,7 +3647,7 @@ CopyAttributeOutCSV(CopyState cstate, char *string,
                use_quote = true;
 
        if (cstate->need_transcoding)
-               ptr = pg_server_to_client(string, strlen(string));
+               ptr = pg_server_to_any(string, strlen(string), cstate->file_encoding);
        else
                ptr = string;
 
@@ -3654,7 +3674,7 @@ CopyAttributeOutCSV(CopyState cstate, char *string,
                                        break;
                                }
                                if (IS_HIGHBIT_SET(c) && cstate->encoding_embeds_ascii)
-                                       tptr += pg_encoding_mblen(cstate->client_encoding, tptr);
+                                       tptr += pg_encoding_mblen(cstate->file_encoding, tptr);
                                else
                                        tptr++;
                        }
@@ -3678,7 +3698,7 @@ CopyAttributeOutCSV(CopyState cstate, char *string,
                                start = ptr;    /* we include char in next run */
                        }
                        if (IS_HIGHBIT_SET(c) && cstate->encoding_embeds_ascii)
-                               ptr += pg_encoding_mblen(cstate->client_encoding, ptr);
+                               ptr += pg_encoding_mblen(cstate->file_encoding, ptr);
                        else
                                ptr++;
                }
index c6811a11bd141b44eb84716fa0e2108d9bc30b3e..cbfacec4495df3726b8f0623ec69507fcfebd82d 100644 (file)
@@ -2236,6 +2236,10 @@ copy_opt_item:
                                {
                                        $$ = makeDefElem("force_not_null", (Node *)$4);
                                }
+                       | ENCODING Sconst
+                               {
+                                       $$ = makeDefElem("encoding", (Node *)makeString($2));
+                               }
                ;
 
 /* The following exist for backward compatibility with very old versions */
index 5ee74f747d06e010ad109aa4a77ab894486a3519..b8a2728e4f5d269ef5fa7c622a6af18564b3e3a2 100644 (file)
@@ -496,6 +496,17 @@ pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
  */
 char *
 pg_client_to_server(const char *s, int len)
+{
+       Assert(ClientEncoding);
+
+       return pg_any_to_server(s, len, ClientEncoding->encoding);
+}
+
+/*
+ * convert any encoding to server encoding.
+ */
+char *
+pg_any_to_server(const char *s, int len, int encoding)
 {
        Assert(DatabaseEncoding);
        Assert(ClientEncoding);
@@ -503,8 +514,8 @@ pg_client_to_server(const char *s, int len)
        if (len <= 0)
                return (char *) s;
 
-       if (ClientEncoding->encoding == DatabaseEncoding->encoding ||
-               ClientEncoding->encoding == PG_SQL_ASCII)
+       if (encoding == DatabaseEncoding->encoding ||
+               encoding == PG_SQL_ASCII)
        {
                /*
                 * No conversion is needed, but we must still validate the data.
@@ -524,8 +535,8 @@ pg_client_to_server(const char *s, int len)
                 * to the parser but we have no way to convert it.      We compromise by
                 * rejecting the data if it contains any non-ASCII characters.
                 */
-               if (PG_VALID_BE_ENCODING(ClientEncoding->encoding))
-                       (void) pg_verify_mbstr(ClientEncoding->encoding, s, len, false);
+               if (PG_VALID_BE_ENCODING(encoding))
+                       (void) pg_verify_mbstr(encoding, s, len, false);
                else
                {
                        int                     i;
@@ -543,7 +554,11 @@ pg_client_to_server(const char *s, int len)
                return (char *) s;
        }
 
-       return perform_default_encoding_conversion(s, len, true);
+       if (ClientEncoding->encoding == encoding)
+               return perform_default_encoding_conversion(s, len, true);
+       else
+               return (char *) pg_do_encoding_conversion(
+                       (unsigned char *) s, len, encoding, DatabaseEncoding->encoding);
 }
 
 /*
@@ -551,6 +566,17 @@ pg_client_to_server(const char *s, int len)
  */
 char *
 pg_server_to_client(const char *s, int len)
+{
+       Assert(ClientEncoding);
+
+       return pg_any_to_server(s, len, ClientEncoding->encoding);
+}
+
+/*
+ * convert server encoding to any encoding.
+ */
+char *
+pg_server_to_any(const char *s, int len, int encoding)
 {
        Assert(DatabaseEncoding);
        Assert(ClientEncoding);
@@ -558,12 +584,16 @@ pg_server_to_client(const char *s, int len)
        if (len <= 0)
                return (char *) s;
 
-       if (ClientEncoding->encoding == DatabaseEncoding->encoding ||
-               ClientEncoding->encoding == PG_SQL_ASCII ||
+       if (encoding == DatabaseEncoding->encoding ||
+               encoding == PG_SQL_ASCII ||
                DatabaseEncoding->encoding == PG_SQL_ASCII)
                return (char *) s;              /* assume data is valid */
 
-       return perform_default_encoding_conversion(s, len, false);
+       if (ClientEncoding->encoding == encoding)
+               return perform_default_encoding_conversion(s, len, false);
+       else
+               return (char *) pg_do_encoding_conversion(
+                       (unsigned char *) s, len, DatabaseEncoding->encoding, encoding);
 }
 
 /*
index 565b53b3e6ecaa89d362bd3f28b760c97115ce2a..85a7b2f87dd257ec5bd4e553299017b630811a55 100644 (file)
@@ -420,6 +420,8 @@ extern unsigned char *pg_do_encoding_conversion(unsigned char *src, int len,
 
 extern char *pg_client_to_server(const char *s, int len);
 extern char *pg_server_to_client(const char *s, int len);
+extern char *pg_any_to_server(const char *s, int len, int encoding);
+extern char *pg_server_to_any(const char *s, int len, int encoding);
 
 extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc);
 extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc);
index 15cbe02977008dcf3d7f1f482bdd65b30793f551..8e2bc0c25049886e5b1761ef6cc0a0c87886673e 100644 (file)
@@ -46,10 +46,10 @@ CONTEXT:  COPY x, line 1: "2001     231     \N      \N"
 COPY x from stdin;
 ERROR:  extra data after last expected column
 CONTEXT:  COPY x, line 1: "2002        232     40      50      60      70      80"
--- various COPY options: delimiters, oids, NULL string
+-- various COPY options: delimiters, oids, NULL string, encoding
 COPY x (b, c, d, e) from stdin with oids delimiter ',' null 'x';
 COPY x from stdin WITH DELIMITER AS ';' NULL AS '';
-COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X';
+COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X' ENCODING 'sql_ascii';
 -- check results of copy in
 SELECT * FROM x;
    a   | b  |     c      |   d    |          e           
@@ -187,7 +187,7 @@ COPY y TO stdout WITH CSV QUOTE '''' DELIMITER '|';
 Jackson, Sam|\h
 It is "perfect".|      
 ''|
-COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\';
+COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\' ENCODING 'sql_ascii';
 "Jackson, Sam","\\h"
 "It is \"perfect\"."," "
 "",
index c2e8b037e74be329c0829e4c6c25c4a4106e23c7..6322c8fba43dbf77e868d3ef9ef283232d659640 100644 (file)
@@ -72,7 +72,7 @@ COPY x from stdin;
 2002   232     40      50      60      70      80
 \.
 
--- various COPY options: delimiters, oids, NULL string
+-- various COPY options: delimiters, oids, NULL string, encoding
 COPY x (b, c, d, e) from stdin with oids delimiter ',' null 'x';
 500000,x,45,80,90
 500001,x,\x,\\x,\\\x
@@ -83,7 +83,7 @@ COPY x from stdin WITH DELIMITER AS ';' NULL AS '';
 3000;;c;;
 \.
 
-COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X';
+COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X' ENCODING 'sql_ascii';
 4000:\X:C:\X:\X
 4001:1:empty::
 4002:2:null:\X:\X
@@ -127,7 +127,7 @@ INSERT INTO y VALUES ('', NULL);
 
 COPY y TO stdout WITH CSV;
 COPY y TO stdout WITH CSV QUOTE '''' DELIMITER '|';
-COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\';
+COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\' ENCODING 'sql_ascii';
 COPY y TO stdout WITH CSV FORCE QUOTE *;
 
 -- Repeat above tests with new 9.0 option syntax