File encodings can be specified separately from client encoding.
If not specified, client encoding is used for backward compatibility.
Cases when the encoding doesn't match client encoding are slower
than matched cases because we don't have conversion procs for other
encodings. Performance improvement is left as future work.
Original patch by Hitoshi Harada, and modified by me.
{ "quote", ForeignTableRelationId },
{ "escape", ForeignTableRelationId },
{ "null", ForeignTableRelationId },
+ { "encoding", ForeignTableRelationId },
/*
* force_quote is not supported by file_fdw because it's for COPY TO.
</listitem>
</varlistentry>
+ <varlistentry>
+ <term><literal>encoding</literal></term>
+
+ <listitem>
+ <para>
+ Specifies the file's encoding,
+ the same as <command>COPY</>'s <literal>ENCODING</literal> option.
+ </para>
+ </listitem>
+ </varlistentry>
+
</variablelist>
<para>
QUOTE '<replaceable class="parameter">quote_character</replaceable>'
ESCAPE '<replaceable class="parameter">escape_character</replaceable>'
FORCE_QUOTE { ( <replaceable class="parameter">column</replaceable> [, ...] ) | * }
- FORCE_NOT_NULL ( <replaceable class="parameter">column</replaceable> [, ...] )
+ FORCE_NOT_NULL ( <replaceable class="parameter">column</replaceable> [, ...] ) |
+ ENCODING '<replaceable class="parameter">encoding_name</replaceable>'
</synopsis>
</refsynopsisdiv>
</listitem>
</varlistentry>
+ <varlistentry>
+ <term><literal>ENCODING</></term>
+ <listitem>
+ <para>
+ Specifies that the file is encoded in the <replaceable
+ class="parameter">encoding_name</replaceable>. If this option is
+ omitted, the current client encoding is used. See the Notes below
+ for more details.
+ </para>
+ </listitem>
+ </varlistentry>
+
</variablelist>
</refsect1>
</para>
<para>
- Input data is interpreted according to the current client encoding,
- and output data is encoded in the current client encoding, even
+ Input data is interpreted according to the <literal>ENCODING</literal>
+ option or the current client encoding, and output data is encoded
+ in the <literal>ENCODING</literal> or the current client encoding, even
if the data does not pass through the client but is read from or
written to a file directly by the server.
</para>
* dest == COPY_NEW_FE in COPY FROM */
bool fe_eof; /* true if detected end of copy data */
EolType eol_type; /* EOL type of input */
- int client_encoding; /* remote side's character encoding */
- bool need_transcoding; /* client encoding diff from server? */
+ int file_encoding; /* file or remote side's character encoding */
+ bool need_transcoding; /* file encoding diff from server? */
bool encoding_embeds_ascii; /* ASCII can be non-first byte? */
/* parameters from the COPY command */
bool header_line; /* CSV header line? */
char *null_print; /* NULL marker string (server encoding!) */
int null_print_len; /* length of same */
- char *null_print_client; /* same converted to client encoding */
+ char *null_print_client; /* same converted to file encoding */
char *delim; /* column delimiter (must be 1 byte) */
char *quote; /* CSV quote char (must be 1 byte) */
char *escape; /* CSV escape char (must be 1 byte) */
if (cstate == NULL)
cstate = (CopyStateData *) palloc0(sizeof(CopyStateData));
+ cstate->file_encoding = -1;
+
/* Extract options from the statement node tree */
foreach(option, options)
{
errmsg("argument to option \"%s\" must be a list of column names",
defel->defname)));
}
+ else if (strcmp(defel->defname, "encoding") == 0)
+ {
+ if (cstate->file_encoding >= 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("conflicting or redundant options")));
+ cstate->file_encoding = pg_char_to_encoding(defGetString(defel));
+ if (cstate->file_encoding < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("argument to option \"%s\" must be a valid encoding name",
+ defel->defname)));
+ }
else
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
}
}
+ /* Use client encoding when ENCODING option is not specified. */
+ if (cstate->file_encoding < 0)
+ cstate->file_encoding = pg_get_client_encoding();
+
/*
- * Set up encoding conversion info. Even if the client and server
- * encodings are the same, we must apply pg_client_to_server() to validate
+ * Set up encoding conversion info. Even if the file and server
+ * encodings are the same, we must apply pg_any_to_server() to validate
* data in multibyte encodings.
*/
- cstate->client_encoding = pg_get_client_encoding();
cstate->need_transcoding =
- (cstate->client_encoding != GetDatabaseEncoding() ||
+ (cstate->file_encoding != GetDatabaseEncoding() ||
pg_database_encoding_max_length() > 1);
/* See Multibyte encoding comment above */
- cstate->encoding_embeds_ascii = PG_ENCODING_IS_CLIENT_ONLY(cstate->client_encoding);
+ cstate->encoding_embeds_ascii = PG_ENCODING_IS_CLIENT_ONLY(cstate->file_encoding);
cstate->copy_dest = COPY_FILE; /* default */
else
{
/*
- * For non-binary copy, we need to convert null_print to client
+ * For non-binary copy, we need to convert null_print to file
* encoding, because it will be sent directly with CopySendString.
*/
if (cstate->need_transcoding)
- cstate->null_print_client = pg_server_to_client(cstate->null_print,
- cstate->null_print_len);
+ cstate->null_print_client = pg_server_to_any(cstate->null_print,
+ cstate->null_print_len,
+ cstate->file_encoding);
/* if a header has been requested send the line */
if (cstate->header_line)
{
char *cvt;
- cvt = pg_client_to_server(cstate->line_buf.data,
- cstate->line_buf.len);
+ cvt = pg_any_to_server(cstate->line_buf.data,
+ cstate->line_buf.len,
+ cstate->file_encoding);
if (cvt != cstate->line_buf.data)
{
/* transfer converted data back to line_buf */
/* -----
* get next character
* Note: we do not change c so if it isn't \., we can fall
- * through and continue processing for client encoding.
+ * through and continue processing for file encoding.
* -----
*/
c2 = copy_raw_buf[raw_buf_ptr];
mblen_str[0] = c;
/* All our encodings only read the first byte to get the length */
- mblen = pg_encoding_mblen(cstate->client_encoding, mblen_str);
+ mblen = pg_encoding_mblen(cstate->file_encoding, mblen_str);
IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(mblen - 1);
IF_NEED_REFILL_AND_EOF_BREAK(mblen - 1);
raw_buf_ptr += mblen - 1;
char delimc = cstate->delim[0];
if (cstate->need_transcoding)
- ptr = pg_server_to_client(string, strlen(string));
+ ptr = pg_server_to_any(string, strlen(string), cstate->file_encoding);
else
ptr = string;
start = ptr++; /* we include char in next run */
}
else if (IS_HIGHBIT_SET(c))
- ptr += pg_encoding_mblen(cstate->client_encoding, ptr);
+ ptr += pg_encoding_mblen(cstate->file_encoding, ptr);
else
ptr++;
}
use_quote = true;
if (cstate->need_transcoding)
- ptr = pg_server_to_client(string, strlen(string));
+ ptr = pg_server_to_any(string, strlen(string), cstate->file_encoding);
else
ptr = string;
break;
}
if (IS_HIGHBIT_SET(c) && cstate->encoding_embeds_ascii)
- tptr += pg_encoding_mblen(cstate->client_encoding, tptr);
+ tptr += pg_encoding_mblen(cstate->file_encoding, tptr);
else
tptr++;
}
start = ptr; /* we include char in next run */
}
if (IS_HIGHBIT_SET(c) && cstate->encoding_embeds_ascii)
- ptr += pg_encoding_mblen(cstate->client_encoding, ptr);
+ ptr += pg_encoding_mblen(cstate->file_encoding, ptr);
else
ptr++;
}
{
$$ = makeDefElem("force_not_null", (Node *)$4);
}
+ | ENCODING Sconst
+ {
+ $$ = makeDefElem("encoding", (Node *)makeString($2));
+ }
;
/* The following exist for backward compatibility with very old versions */
*/
char *
pg_client_to_server(const char *s, int len)
+{
+ /* The client encoding must be set up before it can be dereferenced below. */
+ Assert(ClientEncoding);
+
+ /* Delegate to pg_any_to_server, using the client encoding as the source. */
+ return pg_any_to_server(s, len, ClientEncoding->encoding);
+}
+
+/*
+ * convert any encoding to server encoding.
+ */
+char *
+pg_any_to_server(const char *s, int len, int encoding)
{
Assert(DatabaseEncoding);
Assert(ClientEncoding);
if (len <= 0)
return (char *) s;
- if (ClientEncoding->encoding == DatabaseEncoding->encoding ||
- ClientEncoding->encoding == PG_SQL_ASCII)
+ if (encoding == DatabaseEncoding->encoding ||
+ encoding == PG_SQL_ASCII)
{
/*
* No conversion is needed, but we must still validate the data.
* to the parser but we have no way to convert it. We compromise by
* rejecting the data if it contains any non-ASCII characters.
*/
- if (PG_VALID_BE_ENCODING(ClientEncoding->encoding))
- (void) pg_verify_mbstr(ClientEncoding->encoding, s, len, false);
+ if (PG_VALID_BE_ENCODING(encoding))
+ (void) pg_verify_mbstr(encoding, s, len, false);
else
{
int i;
return (char *) s;
}
- return perform_default_encoding_conversion(s, len, true);
+ if (ClientEncoding->encoding == encoding)
+ return perform_default_encoding_conversion(s, len, true);
+ else
+ return (char *) pg_do_encoding_conversion(
+ (unsigned char *) s, len, encoding, DatabaseEncoding->encoding);
}
/*
*/
char *
pg_server_to_client(const char *s, int len)
+{
+ /* The client encoding must be set up before it can be dereferenced below. */
+ Assert(ClientEncoding);
+
+ /*
+  * Server -> client direction: delegate to pg_server_to_any with the
+  * client encoding as the target.  (Calling pg_any_to_server here would
+  * treat server-encoded data as client-encoded input and convert it the
+  * wrong way.)
+  */
+ return pg_server_to_any(s, len, ClientEncoding->encoding);
+}
+
+/*
+ * convert server encoding to any encoding.
+ */
+char *
+pg_server_to_any(const char *s, int len, int encoding)
{
Assert(DatabaseEncoding);
Assert(ClientEncoding);
if (len <= 0)
return (char *) s;
- if (ClientEncoding->encoding == DatabaseEncoding->encoding ||
- ClientEncoding->encoding == PG_SQL_ASCII ||
+ if (encoding == DatabaseEncoding->encoding ||
+ encoding == PG_SQL_ASCII ||
DatabaseEncoding->encoding == PG_SQL_ASCII)
return (char *) s; /* assume data is valid */
- return perform_default_encoding_conversion(s, len, false);
+ if (ClientEncoding->encoding == encoding)
+ return perform_default_encoding_conversion(s, len, false);
+ else
+ return (char *) pg_do_encoding_conversion(
+ (unsigned char *) s, len, DatabaseEncoding->encoding, encoding);
}
/*
extern char *pg_client_to_server(const char *s, int len);
extern char *pg_server_to_client(const char *s, int len);
+extern char *pg_any_to_server(const char *s, int len, int encoding);
+extern char *pg_server_to_any(const char *s, int len, int encoding);
extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc);
extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc);
COPY x from stdin;
ERROR: extra data after last expected column
CONTEXT: COPY x, line 1: "2002 232 40 50 60 70 80"
--- various COPY options: delimiters, oids, NULL string
+-- various COPY options: delimiters, oids, NULL string, encoding
COPY x (b, c, d, e) from stdin with oids delimiter ',' null 'x';
COPY x from stdin WITH DELIMITER AS ';' NULL AS '';
-COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X';
+COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X' ENCODING 'sql_ascii';
-- check results of copy in
SELECT * FROM x;
a | b | c | d | e
Jackson, Sam|\h
It is "perfect".|
''|
-COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\';
+COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\' ENCODING 'sql_ascii';
"Jackson, Sam","\\h"
"It is \"perfect\"."," "
"",
2002 232 40 50 60 70 80
\.
--- various COPY options: delimiters, oids, NULL string
+-- various COPY options: delimiters, oids, NULL string, encoding
COPY x (b, c, d, e) from stdin with oids delimiter ',' null 'x';
500000,x,45,80,90
500001,x,\x,\\x,\\\x
3000;;c;;
\.
-COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X';
+COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X' ENCODING 'sql_ascii';
4000:\X:C:\X:\X
4001:1:empty::
4002:2:null:\X:\X
COPY y TO stdout WITH CSV;
COPY y TO stdout WITH CSV QUOTE '''' DELIMITER '|';
-COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\';
+COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\' ENCODING 'sql_ascii';
COPY y TO stdout WITH CSV FORCE QUOTE *;
-- Repeat above tests with new 9.0 option syntax