*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.255 2005/11/22 18:17:08 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.256 2005/12/27 18:10:48 momjian Exp $
*
*-------------------------------------------------------------------------
*/
/*
* This struct contains all the state variables used throughout a COPY
- * operation. For simplicity, we use the same struct for all variants
- * of COPY, even though some fields are used in only some cases.
+ * operation. For simplicity, we use the same struct for all variants of COPY,
+ * even though some fields are used in only some cases.
*
- * A word about encoding considerations: encodings that are only supported on
- * the client side are those where multibyte characters may have second or
- * later bytes with the high bit not set. When scanning data in such an
- * encoding to look for a match to a single-byte (ie ASCII) character,
- * we must use the full pg_encoding_mblen() machinery to skip over
- * multibyte characters, else we might find a false match to a trailing
- * byte. In supported server encodings, there is no possibility of
- * a false match, and it's faster to make useless comparisons to trailing
- * bytes than it is to invoke pg_encoding_mblen() to skip over them.
- * client_only_encoding is TRUE when we have to do it the hard way.
+ * Multi-byte encodings: all supported client-side encodings encode multi-byte
+ * characters by having the first byte's high bit set. Subsequent bytes of the
+ * character can have the high bit not set. When scanning data in such an
+ * encoding to look for a match to a single-byte (ie ASCII) character, we must
+ * use the full pg_encoding_mblen() machinery to skip over multibyte
+ * characters, else we might find a false match to a trailing byte. In
+ * supported server encodings, there is no possibility of a false match, and
+ * it's faster to make useless comparisons to trailing bytes than it is to
+ * invoke pg_encoding_mblen() to skip over them. encoding_embeds_ascii is TRUE
+ * when we have to do it the hard way.
*/
typedef struct CopyStateData
{
EolType eol_type; /* EOL type of input */
int client_encoding; /* remote side's character encoding */
bool need_transcoding; /* client encoding diff from server? */
- bool client_only_encoding; /* encoding not valid on server? */
+ bool encoding_embeds_ascii; /* ASCII can be non-first byte? */
/* parameters from the COPY command */
Relation rel; /* relation to copy to or from */
typedef CopyStateData *CopyState;
+/*
+ * These macros centralize code used to process line_buf and raw_buf buffers.
+ * They are macros because they often do continue/break control and to avoid
+ * function call overhead in tight COPY loops.
+ *
+ * We must use "if (1)" because "do {} while(0)" overrides the continue/break
+ * processing. See http://www.cit.gu.edu.au/~anthony/info/C/C.macros.
+ */
+
+/*
+ * This keeps the character read at the top of the loop in the buffer
+ * even if there is more than one read-ahead.
+ */
+#define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
+if (1) \
+{ \
+ if (raw_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
+ { \
+ raw_buf_ptr = prev_raw_ptr; /* undo fetch */ \
+ need_data = true; \
+ continue; \
+ } \
+} else
+
+
+/* This consumes the remainder of the buffer and breaks */
+#define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
+if (1) \
+{ \
+ if (raw_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
+ { \
+ if (extralen) \
+ raw_buf_ptr = copy_buf_len; /* consume the partial character */ \
+ /* backslash just before EOF, treat as data char */ \
+ result = true; \
+ break; \
+ } \
+} else
+
+
+/*
+ * Transfer any approved data to line_buf; must do this to be sure
+ * there is some room in raw_buf.
+ */
+#define REFILL_LINEBUF \
+if (1) \
+{ \
+ if (raw_buf_ptr > cstate->raw_buf_index) \
+ { \
+ appendBinaryStringInfo(&cstate->line_buf, \
+ cstate->raw_buf + cstate->raw_buf_index, \
+ raw_buf_ptr - cstate->raw_buf_index); \
+ cstate->raw_buf_index = raw_buf_ptr; \
+ } \
+} else
+
+/* Undo any read-ahead and jump out of the block. */
+#define NO_END_OF_COPY_GOTO \
+if (1) \
+{ \
+ raw_buf_ptr = prev_raw_ptr + 1; \
+ goto not_end_of_copy; \
+} else
+
+
static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0";
static void CopyFrom(CopyState cstate);
static bool CopyReadLine(CopyState cstate);
static bool CopyReadLineText(CopyState cstate);
-static bool CopyReadLineCSV(CopyState cstate);
static int CopyReadAttributesText(CopyState cstate, int maxfields,
char **fieldvals);
static int CopyReadAttributesCSV(CopyState cstate, int maxfields,
/* Set up encoding conversion info */
cstate->client_encoding = pg_get_client_encoding();
cstate->need_transcoding = (cstate->client_encoding != GetDatabaseEncoding());
- cstate->client_only_encoding = PG_ENCODING_IS_CLIENT_ONLY(cstate->client_encoding);
+ /* See Multibyte encoding comment above */
+ cstate->encoding_embeds_ascii = PG_ENCODING_IS_CLIENT_ONLY(cstate->client_encoding);
cstate->copy_dest = COPY_FILE; /* default */
cstate->line_buf_converted = false;
/* Parse data and transfer into line_buf */
- if (cstate->csv_mode)
- result = CopyReadLineCSV(cstate);
- else
- result = CopyReadLineText(cstate);
+ result = CopyReadLineText(cstate);
if (result)
{
}
/*
- * CopyReadLineText - inner loop of CopyReadLine for non-CSV mode
- *
- * If you need to change this, better look at CopyReadLineCSV too
+ * CopyReadLineText - inner loop of CopyReadLine for text mode
*/
static bool
CopyReadLineText(CopyState cstate)
{
- bool result;
char *copy_raw_buf;
int raw_buf_ptr;
int copy_buf_len;
- bool need_data;
- bool hit_eof;
- char s[2];
+ bool need_data = false;
+ bool hit_eof = false;
+ bool result = false;
+ char mblen_str[2];
+ /* CSV variables */
+ bool first_char_in_line = true;
+ bool in_quote = false,
+ last_was_esc = false;
+ char quotec = '\0';
+ char escapec = '\0';
- s[1] = 0;
+ if (cstate->csv_mode)
+ {
+ quotec = cstate->quote[0];
+ escapec = cstate->escape[0];
+ /* ignore special escape processing if it's the same as quotec */
+ if (quotec == escapec)
+ escapec = '\0';
+ }
- /* set default status */
- result = false;
+ mblen_str[1] = '\0';
/*
* The objective of this loop is to transfer the entire next input line
* into line_buf. Hence, we only care for detecting newlines (\r and/or
* \n) and the end-of-copy marker (\.).
*
- * For backwards compatibility we allow backslashes to escape newline
- * characters. Backslashes other than the end marker get put into the
- * line_buf, since CopyReadAttributesText does its own escape processing.
+ * In CSV mode, \r and \n inside a quoted field are just part of the data
+ * value and are put in line_buf. We keep just enough state to know if we
+ * are currently in a quoted field or not.
*
- * These four characters, and only these four, are assumed the same in
- * frontend and backend encodings.
+ * These four characters, and the CSV escape and quote characters, are
+ * assumed the same in frontend and backend encodings.
*
- * For speed, we try to move data to line_buf in chunks rather than one
- * character at a time. raw_buf_ptr points to the next character to
- * examine; any characters from raw_buf_index to raw_buf_ptr have been
- * determined to be part of the line, but not yet transferred to line_buf.
+ * For speed, we try to move data from raw_buf to line_buf in chunks
+ * rather than one character at a time. raw_buf_ptr points to the next
+ * character to examine; any characters from raw_buf_index to raw_buf_ptr
+ * have been determined to be part of the line, but not yet transferred
+ * to line_buf.
*
* For a little extra speed within the loop, we copy raw_buf and
* raw_buf_len into local variables.
copy_raw_buf = cstate->raw_buf;
raw_buf_ptr = cstate->raw_buf_index;
copy_buf_len = cstate->raw_buf_len;
- need_data = false; /* flag to force reading more data */
- hit_eof = false; /* flag indicating no more data available */
for (;;)
{
int prev_raw_ptr;
char c;
- /* Load more data if needed */
+ /*
+ * Load more data if needed. Ideally we would just force four bytes
+ * of read-ahead and avoid the many calls to
+ * IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(), but the COPY_OLD_FE
+ * protocol does not allow us to read too far ahead or we might
+ * read into the next data, so we read-ahead only as far we know
+ * we can. One optimization would be to read-ahead four byte here
+ * if cstate->copy_dest != COPY_OLD_FE, but it hardly seems worth it,
+ * considering the size of the buffer.
+ */
if (raw_buf_ptr >= copy_buf_len || need_data)
{
- /*
- * Transfer any approved data to line_buf; must do this to be sure
- * there is some room in raw_buf.
- */
- if (raw_buf_ptr > cstate->raw_buf_index)
- {
- appendBinaryStringInfo(&cstate->line_buf,
- cstate->raw_buf + cstate->raw_buf_index,
- raw_buf_ptr - cstate->raw_buf_index);
- cstate->raw_buf_index = raw_buf_ptr;
- }
+ REFILL_LINEBUF;
/*
* Try to read some more data. This will certainly reset
prev_raw_ptr = raw_buf_ptr;
c = copy_raw_buf[raw_buf_ptr++];
- if (c == '\r')
- {
- /* Check for \r\n on first line, _and_ handle \r\n. */
- if (cstate->eol_type == EOL_UNKNOWN ||
- cstate->eol_type == EOL_CRNL)
- {
- /*
- * If need more data, go back to loop top to load it.
- *
- * Note that if we are at EOF, c will wind up as '\0' because
- * of the guaranteed pad of raw_buf.
- */
- if (raw_buf_ptr >= copy_buf_len && !hit_eof)
- {
- raw_buf_ptr = prev_raw_ptr; /* undo fetch */
- need_data = true;
- continue;
- }
- c = copy_raw_buf[raw_buf_ptr];
-
- if (c == '\n')
- {
- raw_buf_ptr++; /* eat newline */
- cstate->eol_type = EOL_CRNL; /* in case not set yet */
- }
- else
- {
- /* found \r, but no \n */
- if (cstate->eol_type == EOL_CRNL)
- ereport(ERROR,
- (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
- errmsg("literal carriage return found in data"),
- errhint("Use \"\\r\" to represent carriage return.")));
-
- /*
- * if we got here, it is the first line and we didn't find
- * \n, so don't consume the peeked character
- */
- cstate->eol_type = EOL_CR;
- }
- }
- else if (cstate->eol_type == EOL_NL)
- ereport(ERROR,
- (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
- errmsg("literal carriage return found in data"),
- errhint("Use \"\\r\" to represent carriage return.")));
- /* If reach here, we have found the line terminator */
- break;
- }
-
- if (c == '\n')
- {
- if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL)
- ereport(ERROR,
- (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
- errmsg("literal newline found in data"),
- errhint("Use \"\\n\" to represent newline.")));
- cstate->eol_type = EOL_NL; /* in case not set yet */
- /* If reach here, we have found the line terminator */
- break;
- }
-
- if (c == '\\')
+ if (cstate->csv_mode)
{
/*
- * If need more data, go back to loop top to load it.
+ * If character is '\\' or '\r', we may need to look ahead below.
+ * Force fetch of the next character if we don't already have it. We
+ * need to do this before changing CSV state, in case one of these
+ * characters is also the quote or escape character.
+ *
+ * Note: old-protocol does not like forced prefetch, but it's OK here
+ * since we cannot validly be at EOF.
*/
- if (raw_buf_ptr >= copy_buf_len)
+ if (c == '\\' || c == '\r')
{
- if (hit_eof)
- {
- /* backslash just before EOF, treat as data char */
- result = true;
- break;
- }
- raw_buf_ptr = prev_raw_ptr; /* undo fetch */
- need_data = true;
- continue;
+ IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
}
/*
- * In non-CSV mode, backslash quotes the following character even
- * if it's a newline, so we always advance to next character
+ * Dealing with quotes and escapes here is mildly tricky. If the quote
+ * char is also the escape char, there's no problem - we just use the
+ * char as a toggle. If they are different, we need to ensure that we
+ * only take account of an escape inside a quoted field and
+ * immediately preceding a quote char, and not the second in a
+ * escape-escape sequence.
*/
- c = copy_raw_buf[raw_buf_ptr++];
-
- if (c == '.')
- {
- if (cstate->eol_type == EOL_CRNL)
- {
- if (raw_buf_ptr >= copy_buf_len && !hit_eof)
- {
- raw_buf_ptr = prev_raw_ptr; /* undo fetch */
- need_data = true;
- continue;
- }
- /* if hit_eof, c will become '\0' */
- c = copy_raw_buf[raw_buf_ptr++];
- if (c == '\n')
- ereport(ERROR,
- (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
- errmsg("end-of-copy marker does not match previous newline style")));
- if (c != '\r')
- ereport(ERROR,
- (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
- errmsg("end-of-copy marker corrupt")));
- }
- if (raw_buf_ptr >= copy_buf_len && !hit_eof)
- {
- raw_buf_ptr = prev_raw_ptr; /* undo fetch */
- need_data = true;
- continue;
- }
- /* if hit_eof, c will become '\0' */
- c = copy_raw_buf[raw_buf_ptr++];
- if (c != '\r' && c != '\n')
- ereport(ERROR,
- (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
- errmsg("end-of-copy marker corrupt")));
- if ((cstate->eol_type == EOL_NL && c != '\n') ||
- (cstate->eol_type == EOL_CRNL && c != '\n') ||
- (cstate->eol_type == EOL_CR && c != '\r'))
- ereport(ERROR,
- (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
- errmsg("end-of-copy marker does not match previous newline style")));
-
- /*
- * Transfer only the data before the \. into line_buf, then
- * discard the data and the \. sequence.
- */
- if (prev_raw_ptr > cstate->raw_buf_index)
- appendBinaryStringInfo(&cstate->line_buf,
- cstate->raw_buf + cstate->raw_buf_index,
- prev_raw_ptr - cstate->raw_buf_index);
- cstate->raw_buf_index = raw_buf_ptr;
- result = true; /* report EOF */
- break;
- }
- }
-
- /*
- * Do we need to be careful about trailing bytes of multibyte
- * characters? (See note above about client_only_encoding)
- *
- * We assume here that pg_encoding_mblen only looks at the first byte
- * of the character!
- */
- if (cstate->client_only_encoding)
- {
- int mblen;
-
- s[0] = c;
- mblen = pg_encoding_mblen(cstate->client_encoding, s);
- if (raw_buf_ptr + (mblen - 1) > copy_buf_len)
- {
- if (hit_eof)
- {
- /* consume the partial character (conversion will fail) */
- raw_buf_ptr = copy_buf_len;
- result = true;
- break;
- }
- raw_buf_ptr = prev_raw_ptr; /* undo fetch */
- need_data = true;
- continue;
- }
- raw_buf_ptr += mblen - 1;
- }
- } /* end of outer loop */
-
- /*
- * Transfer any still-uncopied data to line_buf.
- */
- if (raw_buf_ptr > cstate->raw_buf_index)
- {
- appendBinaryStringInfo(&cstate->line_buf,
- cstate->raw_buf + cstate->raw_buf_index,
- raw_buf_ptr - cstate->raw_buf_index);
- cstate->raw_buf_index = raw_buf_ptr;
- }
-
- return result;
-}
-
-/*
- * CopyReadLineCSV - inner loop of CopyReadLine for CSV mode
- *
- * If you need to change this, better look at CopyReadLineText too
- */
-static bool
-CopyReadLineCSV(CopyState cstate)
-{
- bool result;
- char *copy_raw_buf;
- int raw_buf_ptr;
- int copy_buf_len;
- bool need_data;
- bool hit_eof;
- char s[2];
- bool in_quote = false,
+ if (in_quote && c == escapec)
+ last_was_esc = !last_was_esc;
+ if (c == quotec && !last_was_esc)
+ in_quote = !in_quote;
+ if (c != escapec)
last_was_esc = false;
- char quotec = cstate->quote[0];
- char escapec = cstate->escape[0];
-
- /* ignore special escape processing if it's the same as quotec */
- if (quotec == escapec)
- escapec = '\0';
-
- s[1] = 0;
-
- /* set default status */
- result = false;
-
- /*
- * The objective of this loop is to transfer the entire next input line
- * into line_buf. Hence, we only care for detecting newlines (\r and/or
- * \n) and the end-of-copy marker (\.).
- *
- * In CSV mode, \r and \n inside a quoted field are just part of the data
- * value and are put in line_buf. We keep just enough state to know if we
- * are currently in a quoted field or not.
- *
- * These four characters, and the CSV escape and quote characters, are
- * assumed the same in frontend and backend encodings.
- *
- * For speed, we try to move data to line_buf in chunks rather than one
- * character at a time. raw_buf_ptr points to the next character to
- * examine; any characters from raw_buf_index to raw_buf_ptr have been
- * determined to be part of the line, but not yet transferred to line_buf.
- *
- * For a little extra speed within the loop, we copy raw_buf and
- * raw_buf_len into local variables.
- */
- copy_raw_buf = cstate->raw_buf;
- raw_buf_ptr = cstate->raw_buf_index;
- copy_buf_len = cstate->raw_buf_len;
- need_data = false; /* flag to force reading more data */
- hit_eof = false; /* flag indicating no more data available */
-
- for (;;)
- {
- int prev_raw_ptr;
- char c;
-
- /* Load more data if needed */
- if (raw_buf_ptr >= copy_buf_len || need_data)
- {
- /*
- * Transfer any approved data to line_buf; must do this to be sure
- * there is some room in raw_buf.
- */
- if (raw_buf_ptr > cstate->raw_buf_index)
- {
- appendBinaryStringInfo(&cstate->line_buf,
- cstate->raw_buf + cstate->raw_buf_index,
- raw_buf_ptr - cstate->raw_buf_index);
- cstate->raw_buf_index = raw_buf_ptr;
- }
-
- /*
- * Try to read some more data. This will certainly reset
- * raw_buf_index to zero, and raw_buf_ptr must go with it.
- */
- if (!CopyLoadRawBuf(cstate))
- hit_eof = true;
- raw_buf_ptr = 0;
- copy_buf_len = cstate->raw_buf_len;
/*
- * If we are completely out of data, break out of the loop,
- * reporting EOF.
+ * Updating the line count for embedded CR and/or LF chars is
+ * necessarily a little fragile - this test is probably about the best
+ * we can do. (XXX it's arguable whether we should do this at all ---
+ * is cur_lineno a physical or logical count?)
*/
- if (copy_buf_len <= 0)
- {
- result = true;
- break;
- }
- need_data = false;
+ if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
+ cstate->cur_lineno++;
}
- /* OK to fetch a character */
- prev_raw_ptr = raw_buf_ptr;
- c = copy_raw_buf[raw_buf_ptr++];
-
- /*
- * If character is '\\' or '\r', we may need to look ahead below.
- * Force fetch of the next character if we don't already have it. We
- * need to do this before changing CSV state, in case one of these
- * characters is also the quote or escape character.
- *
- * Note: old-protocol does not like forced prefetch, but it's OK here
- * since we cannot validly be at EOF.
- */
- if (c == '\\' || c == '\r')
- {
- if (raw_buf_ptr >= copy_buf_len && !hit_eof)
- {
- raw_buf_ptr = prev_raw_ptr; /* undo fetch */
- need_data = true;
- continue;
- }
- }
-
- /*
- * Dealing with quotes and escapes here is mildly tricky. If the quote
- * char is also the escape char, there's no problem - we just use the
- * char as a toggle. If they are different, we need to ensure that we
- * only take account of an escape inside a quoted field and
- * immediately preceding a quote char, and not the second in a
- * escape-escape sequence.
- */
- if (in_quote && c == escapec)
- last_was_esc = !last_was_esc;
- if (c == quotec && !last_was_esc)
- in_quote = !in_quote;
- if (c != escapec)
- last_was_esc = false;
-
- /*
- * Updating the line count for embedded CR and/or LF chars is
- * necessarily a little fragile - this test is probably about the best
- * we can do. (XXX it's arguable whether we should do this at all ---
- * is cur_lineno a physical or logical count?)
- */
- if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
- cstate->cur_lineno++;
-
- if (c == '\r' && !in_quote)
+ /* Process \r */
+ if (c == '\r' && (!cstate->csv_mode || !in_quote))
{
/* Check for \r\n on first line, _and_ handle \r\n. */
if (cstate->eol_type == EOL_UNKNOWN ||
* Note that if we are at EOF, c will wind up as '\0' because
* of the guaranteed pad of raw_buf.
*/
- if (raw_buf_ptr >= copy_buf_len && !hit_eof)
- {
- raw_buf_ptr = prev_raw_ptr; /* undo fetch */
- need_data = true;
- continue;
- }
+ IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
+
+ /* get next char */
c = copy_raw_buf[raw_buf_ptr];
if (c == '\n')
if (cstate->eol_type == EOL_CRNL)
ereport(ERROR,
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
- errmsg("unquoted carriage return found in data"),
- errhint("Use quoted CSV field to represent carriage return.")));
-
+ errmsg(!cstate->csv_mode ?
+ "literal carriage return found in data" :
+ "unquoted carriage return found in data"),
+ errhint(!cstate->csv_mode ?
+ "Use \"\\r\" to represent carriage return." :
+ "Use quoted CSV field to represent carriage return.")));
/*
* if we got here, it is the first line and we didn't find
* \n, so don't consume the peeked character
else if (cstate->eol_type == EOL_NL)
ereport(ERROR,
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
- errmsg("unquoted carriage return found in CSV data"),
- errhint("Use quoted CSV field to represent carriage return.")));
+ errmsg(!cstate->csv_mode ?
+ "literal carriage return found in data" :
+ "unquoted carriage return found in data"),
+ errhint(!cstate->csv_mode ?
+ "Use \"\\r\" to represent carriage return." :
+ "Use quoted CSV field to represent carriage return.")));
/* If reach here, we have found the line terminator */
break;
}
- if (c == '\n' && !in_quote)
+ /* Process \n */
+ if (c == '\n' && (!cstate->csv_mode || !in_quote))
{
if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL)
ereport(ERROR,
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
- errmsg("unquoted newline found in data"),
- errhint("Use quoted CSV field to represent newline.")));
+ errmsg(!cstate->csv_mode ?
+ "literal newline found in data" :
+ "unquoted newline found in data"),
+ errhint(!cstate->csv_mode ?
+ "Use \"\\n\" to represent newline." :
+ "Use quoted CSV field to represent newline.")));
cstate->eol_type = EOL_NL; /* in case not set yet */
/* If reach here, we have found the line terminator */
break;
}
/*
- * In CSV mode, we only recognize \. at start of line
+ * In CSV mode, we only recognize \. alone on a line. This is
+ * because \. is a valid CSV data value.
*/
- if (c == '\\' && cstate->line_buf.len == 0)
+ if (c == '\\' && (!cstate->csv_mode || first_char_in_line))
{
char c2;
- /*
- * If need more data, go back to loop top to load it.
- */
- if (raw_buf_ptr >= copy_buf_len)
- {
- if (hit_eof)
- {
- /* backslash just before EOF, treat as data char */
- result = true;
- break;
- }
- raw_buf_ptr = prev_raw_ptr; /* undo fetch */
- need_data = true;
- continue;
- }
+ IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
+ IF_NEED_REFILL_AND_EOF_BREAK(0);
- /*
- * Note: we do not change c here since we aren't treating \ as
- * escaping the next character.
+ /* -----
+ * get next character
+ * Note: we do not change c so if it isn't \., we can fall
+ * through and continue processing for client encoding.
+ * -----
*/
c2 = copy_raw_buf[raw_buf_ptr];
*/
if (cstate->eol_type == EOL_CRNL)
{
- if (raw_buf_ptr >= copy_buf_len && !hit_eof)
- {
- raw_buf_ptr = prev_raw_ptr; /* undo fetch */
- need_data = true;
- continue;
- }
+ /* Get the next character */
+ IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
/* if hit_eof, c2 will become '\0' */
c2 = copy_raw_buf[raw_buf_ptr++];
+
if (c2 == '\n')
- ereport(ERROR,
- (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
- errmsg("end-of-copy marker does not match previous newline style")));
- if (c2 != '\r')
- ereport(ERROR,
- (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
- errmsg("end-of-copy marker corrupt")));
- }
- if (raw_buf_ptr >= copy_buf_len && !hit_eof)
- {
- raw_buf_ptr = prev_raw_ptr; /* undo fetch */
- need_data = true;
- continue;
+ {
+ if (!cstate->csv_mode)
+ ereport(ERROR,
+ (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ errmsg("end-of-copy marker does not match previous newline style")));
+ else
+ NO_END_OF_COPY_GOTO;
+ }
+ else if (c2 != '\r')
+ {
+ if (!cstate->csv_mode)
+ ereport(ERROR,
+ (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ errmsg("end-of-copy marker corrupt")));
+ else
+ NO_END_OF_COPY_GOTO;
+ }
}
+
+ /* Get the next character */
+ IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
/* if hit_eof, c2 will become '\0' */
c2 = copy_raw_buf[raw_buf_ptr++];
+
if (c2 != '\r' && c2 != '\n')
- ereport(ERROR,
- (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
- errmsg("end-of-copy marker corrupt")));
+ {
+ if (!cstate->csv_mode)
+ ereport(ERROR,
+ (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ errmsg("end-of-copy marker corrupt")));
+ else
+ NO_END_OF_COPY_GOTO;
+ }
+
if ((cstate->eol_type == EOL_NL && c2 != '\n') ||
(cstate->eol_type == EOL_CRNL && c2 != '\n') ||
(cstate->eol_type == EOL_CR && c2 != '\r'))
+ {
ereport(ERROR,
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
errmsg("end-of-copy marker does not match previous newline style")));
+ }
/*
* Transfer only the data before the \. into line_buf, then
* discard the data and the \. sequence.
*/
if (prev_raw_ptr > cstate->raw_buf_index)
- appendBinaryStringInfo(&cstate->line_buf, cstate->raw_buf + cstate->raw_buf_index,
+ appendBinaryStringInfo(&cstate->line_buf,
+ cstate->raw_buf + cstate->raw_buf_index,
prev_raw_ptr - cstate->raw_buf_index);
cstate->raw_buf_index = raw_buf_ptr;
result = true; /* report EOF */
break;
}
+ else if (!cstate->csv_mode)
+ /*
+ * If we are here, it means we found a backslash followed by
+ * something other than a period. In non-CSV mode, anything
+ * after a backslash is special, so we skip over that second
+ * character too. If we didn't do that \\. would be
+ * considered an eof-of copy, while in non-CVS mode it is a
+ * literal backslash followed by a period. In CSV mode,
+ * backslashes are not special, so we want to process the
+ * character after the backslash just like a normal character,
+ * so we don't increment in those cases.
+ */
+ raw_buf_ptr++;
}
/*
- * Do we need to be careful about trailing bytes of multibyte
- * characters? (See note above about client_only_encoding)
+ * This label is for CSV cases where \. appears at the start of a line,
+ * but there is more text after it, meaning it was a data value.
+ * We are more strict for \. in CSV mode because \. could be a data
+ * value, while in non-CSV mode, \. cannot be a data value.
+ */
+not_end_of_copy:
+
+ /*
+ * Process all bytes of a multi-byte character as a group.
*
- * We assume here that pg_encoding_mblen only looks at the first byte
- * of the character!
+ * We only support multi-byte sequences where the first byte
+ * has the high-bit set, so as an optimization we can avoid
+ * this block entirely if it is not set.
*/
- if (cstate->client_only_encoding)
+ if (cstate->encoding_embeds_ascii && IS_HIGHBIT_SET(c))
{
int mblen;
- s[0] = c;
- mblen = pg_encoding_mblen(cstate->client_encoding, s);
- if (raw_buf_ptr + (mblen - 1) > copy_buf_len)
- {
- if (hit_eof)
- {
- /* consume the partial character (will fail below) */
- raw_buf_ptr = copy_buf_len;
- result = true;
- break;
- }
- raw_buf_ptr = prev_raw_ptr; /* undo fetch */
- need_data = true;
- continue;
- }
+ mblen_str[0] = c;
+ /* All our encodings only read the first byte to get the length */
+ mblen = pg_encoding_mblen(cstate->client_encoding, mblen_str);
+ IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(mblen - 1);
+ IF_NEED_REFILL_AND_EOF_BREAK(mblen - 1);
raw_buf_ptr += mblen - 1;
}
+ first_char_in_line = false;
} /* end of outer loop */
/*
* Transfer any still-uncopied data to line_buf.
*/
- if (raw_buf_ptr > cstate->raw_buf_index)
- {
- appendBinaryStringInfo(&cstate->line_buf,
- cstate->raw_buf + cstate->raw_buf_index,
- raw_buf_ptr - cstate->raw_buf_index);
- cstate->raw_buf_index = raw_buf_ptr;
- }
+ REFILL_LINEBUF;
return result;
}
* safe, because in valid backend encodings, extra bytes of a
* multibyte character never look like ASCII.
*/
- if (cstate->client_only_encoding)
+ if (cstate->encoding_embeds_ascii && IS_HIGHBIT_SET(c))
mblen = pg_encoding_mblen(cstate->client_encoding, string);
CopySendData(cstate, string, mblen);
break;
use_quote = true;
break;
}
- if (cstate->client_only_encoding)
+ if (cstate->encoding_embeds_ascii && IS_HIGHBIT_SET(c))
mblen = pg_encoding_mblen(cstate->client_encoding, tstring);
else
mblen = 1;
{
if (use_quote && (c == quotec || c == escapec))
CopySendChar(cstate, escapec);
- if (cstate->client_only_encoding)
+ if (cstate->encoding_embeds_ascii && IS_HIGHBIT_SET(c))
mblen = pg_encoding_mblen(cstate->client_encoding, string);
else
mblen = 1;