static Processor prologProcessor;
static Processor contentProcessor;
+static Processor cdataSectionProcessor;
static Processor epilogProcessor;
static Processor errorProcessor;
static enum XML_Error
doContent(XML_Parser parser, int startTagLevel, const ENCODING *enc,
const char *start, const char *end, const char **endPtr);
+static enum XML_Error
+doCdataSection(XML_Parser parser, const char **startPtr, const char *end, const char **nextPtr);
static enum XML_Error storeAtts(XML_Parser parser, const ENCODING *, const char *tagName, const char *s);
static int
defineAttribute(ELEMENT_TYPE *type, ATTRIBUTE_ID *, int isCdata, const char *dfltValue);
"reference to external entity in attribute",
"xml processing instruction not at start of external entity",
"unknown encoding",
- "encoding specified in XML declaration is incorrect"
+ "encoding specified in XML declaration is incorrect",
+ "unclosed CDATA section",
};
if (code > 0 && code < sizeof(message)/sizeof(message[0]))
return message[code];
characterDataHandler(userData, &c, 1);
}
break;
- case XML_TOK_CDATA_SECTION:
- if (characterDataHandler) {
- const char *lim = next - enc->minBytesPerChar * 3;
- s += enc->minBytesPerChar * 9;
- do {
- char *dataPtr = dataBuf;
- XmlConvert(enc, XML_UTF8_ENCODING, &s, lim, &dataPtr, dataBufEnd);
- characterDataHandler(userData, dataBuf, dataPtr - dataBuf);
- } while (s != lim);
+ case XML_TOK_CDATA_SECT_OPEN:
+ {
+ enum XML_Error result = doCdataSection(parser, &next, end, nextPtr);
+ if (!next) {
+ processor = cdataSectionProcessor;
+ return result;
+ }
}
break;
case XML_TOK_TRAILING_RSQB:
return XML_ERROR_NONE;
}
+/* The idea here is to avoid using stack for each CDATA section when
+the whole file is parsed with one call.
+
+cdataSectionProcessor resumes a CDATA section via doCdataSection.  If the
+section gets closed (start comes back non-null), contentProcessor is
+reinstalled as the current processor and is run on the remainder of the
+buffer.  Otherwise the result of doCdataSection is returned and the
+current processor is left unchanged. */
+
+static
+enum XML_Error cdataSectionProcessor(XML_Parser parser,
+ const char *start,
+ const char *end,
+ const char **endPtr)
+{
+ enum XML_Error result = doCdataSection(parser, &start, end, endPtr); /* nulls start unless the section closes */
+ if (start) { /* non-null: start is now the position just after the closing token */
+ processor = contentProcessor;
+ return contentProcessor(parser, start, end, endPtr);
+ }
+ return result; /* section not closed: report what doCdataSection returned */
+}
+
+/* startPtr gets set to non-null if the section is closed, and to null if
+the section is not yet closed.
+
+Tokenizes CDATA section content from *startPtr up to end with
+XmlCdataSectionTok and reports character data to characterDataHandler
+(when one is set), converting it with XmlConvert to XML_UTF8_ENCODING.
+
+On XML_TOK_CDATA_SECT_CLOSE, *startPtr is set to the position after the
+token and XML_ERROR_NONE is returned.  When the input runs out first
+(XML_TOK_PARTIAL or XML_TOK_NONE): if nextPtr is non-null, *nextPtr is set
+to the start of the unconsumed token and XML_ERROR_NONE is returned;
+otherwise errorPtr is set to that position and
+XML_ERROR_UNCLOSED_CDATA_SECTION is returned.  An XML_TOK_INVALID token
+sets errorPtr and returns XML_ERROR_INVALID_TOKEN. */
+
+static
+enum XML_Error doCdataSection(XML_Parser parser,
+ const char **startPtr,
+ const char *end,
+ const char **nextPtr)
+{
+ const char *s = *startPtr;
+ *startPtr = 0; /* stays null unless the closing token is seen below */
+ for (;;) {
+ const char *next;
+ int tok = XmlCdataSectionTok(encoding, s, end, &next);
+ switch (tok) {
+ case XML_TOK_CDATA_SECT_CLOSE:
+ *startPtr = next; /* tell the caller where processing continues */
+ return XML_ERROR_NONE;
+ case XML_TOK_DATA_NEWLINE:
+ if (characterDataHandler) {
+ char c = '\n'; /* every newline token is reported as a single '\n' */
+ characterDataHandler(userData, &c, 1);
+ }
+ break;
+ case XML_TOK_DATA_CHARS:
+ if (characterDataHandler) {
+ do { /* one dataBuf-full per pass; presumably XmlConvert advances s -- confirm */
+ char *dataPtr = dataBuf;
+ XmlConvert(encoding, XML_UTF8_ENCODING, &s, next, &dataPtr, dataBufEnd);
+ characterDataHandler(userData, dataBuf, dataPtr - dataBuf);
+ } while (s != next);
+ }
+ break;
+ case XML_TOK_INVALID:
+ errorPtr = next;
+ return XML_ERROR_INVALID_TOKEN;
+ case XML_TOK_PARTIAL:
+ case XML_TOK_NONE:
+ if (nextPtr) { /* hand back the resume position instead of failing */
+ *nextPtr = s;
+ return XML_ERROR_NONE;
+ }
+ errorPtr = s;
+ return XML_ERROR_UNCLOSED_CDATA_SECTION;
+ default:
+ abort(); /* no other token value is handled in this state */
+ }
+ s = next;
+ }
+ /* not reached */
+}
+
+
static enum XML_Error
prologProcessor(XML_Parser parser,
const char *s,
XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF,
XML_ERROR_MISPLACED_XML_PI,
XML_ERROR_UNKNOWN_ENCODING,
- XML_ERROR_INCORRECT_ENCODING
+ XML_ERROR_INCORRECT_ENCODING,
+ XML_ERROR_UNCLOSED_CDATA_SECTION
};
int XMLPARSEAPI XML_GetErrorCode(XML_Parser parser);
#include "nametab.h"
#define VTABLE1 \
- { PREFIX(prologTok), PREFIX(contentTok) }, \
+ { PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \
{ PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
PREFIX(sameName), \
PREFIX(nameMatchesAscii), \
#define XML_TOK_INVALID 0
/* The following tokens are returned by XmlContentTok; some are also
- returned by XmlAttributeValueTok and XmlEntityTok */
+ returned by XmlAttributeValueTok, XmlEntityTok and XmlCdataSectionTok */
#define XML_TOK_START_TAG_WITH_ATTS 1
#define XML_TOK_START_TAG_NO_ATTS 2
#define XML_TOK_END_TAG 5
#define XML_TOK_DATA_CHARS 6
#define XML_TOK_DATA_NEWLINE 7
-#define XML_TOK_CDATA_SECTION 8
+#define XML_TOK_CDATA_SECT_OPEN 8
#define XML_TOK_ENTITY_REF 9
#define XML_TOK_CHAR_REF 10 /* numeric character reference */
#define XML_TOK_CLOSE_PAREN_PLUS 37 /* )+ */
#define XML_TOK_COMMA 38
- /* The following tokens is returned only by XmlAttributeValueTok */
+/* The following token is returned only by XmlAttributeValueTok */
#define XML_TOK_ATTRIBUTE_VALUE_S 39
-#define XML_N_STATES 2
+/* The following token is returned only by XmlCdataSectionTok */
+#define XML_TOK_CDATA_SECT_CLOSE 40
+
+#define XML_N_STATES 3
#define XML_PROLOG_STATE 0
#define XML_CONTENT_STATE 1
+#define XML_CDATA_SECTION_STATE 2
#define XML_N_LITERAL_TYPES 2
#define XML_ATTRIBUTE_VALUE_LITERAL 0
#define XmlContentTok(enc, ptr, end, nextTokPtr) \
XmlTok(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr)
+#define XmlCdataSectionTok(enc, ptr, end, nextTokPtr) \
+ XmlTok(enc, XML_CDATA_SECTION_STATE, ptr, end, nextTokPtr)
+
/* This is used for performing a 2nd-level tokenization on
the content of a literal that has already been returned by XmlTok. */
return XML_TOK_PARTIAL;
}
-/* ptr points to character following "<![" */
+/* Matches the six characters "CDATA[" (each MINBPC bytes wide) starting
+   at ptr.  Returns XML_TOK_PARTIAL when fewer than six characters are
+   available, XML_TOK_INVALID with *nextTokPtr at the offending character
+   on a mismatch, and XML_TOK_CDATA_SECT_OPEN with *nextTokPtr just past
+   the '[' on success.  The section content itself is not scanned here. */
static
int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
const char **nextTokPtr)
{
int i;
- /* CDATA[]]> */
- if (end - ptr < 9 * MINBPC)
+ /* CDATA[ */
+ if (end - ptr < 6 * MINBPC)
return XML_TOK_PARTIAL;
for (i = 0; i < 6; i++, ptr += MINBPC) {
if (!CHAR_MATCHES(enc, ptr, "CDATA["[i])) {
+ /* Report where the bad character is; otherwise the caller's
+ next-token pointer would be left unset on this error path. */
+ *nextTokPtr = ptr;
return XML_TOK_INVALID;
}
}
- end -= 2 * MINBPC;
+ *nextTokPtr = ptr;
+ return XML_TOK_CDATA_SECT_OPEN;
+}
+
+static
+int PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
+ const char **nextTokPtr)
+{ /* Returns the next token inside a CDATA section, starting at ptr:
+ XML_TOK_NONE when ptr == end; XML_TOK_CDATA_SECT_CLOSE for "]]>";
+ XML_TOK_DATA_NEWLINE for CR, CR LF or LF; XML_TOK_DATA_CHARS for a run
+ of other characters; XML_TOK_PARTIAL when the input ends before the
+ token can be decided.  Except for NONE and PARTIAL, *nextTokPtr is set
+ to the end of the token on the paths visible here.
+ NOTE(review): INVALID_CASES is defined elsewhere; presumably it handles
+ invalid and multibyte lead byte types for the first character -- confirm. */
+ if (ptr == end)
+ return XML_TOK_NONE;
+#if MINBPC > 1
+ {
+ size_t n = end - ptr;
+ if (n & (MINBPC - 1)) { /* length is not a multiple of MINBPC (assuming MINBPC is a power of two) */
+ n &= ~(MINBPC - 1); /* ignore the trailing partial unit */
+ if (n == 0)
+ return XML_TOK_PARTIAL;
+ end = ptr + n;
+ }
+ }
+#endif
+ switch (BYTE_TYPE(enc, ptr)) {
+ case BT_RSQB:
+ ptr += MINBPC;
+ if (ptr == end)
+ return XML_TOK_PARTIAL;
+ if (!CHAR_MATCHES(enc, ptr, ']'))
+ break; /* lone ']' is ordinary data; keep scanning after it */
+ ptr += MINBPC;
+ if (ptr == end)
+ return XML_TOK_PARTIAL;
+ if (!CHAR_MATCHES(enc, ptr, '>')) {
+ ptr -= MINBPC; /* back up to the second ']' so the scan below stops there */
+ break;
+ }
+ *nextTokPtr = ptr + MINBPC;
+ return XML_TOK_CDATA_SECT_CLOSE;
+ case BT_CR:
+ ptr += MINBPC;
+ if (ptr == end)
+ return XML_TOK_PARTIAL; /* cannot yet tell whether an LF follows */
+ if (BYTE_TYPE(enc, ptr) == BT_LF)
+ ptr += MINBPC; /* CR LF is consumed as one newline token */
+ *nextTokPtr = ptr;
+ return XML_TOK_DATA_NEWLINE;
+ case BT_LF:
+ *nextTokPtr = ptr + MINBPC;
+ return XML_TOK_DATA_NEWLINE;
+ INVALID_CASES(ptr, nextTokPtr)
+ default:
+ ptr += MINBPC;
+ break;
+ }
while (ptr != end) { /* extend the data run until a character that must start its own token */
switch (BYTE_TYPE(enc, ptr)) {
- INVALID_CASES(ptr, nextTokPtr)
+#define LEAD_CASE(n) \
+ case BT_LEAD ## n: \
+ if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
+ *nextTokPtr = ptr; \
+ return XML_TOK_DATA_CHARS; \
+ } \
+ ptr += n; \
+ break;
+ LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
+#undef LEAD_CASE
+ case BT_NONXML:
+ case BT_MALFORM:
+ case BT_TRAIL:
+ case BT_CR:
+ case BT_LF:
case BT_RSQB:
- if (CHAR_MATCHES(enc, ptr + MINBPC, ']')
- && CHAR_MATCHES(enc, ptr + 2 * MINBPC, '>')) {
- *nextTokPtr = ptr + 3 * MINBPC;
- return XML_TOK_CDATA_SECTION;
- }
- /* fall through */
+ *nextTokPtr = ptr; /* end the run before this character; the next call examines it */
+ return XML_TOK_DATA_CHARS;
default:
ptr += MINBPC;
+ break;
}
}
- return XML_TOK_PARTIAL;
+ *nextTokPtr = ptr; /* ran out of input: everything scanned so far is data */
+ return XML_TOK_DATA_CHARS;
}
/* ptr points to character following "</" */