/* TODO
+method to get name length
+method to extract attribute names (returns number of atts)
+size_t getAttributes(const char *ptr, const char *end, const char **atts, size_t maxAtts)
+
+
+Provide method to count lines/columns.
+
Provide methods to convert to any of UTF-8, UTF-16, UCS-4.
Better prolog tokenization
NAME
PEREF
+MatchEndTag(endTagStart, endTagEnd, startTagPtr, startTagEnd)
+
*/
#ifdef _MSC_VER
#undef IS_NMSTRT_CHAR
const struct normal_encoding utf8_encoding = {
- { { PREFIX(prologTok), PREFIX(contentTok) }, 1 },
+ { { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 1 },
#include "asciitab.h"
#include "utf8tab.h"
};
#undef IS_NMSTRT_CHAR
const struct encoding little2_encoding = {
- { PREFIX(prologTok), PREFIX(contentTok) }, 2
+ { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 2
};
#undef PREFIX
#undef IS_NMSTRT_CHAR
const struct encoding big2_encoding = {
- { PREFIX(prologTok), PREFIX(contentTok) }, 2
+ { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 2
};
#undef PREFIX
#define XML_TOK_PARTIAL_CHAR -2 /* only part of a multibyte sequence */
#define XML_TOK_PARTIAL -1 /* only part of a token */
#define XML_TOK_INVALID 0
-#define XML_TOK_BOM 1 /* Byte order mark */
-#define XML_TOK_COMMENT 2
-#define XML_TOK_PI 3 /* processing instruction */
-
-/* The following tokens are returned only by XmlPrologTok */
-#define XML_TOK_LITERAL 4
-#define XML_TOK_PROLOG_CHARS 5
-#define XML_TOK_PROLOG_S 6
/* The following token is returned by XmlPrologTok when it detects the end
of the prolog and is also returned by XmlContentTok */
-#define XML_TOK_START_TAG 7
+#define XML_TOK_START_TAG_WITH_ATTS 1
+#define XML_TOK_START_TAG_NO_ATTS 2
+#define XML_TOK_EMPTY_ELEMENT_WITH_ATTS 3 /* empty element tag <e/> */
+#define XML_TOK_EMPTY_ELEMENT_NO_ATTS 4
/* The following tokens are returned only by XmlContentTok */
-#define XML_TOK_END_TAG 8
-#define XML_TOK_EMPTY_ELEMENT 9 /* empty element tag <e/> */
-#define XML_TOK_DATA_CHARS 10
-#define XML_TOK_CDATA_SECTION 11
-#define XML_TOK_ENTITY_REF 12
-#define XML_TOK_CHAR_REF 13 /* numeric character reference */
+#define XML_TOK_END_TAG 5
+#define XML_TOK_DATA_CHARS 6
+#define XML_TOK_CDATA_SECTION 7
+#define XML_TOK_ENTITY_REF 8
+#define XML_TOK_CHAR_REF 9 /* numeric character reference */
+
+/* The following tokens may be returned by both XmlPrologTok and XmlContentTok */
+#define XML_TOK_PI 10 /* processing instruction */
+#define XML_TOK_COMMENT 11
+#define XML_TOK_BOM 12 /* Byte order mark */
+
+/* The following tokens are returned only by XmlPrologTok */
+#define XML_TOK_LITERAL 13
+#define XML_TOK_PROLOG_CHARS 14
+#define XML_TOK_PROLOG_S 15
#define XML_NSTATES 2
#define XML_PROLOG_STATE 0
const char *,
const char *,
const char **);
+ int (*sameName)(const struct encoding *,
+ const char *, const char *);
+ int (*getAtts)(const struct encoding *enc, const char *ptr,
+ int attsMax, const char **atts);
int minBytesPerChar;
} ENCODING;
#define XmlContentTok(enc, ptr, end, nextTokPtr) \
XmlTok(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr)
+#define XmlSameName(enc, ptr1, ptr2) (((enc)->sameName)(enc, ptr1, ptr2))
+
+#define XmlGetAttributes(enc, ptr, attsMax, atts) \
+ (((enc)->getAtts)(enc, ptr, attsMax, atts))
+
typedef struct {
ENCODING initEnc;
const ENCODING **encPtr;
continue;
case BT_GT:
*nextTokPtr = ptr + MINBPC;
- return XML_TOK_START_TAG;
+ return XML_TOK_START_TAG_WITH_ATTS;
case BT_SOL:
ptr += MINBPC;
if (ptr == end)
return XML_TOK_INVALID;
}
*nextTokPtr = ptr + MINBPC;
- return XML_TOK_EMPTY_ELEMENT;
+ return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
default:
*nextTokPtr = ptr;
return XML_TOK_INVALID;
case BT_GT:
gt:
*nextTokPtr = ptr + MINBPC;
- return XML_TOK_START_TAG;
+ return XML_TOK_START_TAG_NO_ATTS;
case BT_SOL:
sol:
ptr += MINBPC;
return XML_TOK_INVALID;
}
*nextTokPtr = ptr + MINBPC;
- return XML_TOK_EMPTY_ELEMENT;
+ return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
default:
*nextTokPtr = ptr;
return XML_TOK_INVALID;
return XML_TOK_PROLOG_CHARS;
}
+/* Collects the attribute names of a start-tag or empty-element tag.
+
+This must only be called for a well-formed start-tag or empty element tag:
+nothing is validated and the scan has no end bound; it stops at the first
+'>' or '/' found outside an attribute value.  The first character at ptr is
+skipped and the scan begins inside the element name.
+
+Returns the number of attributes, which may be larger than attsMax.
+Pointers to the names of up to the first attsMax attributes are stored
+in atts. */
+static
+int PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
+                    int attsMax, const char **atts)
+{
+  /* inName covers the element name as well as attribute names. */
+  enum { other, inName, inValue } state = inName;
+  int nAtts = 0;
+  /* Quote type that opened the current value; only meaningful while
+     state == inValue.  Initialized so it is never read indeterminate. */
+  int open = 0;
+
+  for (ptr += MINBPC;; ptr += MINBPC) {
+    switch (BYTE_TYPE(enc, ptr)) {
+/* Record and count a name only on the transition other -> inName, so the
+   element name (initial state inName) is never counted. */
+#define START_NAME \
+      if (state == other) { \
+        if (nAtts < attsMax) \
+          atts[nAtts] = ptr; \
+        ++nAtts; \
+        state = inName; \
+      }
+/* Multibyte lead: skip the trailing bytes; the loop adds the last MINBPC. */
+#define LEAD_CASE(n) \
+    case BT_LEAD ## n: START_NAME ptr += (n - MINBPC); break;
+    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) LEAD_CASE(5) LEAD_CASE(6)
+#undef LEAD_CASE
+    case BT_NONASCII:
+    case BT_NMSTRT:
+    case BT_HEX:
+      START_NAME
+      break;
+#undef START_NAME
+    case BT_QUOT:
+      /* A quote seen outside a value always opens one.  The state is
+         usually still inName here (name="value" has no whitespace before
+         the '='), so testing for 'other' would miss the value entirely. */
+      if (state != inValue) {
+        state = inValue;
+        open = BT_QUOT;
+      }
+      else if (open == BT_QUOT)
+        state = other;
+      break;
+    case BT_APOS:
+      if (state != inValue) {
+        state = inValue;
+        open = BT_APOS;
+      }
+      else if (open == BT_APOS)
+        state = other;
+      break;
+    case BT_S:
+      /* This case ensures that the first attribute name is counted:
+         whitespace after the element name leaves the inName state.
+         Whitespace inside a value must not change the state. */
+      if (state == inName)
+        state = other;
+      break;
+    case BT_GT:
+    case BT_SOL:
+      /* '>' and '/' are ordinary data inside an attribute value. */
+      if (state != inValue)
+        return nAtts;
+      break;
+    default:
+      break;
+    }
+  }
+  /* not reached */
+}
+
+/* Compares the names starting at ptr1 and ptr2.
+
+Returns 1 if both names consist of the same characters and end at the same
+point, 0 otherwise.  A name ends at the first character whose byte type is
+not one of the name byte types handled below.  Neither pointer is bounds
+checked, so both must point into data where a non-name character follows
+the name. */
+static
+int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
+{
+  for (;;) {
+    switch (BYTE_TYPE(enc, ptr1)) {
+/* Each lead case compares one byte and falls into the next smaller one, so
+   BT_LEADn compares n - 1 bytes here and the last byte after the macros. */
+#define LEAD_CASE(n) \
+    case BT_LEAD ## n: \
+      if (*ptr1++ != *ptr2++) \
+        return 0;
+    LEAD_CASE(6) LEAD_CASE(5) LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
+#undef LEAD_CASE
+      /* final byte of the multibyte sequence */
+      if (*ptr1++ != *ptr2++)
+        return 0;
+      break;
+    case BT_NONASCII:
+    case BT_NMSTRT:
+    case BT_HEX:
+    case BT_DIGIT:
+    case BT_NAME:
+    case BT_MINUS:
+      {
+        /* Compare every byte of the code unit; checking only the first
+           byte treats distinct characters as equal when MINBPC > 1. */
+        int i;
+        for (i = 0; i < MINBPC; i++) {
+          if (ptr2[i] != ptr1[i])
+            return 0;
+        }
+      }
+      ptr1 += MINBPC;
+      ptr2 += MINBPC;
+      break;
+    default:
+      /* First name has ended.  With single-byte units an identical byte in
+         the second name means it ended too; with wider units one byte does
+         not identify the character, so always classify ptr2 instead. */
+      if (MINBPC == 1 && *ptr1 == *ptr2)
+        return 1;
+      switch (BYTE_TYPE(enc, ptr2)) {
+      case BT_LEAD2:
+      case BT_LEAD3:
+      case BT_LEAD4:
+      case BT_LEAD5:
+      case BT_LEAD6:
+      case BT_NONASCII:
+      case BT_NMSTRT:
+      case BT_HEX:
+      case BT_DIGIT:
+      case BT_NAME:
+      case BT_MINUS:
+        /* second name continues past the end of the first */
+        return 0;
+      default:
+        return 1;
+      }
+    }
+  }
+  /* not reached */
+}
+
#undef DO_LEAD_CASE
#undef MULTIBYTE_CASES
#undef INVALID_CASES