/* 0x00 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
/* 0x04 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
-/* 0x08 */ BT_NONXML, BT_S, BT_S, BT_NONXML,
-/* 0x0C */ BT_NONXML, BT_S, BT_NONXML, BT_NONXML,
+/* 0x08 */ BT_NONXML, BT_S, BT_LF, BT_NONXML,
+/* 0x0C */ BT_NONXML, BT_CR, BT_NONXML, BT_NONXML,
/* 0x10 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
/* 0x14 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
/* 0x18 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
Provide method to get name length.
-Provide method to count lines/columns.
-
Provide methods to convert to any of UTF-8, UTF-16, UCS-4.
Tokenize prologs in a way useful for well-formedness checking
#undef IS_NMSTRT_CHAR
const struct normal_encoding utf8_encoding = {
- { { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 1 },
+ { { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), PREFIX(updatePosition), 1 },
#include "asciitab.h"
#include "utf8tab.h"
};
#undef IS_NMSTRT_CHAR
const struct encoding little2_encoding = {
- { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 2
+ { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), PREFIX(updatePosition), 2
};
#undef PREFIX
#undef IS_NMSTRT_CHAR
const struct encoding big2_encoding = {
- { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 2
+ { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), PREFIX(updatePosition), 2
};
#undef PREFIX
return initScan(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr);
}
+static /* Position hook that XmlInitEncoding installs on initEnc: it ignores
+          its enc argument and delegates to normal_updatePosition using the
+          UTF-8 encoding object (presumably because the real encoding is
+          not known yet at this stage -- confirm against callers). */
+void initUpdatePosition(const ENCODING *enc, const char *ptr,
+ const char *end, POSITION *pos)
+{
+ normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
+}
+
void XmlInitEncoding(INIT_ENCODING *p, const ENCODING **encPtr)
{
p->initEnc.scanners[XML_PROLOG_STATE] = initScanProlog;
p->initEnc.scanners[XML_CONTENT_STATE] = initScanContent;
+ p->initEnc.updatePosition = initUpdatePosition;
p->initEnc.minBytesPerChar = 1;
p->encPtr = encPtr;
*encPtr = &(p->initEnc);
#define XML_PROLOG_STATE 0
#define XML_CONTENT_STATE 1
+typedef struct position { /* line/column cursor advanced by XmlUpdatePosition */
+ /* zero-based: the first line and the first column are 0, not 1 */
+ unsigned long lineNumber;
+ unsigned long columnNumber;
+ /* non-zero when the last character counted was a CR at the very end of
+ the scanned range; an LF that immediately follows in the next range
+ should then be ignored instead of counted as another line break */
+ int ignoreInitialLF;
+} POSITION;
+
typedef struct encoding {
int (*scanners[XML_NSTATES])(const struct encoding *,
const char *,
const char *, const char *);
int (*getAtts)(const struct encoding *enc, const char *ptr,
int attsMax, const char **atts);
+ void (*updatePosition)(const struct encoding *,
+ const char *ptr,
+ const char *end,
+ POSITION *);
int minBytesPerChar;
} ENCODING;
#define XmlGetAttributes(enc, ptr, attsMax, atts) \
(((enc)->getAtts)(enc, ptr, attsMax, atts))
+#define XmlUpdatePosition(enc, ptr, end, pos) \
+ (((enc)->updatePosition)(enc, ptr, end, pos)) /* calls enc's updatePosition hook */
+
typedef struct {
ENCODING initEnc;
const ENCODING **encPtr;
while (ptr != end) {
switch (BYTE_TYPE(enc, ptr)) {
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
- case BT_S:
+ case BT_S: case BT_CR: case BT_LF:
ptr += MINBPC;
while (ptr != end) {
switch (BYTE_TYPE(enc, ptr)) {
while (ptr != end) {
switch (BYTE_TYPE(enc, ptr)) {
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
- case BT_S:
+ case BT_S: case BT_CR: case BT_LF:
for (ptr += MINBPC; ptr != end; ptr += MINBPC) {
switch (BYTE_TYPE(enc, ptr)) {
- case BT_S:
+ case BT_S: case BT_CR: case BT_LF:
break;
case BT_GT:
*nextTokPtr = ptr + MINBPC;
while (ptr != end) {
switch (BYTE_TYPE(enc, ptr)) {
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
- case BT_S:
+ case BT_S: case BT_CR: case BT_LF:
for (;;) {
int t;
t = BYTE_TYPE(enc, ptr);
if (t == BT_EQUALS)
break;
- if (t != BT_S) {
+ switch (t) {
+ case BT_S:
+ case BT_LF:
+ case BT_CR:
+ break;
+ default:
*nextTokPtr = ptr;
return XML_TOK_INVALID;
}
open = BYTE_TYPE(enc, ptr);
if (open == BT_QUOT || open == BT_APOS)
break;
- if (open != BT_S) {
+ switch (open) {
+ case BT_S:
+ case BT_LF:
+ case BT_CR:
+ break;
+ default:
*nextTokPtr = ptr;
return XML_TOK_INVALID;
}
return XML_TOK_PARTIAL;
switch (BYTE_TYPE(enc, ptr)) {
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
- case BT_S:
+ case BT_S: case BT_CR: case BT_LF:
continue;
case BT_GT:
*nextTokPtr = ptr + MINBPC;
while (ptr != end) {
switch (BYTE_TYPE(enc, ptr)) {
CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
- case BT_S:
+ case BT_S: case BT_CR: case BT_LF:
{
ptr += MINBPC;
while (ptr != end) {
goto gt;
case BT_SOL:
goto sol;
- case BT_S:
+ case BT_S: case BT_CR: case BT_LF:
ptr += MINBPC;
continue;
default:
*nextTokPtr = ptr;
return XML_TOK_INVALID;
}
- case BT_S:
- do {
+ case BT_S: case BT_CR: case BT_LF:
+ for (;;) {
ptr += MINBPC;
- } while (ptr != end && BYTE_TYPE(enc, ptr) == BT_S);
+ if (ptr == end)
+ break;
+ switch (BYTE_TYPE(enc, ptr)) {
+ case BT_S: case BT_CR: case BT_LF:
+ break;
+ default:
+ *nextTokPtr = ptr;
+ return XML_TOK_PROLOG_S;
+ }
+ }
*nextTokPtr = ptr;
return XML_TOK_PROLOG_S;
default:
case BT_NONXML:
case BT_MALFORM:
case BT_TRAIL:
- case BT_S:
+ case BT_S: case BT_CR: case BT_LF:
*nextTokPtr = ptr;
return XML_TOK_PROLOG_CHARS;
MULTIBYTE_CASES(ptr, end, (*nextTokPtr = ptr, XML_TOK_PROLOG_CHARS))
else if (open == BT_APOS)
state = other;
break;
- case BT_S:
+ case BT_S: case BT_CR: case BT_LF:
/* This case ensures that the first attribute name is counted
Apart from that we could just change state on the quote. */
if (state == inName)
/* not reached */
}
+/* Advance *pos over the characters in [ptr, end).
+   Each LF, CR or CR LF pair counts as one line break: lineNumber is
+   incremented and columnNumber restarts at 0.  Any other single
+   character adds one to columnNumber (multi-byte sequences are stepped
+   over by MULTIBYTE_CASES, which is defined outside this function).
+   If the range ends with CR, pos->ignoreInitialLF is set so that an LF
+   starting the next range is skipped instead of being counted as a
+   second line break. */
+static
+void PREFIX(updatePosition)(const ENCODING *enc,
+                            const char *ptr,
+                            const char *end,
+                            POSITION *pos)
+{
+  if (pos->ignoreInitialLF) {
+    /* Nothing to inspect yet: leave the flag set for the next call. */
+    if (ptr == end)
+      return;
+    if (CHAR_MATCHES(enc, ptr, '\n'))
+      ptr += MINBPC;
+    pos->ignoreInitialLF = 0;
+  }
+  while (ptr != end) {
+    switch (BYTE_TYPE(enc, ptr)) {
+    MULTIBYTE_CASES(ptr, end, ;/* hack! */)
+    case BT_LF:
+      /* columnNumber is unsigned long, so the "one before zero" value
+         has to be (unsigned long)-1.  (unsigned)-1 is only UINT_MAX, and
+         the increment after the switch would then not wrap to 0 on
+         platforms where long is wider than int. */
+      pos->columnNumber = (unsigned long)-1;
+      pos->lineNumber++;
+      ptr += MINBPC;
+      break;
+    case BT_CR:
+      pos->lineNumber++;
+      ptr += MINBPC;
+      if (ptr == end) {
+        /* A paired LF, if there is one, belongs to the next range. */
+        pos->ignoreInitialLF = 1;
+        pos->columnNumber = 0;
+        return;
+      }
+      pos->columnNumber = (unsigned long)-1;
+      /* Treat CR LF as a single line break. */
+      if (CHAR_MATCHES(enc, ptr, '\n'))
+        ptr += MINBPC;
+      break;
+    default:
+      ptr += MINBPC;
+      break;
+    }
+    /* Wraps the (unsigned long)-1 set above to 0 after a line break. */
+    pos->columnNumber++;
+  }
+}
+
#undef DO_LEAD_CASE
#undef MULTIBYTE_CASES
#undef INVALID_CASES
BT_LSQB,
BT_RSQB,
BT_S,
+ BT_CR,
+ BT_LF,
BT_NMSTRT,
BT_HEX,
BT_DIGIT,
#include <stdlib.h>
+#include <string.h>
#include "wfcheck.h"
#ifdef _MSC_VER
static
int skipProlog(const char **s, const char *end, const char **nextTokP,
const ENCODING **enc);
+static
+void setPosition(const ENCODING *enc,
+ const char *start, const char *end,
+ const char **badPtr, unsigned long *badLine, unsigned long *badCol);
enum WfCheckResult
-wfCheck(const char *s, size_t n, const char **badPtr)
+wfCheck(const char *s, size_t n,
+ const char **badPtr, unsigned long *badLine, unsigned long *badCol)
{
unsigned nElements = 0;
unsigned nAtts = 0;
for (;;) {
switch (tok) {
case XML_TOK_NONE:
- *badPtr = s;
+ setPosition(enc, start, s, badPtr, badLine, badCol);
RETURN_CLEANUP(noElements);
case XML_TOK_INVALID:
- *badPtr = next;
+ setPosition(enc, start, next, badPtr, badLine, badCol);
RETURN_CLEANUP(invalidToken);
case XML_TOK_PARTIAL:
- *badPtr = s;
+ setPosition(enc, start, s, badPtr, badLine, badCol);
RETURN_CLEANUP(unclosedToken);
case XML_TOK_PARTIAL_CHAR:
- *badPtr = s;
+ setPosition(enc, start, s, badPtr, badLine, badCol);
RETURN_CLEANUP(partialChar);
case XML_TOK_EMPTY_ELEMENT_NO_ATTS:
nElements++;
int j;
for (j = 0; j < i; j++) {
if (XmlSameName(enc, atts[i], atts[j])) {
- *badPtr = atts[i];
+ setPosition(enc, start, atts[i], badPtr, badLine, badCol);
RETURN_CLEANUP(duplicateAttribute);
}
}
case XML_TOK_END_TAG:
--level;
if (!XmlSameName(enc, startName[level], s + enc->minBytesPerChar * 2)) {
- *badPtr = s;
+ setPosition(enc, start, s, badPtr, badLine, badCol);
RETURN_CLEANUP(tagMismatch);
}
break;
break;
default:
if (tok > 0) {
- *badPtr = s;
+ setPosition(enc, start, s, badPtr, badLine, badCol);
RETURN_CLEANUP(junkAfterDocElement);
}
break;
}
/* not reached */
}
+
+/* Report an error location: store end in *badPtr and the line/column
+   reached by scanning [start, end) from a zeroed POSITION in *badLine
+   and *badCol. */
+static
+void setPosition(const ENCODING *enc,
+                 const char *start, const char *end,
+                 const char **badPtr, unsigned long *badLine, unsigned long *badCol)
+{
+  POSITION where;
+
+  *badPtr = end;
+  /* Start from line 0, column 0 with no pending CR. */
+  memset(&where, 0, sizeof where);
+  XmlUpdatePosition(enc, start, end, &where);
+  *badCol = where.columnNumber;
+  *badLine = where.lineNumber;
+}
junkAfterDocElement
};
-enum WfCheckResult wfCheck(const char *s, size_t n, const char **badPtr);
+enum WfCheckResult wfCheck(const char *s, size_t n,
+ const char **errorPtr,
+ unsigned long *errorLineNumber,
+ unsigned long *errorColNumber);
HANDLE f;
HANDLE m;
DWORD size;
+ DWORD sizeHi;
const char *p;
- const char *bad = 0;
+ const char *badPtr = 0;
+ unsigned long badLine = 0;
+ unsigned long badCol = 0;
int ret;
enum WfCheckResult result;
fprintf(stderr, "%s: CreateFile failed\n", name);
return 0;
}
- size = GetFileSize(f, NULL);
+ size = GetFileSize(f, &sizeHi);
+ if (sizeHi) {
+ fprintf(stderr, "%s: too big (limit 2Gb)\n", name);
+ return 0;
+ }
+ /* CreateFileMapping barfs on zero length files */
+ if (size == 0) {
+ fprintf(stderr, "%s: zero-length file\n", name);
+ return 0;
+ }
m = CreateFileMapping(f, NULL, PAGE_READONLY, 0, 0, NULL);
if (m == NULL) {
fprintf(stderr, "%s: CreateFileMapping failed\n", name);
fprintf(stderr, "%s: MapViewOfFile failed\n", name);
return 0;
}
- result = wfCheck(p, size, &bad);
+ result = wfCheck(p, size, &badPtr, &badLine, &badCol);
if (result) {
static const char *message[] = {
0,
"out of memory",
"no element found",
- "invalid token after %lu bytes",
- "unclosed token started after %lu bytes",
- "unclosed token started after %lu bytes",
- "mismatched tag after %lu bytes",
- "duplicate attribute after %lu bytes",
- "junk after document element after %lu bytes",
+ "invalid token",
+ "unclosed token",
+ "unclosed token",
+ "mismatched tag",
+ "duplicate attribute",
+ "junk after document element",
};
- fprintf(stderr, "%s: ", name);
- fprintf(stderr, message[result], (unsigned long)(bad - p));
+ fprintf(stderr, "%s:", name);
+ if (badPtr != 0)
+ fprintf(stderr, "%lu:%lu:", badLine+1, badCol);
+ fprintf(stderr, "E: %s", message[result]);
putc('\n', stderr);
ret = 1;
}