From: James Clark Date: Wed, 12 Nov 1997 10:38:58 +0000 (+0000) Subject: Support for line and column numbers X-Git-Tag: REC1_0~68 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9651443ca7e1384b2973a7e8a745e8ab3650cce9;p=libexpat Support for line and column numbers --- diff --git a/expat/xmltok/asciitab.h b/expat/xmltok/asciitab.h index 49388455..eec36387 100755 --- a/expat/xmltok/asciitab.h +++ b/expat/xmltok/asciitab.h @@ -1,7 +1,7 @@ /* 0x00 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML, /* 0x04 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML, -/* 0x08 */ BT_NONXML, BT_S, BT_S, BT_NONXML, -/* 0x0C */ BT_NONXML, BT_S, BT_NONXML, BT_NONXML, +/* 0x08 */ BT_NONXML, BT_S, BT_LF, BT_NONXML, +/* 0x0C */ BT_NONXML, BT_CR, BT_NONXML, BT_NONXML, /* 0x10 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML, /* 0x14 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML, /* 0x18 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML, diff --git a/expat/xmltok/xmltok.c b/expat/xmltok/xmltok.c index d7dfeb19..ce835a24 100755 --- a/expat/xmltok/xmltok.c +++ b/expat/xmltok/xmltok.c @@ -2,8 +2,6 @@ Provide method to get name length. -Provide method to count lines/columns. - Provide methods to convert to any of UTF-8, UTF-18, UCS-4. 
Tokenize prologs in a way useful for well-formedness checking @@ -82,7 +80,7 @@ struct normal_encoding { #undef IS_NMSTRT_CHAR const struct normal_encoding utf8_encoding = { - { { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 1 }, + { { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), PREFIX(updatePosition), 1 }, #include "asciitab.h" #include "utf8tab.h" }; @@ -131,7 +129,7 @@ static int unicode_byte_type(char hi, char lo) #undef IS_NMSTRT_CHAR const struct encoding little2_encoding = { - { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 2 + { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), PREFIX(updatePosition), 2 }; #undef PREFIX @@ -156,7 +154,7 @@ const struct encoding little2_encoding = { #undef IS_NMSTRT_CHAR const struct encoding big2_encoding = { - { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 2 + { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), PREFIX(updatePosition), 2 }; #undef PREFIX @@ -215,10 +213,18 @@ int initScanContent(const ENCODING *enc, const char *ptr, const char *end, return initScan(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr); } +static +void initUpdatePosition(const ENCODING *enc, const char *ptr, + const char *end, POSITION *pos) +{ + normal_updatePosition(&utf8_encoding.enc, ptr, end, pos); +} + void XmlInitEncoding(INIT_ENCODING *p, const ENCODING **encPtr) { p->initEnc.scanners[XML_PROLOG_STATE] = initScanProlog; p->initEnc.scanners[XML_CONTENT_STATE] = initScanContent; + p->initEnc.updatePosition = initUpdatePosition; p->initEnc.minBytesPerChar = 1; p->encPtr = encPtr; *encPtr = &(p->initEnc); diff --git a/expat/xmltok/xmltok.h b/expat/xmltok/xmltok.h index 3583e068..371daf63 100755 --- a/expat/xmltok/xmltok.h +++ b/expat/xmltok/xmltok.h @@ -45,6 +45,15 @@ of the prolog and is also returned by XmlContentTok */ #define XML_PROLOG_STATE 0 #define 
XML_CONTENT_STATE 1 +typedef struct position { + /* first line and first column are 0 not 1 */ + unsigned long lineNumber; + unsigned long columnNumber; + /* if the last character counted was CR, then an immediately + following LF should be ignored */ + int ignoreInitialLF; +} POSITION; + typedef struct encoding { int (*scanners[XML_NSTATES])(const struct encoding *, const char *, @@ -54,6 +63,10 @@ typedef struct encoding { const char *, const char *); int (*getAtts)(const struct encoding *enc, const char *ptr, int attsMax, const char **atts); + void (*updatePosition)(const struct encoding *, + const char *ptr, + const char *end, + POSITION *); int minBytesPerChar; } ENCODING; @@ -92,6 +105,9 @@ literals, comments and processing instructions. #define XmlGetAttributes(enc, ptr, attsMax, atts) \ (((enc)->getAtts)(enc, ptr, attsMax, atts)) +#define XmlUpdatePosition(enc, ptr, end, pos) \ + (((enc)->updatePosition)(enc, ptr, end, pos)) + typedef struct { ENCODING initEnc; const ENCODING **encPtr; diff --git a/expat/xmltok/xmltok_impl.c b/expat/xmltok/xmltok_impl.c index a71d76c4..be241e41 100755 --- a/expat/xmltok/xmltok_impl.c +++ b/expat/xmltok/xmltok_impl.c @@ -162,7 +162,7 @@ int PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end, while (ptr != end) { switch (BYTE_TYPE(enc, ptr)) { CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) - case BT_S: + case BT_S: case BT_CR: case BT_LF: ptr += MINBPC; while (ptr != end) { switch (BYTE_TYPE(enc, ptr)) { @@ -252,10 +252,10 @@ int PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end, while (ptr != end) { switch (BYTE_TYPE(enc, ptr)) { CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) - case BT_S: + case BT_S: case BT_CR: case BT_LF: for (ptr += MINBPC; ptr != end; ptr += MINBPC) { switch (BYTE_TYPE(enc, ptr)) { - case BT_S: + case BT_S: case BT_CR: case BT_LF: break; case BT_GT: *nextTokPtr = ptr + MINBPC; @@ -380,7 +380,7 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char 
*end, while (ptr != end) { switch (BYTE_TYPE(enc, ptr)) { CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) - case BT_S: + case BT_S: case BT_CR: case BT_LF: for (;;) { int t; @@ -390,7 +390,12 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end, t = BYTE_TYPE(enc, ptr); if (t == BT_EQUALS) break; - if (t != BT_S) { + switch (t) { + case BT_S: + case BT_LF: + case BT_CR: + break; + default: *nextTokPtr = ptr; return XML_TOK_INVALID; } @@ -407,7 +412,12 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end, open = BYTE_TYPE(enc, ptr); if (open == BT_QUOT || open == BT_APOS) break; - if (open != BT_S) { + switch (open) { + case BT_S: + case BT_LF: + case BT_CR: + break; + default: *nextTokPtr = ptr; return XML_TOK_INVALID; } @@ -449,7 +459,7 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end, return XML_TOK_PARTIAL; switch (BYTE_TYPE(enc, ptr)) { CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) - case BT_S: + case BT_S: case BT_CR: case BT_LF: continue; case BT_GT: *nextTokPtr = ptr + MINBPC; @@ -513,7 +523,7 @@ int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end, while (ptr != end) { switch (BYTE_TYPE(enc, ptr)) { CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) - case BT_S: + case BT_S: case BT_CR: case BT_LF: { ptr += MINBPC; while (ptr != end) { @@ -523,7 +533,7 @@ int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end, goto gt; case BT_SOL: goto sol; - case BT_S: + case BT_S: case BT_CR: case BT_LF: ptr += MINBPC; continue; default: @@ -697,10 +707,19 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, *nextTokPtr = ptr; return XML_TOK_INVALID; } - case BT_S: - do { + case BT_S: case BT_CR: case BT_LF: + for (;;) { ptr += MINBPC; - } while (ptr != end && BYTE_TYPE(enc, ptr) == BT_S); + if (ptr == end) + break; + switch (BYTE_TYPE(enc, ptr)) { + case BT_S: case BT_CR: case BT_LF: + break; + default: + *nextTokPtr = ptr; + return 
XML_TOK_PROLOG_S; + } + } *nextTokPtr = ptr; return XML_TOK_PROLOG_S; default: @@ -715,7 +734,7 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, case BT_NONXML: case BT_MALFORM: case BT_TRAIL: - case BT_S: + case BT_S: case BT_CR: case BT_LF: *nextTokPtr = ptr; return XML_TOK_PROLOG_CHARS; MULTIBYTE_CASES(ptr, end, (*nextTokPtr = ptr, XML_TOK_PROLOG_CHARS)) @@ -774,7 +793,7 @@ int PREFIX(getAtts)(const ENCODING *enc, const char *ptr, else if (open == BT_APOS) state = other; break; - case BT_S: + case BT_S: case BT_CR: case BT_LF: /* This case ensures that the first attribute name is counted Apart from that we could just change state on the quote. */ if (state == inName) @@ -842,6 +861,47 @@ int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2) /* not reached */ } +static +void PREFIX(updatePosition)(const ENCODING *enc, + const char *ptr, + const char *end, + POSITION *pos) +{ + if (pos->ignoreInitialLF) { + if (ptr == end) + return; + if (CHAR_MATCHES(enc, ptr, '\n')) + ptr += MINBPC; + pos->ignoreInitialLF = 0; + } + while (ptr != end) { + switch (BYTE_TYPE(enc, ptr)) { + MULTIBYTE_CASES(ptr, end, ;/* hack! 
*/) + case BT_LF: + pos->columnNumber = (unsigned long)-1; /* same width as the unsigned long field, so the increment after the switch wraps it to 0 */ + pos->lineNumber++; + ptr += MINBPC; + break; + case BT_CR: + pos->lineNumber++; + ptr += MINBPC; + if (ptr == end) { + pos->ignoreInitialLF = 1; + pos->columnNumber = 0; + return; + } + pos->columnNumber = (unsigned long)-1; + if (CHAR_MATCHES(enc, ptr, '\n')) + ptr += MINBPC; + break; + default: + ptr += MINBPC; + break; + } + pos->columnNumber++; + } +} + #undef DO_LEAD_CASE #undef MULTIBYTE_CASES #undef INVALID_CASES diff --git a/expat/xmltok/xmltok_impl.h b/expat/xmltok/xmltok_impl.h index 2aa55c93..ffad0279 100755 --- a/expat/xmltok/xmltok_impl.h +++ b/expat/xmltok/xmltok_impl.h @@ -21,6 +21,8 @@ enum { BT_LSQB, BT_RSQB, BT_S, + BT_CR, + BT_LF, BT_NMSTRT, BT_HEX, BT_DIGIT, diff --git a/expat/xmlwf/wfcheck.c b/expat/xmlwf/wfcheck.c index 7d515b08..5eaedfd4 100755 --- a/expat/xmlwf/wfcheck.c +++ b/expat/xmlwf/wfcheck.c @@ -1,4 +1,5 @@ #include +#include #include "wfcheck.h" #ifdef _MSC_VER @@ -10,9 +11,14 @@ static int skipProlog(const char **s, const char *end, const char **nextTokP, const ENCODING **enc); +static +void setPosition(const ENCODING *enc, + const char *start, const char *end, + const char **badPtr, unsigned long *badLine, unsigned long *badCol); enum WfCheckResult -wfCheck(const char *s, size_t n, const char **badPtr) +wfCheck(const char *s, size_t n, + const char **badPtr, unsigned long *badLine, unsigned long *badCol) { unsigned nElements = 0; unsigned nAtts = 0; @@ -33,16 +39,16 @@ wfCheck(const char *s, size_t n, const char **badPtr) for (;;) { switch (tok) { case XML_TOK_NONE: - *badPtr = s; + setPosition(enc, start, s, badPtr, badLine, badCol); RETURN_CLEANUP(noElements); case XML_TOK_INVALID: - *badPtr = next; + setPosition(enc, start, next, badPtr, badLine, badCol); RETURN_CLEANUP(invalidToken); case XML_TOK_PARTIAL: - *badPtr = s; + setPosition(enc, start, s, badPtr, badLine, badCol); RETURN_CLEANUP(unclosedToken); case XML_TOK_PARTIAL_CHAR: - *badPtr = s; + setPosition(enc, start, s, 
badPtr, badLine, badCol); RETURN_CLEANUP(partialChar); case XML_TOK_EMPTY_ELEMENT_NO_ATTS: nElements++; @@ -87,7 +93,7 @@ wfCheck(const char *s, size_t n, const char **badPtr) int j; for (j = 0; j < i; j++) { if (XmlSameName(enc, atts[i], atts[j])) { - *badPtr = atts[i]; + setPosition(enc, start, atts[i], badPtr, badLine, badCol); RETURN_CLEANUP(duplicateAttribute); } } @@ -97,7 +103,7 @@ wfCheck(const char *s, size_t n, const char **badPtr) case XML_TOK_END_TAG: --level; if (!XmlSameName(enc, startName[level], s + enc->minBytesPerChar * 2)) { - *badPtr = s; + setPosition(enc, start, s, badPtr, badLine, badCol); RETURN_CLEANUP(tagMismatch); } break; @@ -116,7 +122,7 @@ wfCheck(const char *s, size_t n, const char **badPtr) break; default: if (tok > 0) { - *badPtr = s; + setPosition(enc, start, s, badPtr, badLine, badCol); RETURN_CLEANUP(junkAfterDocElement); } break; @@ -156,3 +162,16 @@ int skipProlog(const char **startp, const char *end, } /* not reached */ } + +static +void setPosition(const ENCODING *enc, + const char *start, const char *end, + const char **badPtr, unsigned long *badLine, unsigned long *badCol) +{ + POSITION pos; + memset(&pos, 0, sizeof(POSITION)); + XmlUpdatePosition(enc, start, end, &pos); + *badPtr = end; + *badLine = pos.lineNumber; + *badCol = pos.columnNumber; +} diff --git a/expat/xmlwf/wfcheck.h b/expat/xmlwf/wfcheck.h index 9afef67c..7b410455 100755 --- a/expat/xmlwf/wfcheck.h +++ b/expat/xmlwf/wfcheck.h @@ -13,5 +13,8 @@ enum WfCheckResult { junkAfterDocElement }; -enum WfCheckResult wfCheck(const char *s, size_t n, const char **badPtr); +enum WfCheckResult wfCheck(const char *s, size_t n, + const char **errorPtr, + unsigned long *errorLineNumber, + unsigned long *errorColNumber); diff --git a/expat/xmlwf/xmlwf.c b/expat/xmlwf/xmlwf.c index ab9c0e8d..9723dbda 100755 --- a/expat/xmlwf/xmlwf.c +++ b/expat/xmlwf/xmlwf.c @@ -10,8 +10,11 @@ int doFile(const char *name) HANDLE f; HANDLE m; DWORD size; + DWORD sizeHi; const char *p; - const 
char *bad = 0; + const char *badPtr = 0; + unsigned long badLine = 0; + unsigned long badCol = 0; int ret; enum WfCheckResult result; @@ -21,7 +24,16 @@ int doFile(const char *name) fprintf(stderr, "%s: CreateFile failed\n", name); return 0; } - size = GetFileSize(f, NULL); + size = GetFileSize(f, &sizeHi); + if (sizeHi) { + fprintf(stderr, "%s: too big (limit 2Gb)\n", name); + return 0; + } + /* CreateFileMapping barfs on zero length files */ + if (size == 0) { + fprintf(stderr, "%s: zero-length file\n", name); + return 0; + } m = CreateFileMapping(f, NULL, PAGE_READONLY, 0, 0, NULL); if (m == NULL) { fprintf(stderr, "%s: CreateFileMapping failed\n", name); @@ -35,21 +47,23 @@ int doFile(const char *name) fprintf(stderr, "%s: MapViewOfFile failed\n", name); return 0; } - result = wfCheck(p, size, &bad); + result = wfCheck(p, size, &badPtr, &badLine, &badCol); if (result) { static const char *message[] = { 0, "out of memory", "no element found", - "invalid token after %lu bytes", - "unclosed token started after %lu bytes", - "unclosed token started after %lu bytes", - "mismatched tag after %lu bytes", - "duplicate attribute after %lu bytes", - "junk after document element after %lu bytes", + "invalid token", + "unclosed token", + "unclosed token", + "mismatched tag", + "duplicate attribute", + "junk after document element", }; - fprintf(stderr, "%s: ", name); - fprintf(stderr, message[result], (unsigned long)(bad - p)); + fprintf(stderr, "%s:", name); + if (badPtr != 0) + fprintf(stderr, "%lu:%lu:", badLine+1, badCol); + fprintf(stderr, "E: %s", message[result]); putc('\n', stderr); ret = 1; }