From 748ac8799d53da6ebc247310852c2d8b832279ed Mon Sep 17 00:00:00 2001 From: Sebastian Pipping Date: Tue, 27 Aug 2019 18:46:00 +0200 Subject: [PATCH] xmltok: Add more in-code documentation about byte types --- expat/lib/xmltok.c | 6 ++-- expat/lib/xmltok_impl.h | 70 ++++++++++++++++++++--------------------- 2 files changed, 39 insertions(+), 37 deletions(-) diff --git a/expat/lib/xmltok.c b/expat/lib/xmltok.c index acc29651..45f1f16f 100644 --- a/expat/lib/xmltok.c +++ b/expat/lib/xmltok.c @@ -587,11 +587,13 @@ static const struct normal_encoding ascii_encoding static int PTRFASTCALL unicode_byte_type(char hi, char lo) { switch ((unsigned char)hi) { + /* 0xD800–0xDBFF first 16-bit code unit or high surrogate (W1) */ case 0xD8: case 0xD9: case 0xDA: case 0xDB: return BT_LEAD4; + /* 0xDC00–0xDFFF second 16-bit code unit or low surrogate (W2) */ case 0xDC: case 0xDD: case 0xDE: @@ -599,8 +601,8 @@ unicode_byte_type(char hi, char lo) { return BT_TRAIL; case 0xFF: switch ((unsigned char)lo) { - case 0xFF: - case 0xFE: + case 0xFF: /* noncharacter-FFFF */ + case 0xFE: /* noncharacter-FFFE */ return BT_NONXML; } break; diff --git a/expat/lib/xmltok_impl.h b/expat/lib/xmltok_impl.h index cb0f8878..e925dbc7 100644 --- a/expat/lib/xmltok_impl.h +++ b/expat/lib/xmltok_impl.h @@ -31,43 +31,43 @@ */ enum { - BT_NONXML, - BT_MALFORM, - BT_LT, - BT_AMP, - BT_RSQB, - BT_LEAD2, - BT_LEAD3, - BT_LEAD4, - BT_TRAIL, - BT_CR, - BT_LF, - BT_GT, - BT_QUOT, - BT_APOS, - BT_EQUALS, - BT_QUEST, - BT_EXCL, - BT_SOL, - BT_SEMI, - BT_NUM, - BT_LSQB, - BT_S, - BT_NMSTRT, - BT_COLON, - BT_HEX, - BT_DIGIT, - BT_NAME, - BT_MINUS, + BT_NONXML, /* e.g. noncharacter-FFFF */ + BT_MALFORM, /* illegal, with regard to encoding */ + BT_LT, /* less than = "<" */ + BT_AMP, /* ampersand = "&" */ + BT_RSQB, /* right square bracket = "[" */ + BT_LEAD2, /* lead byte of a 2-byte UTF-8 character */ + BT_LEAD3, /* lead byte of a 3-byte UTF-8 character */ + BT_LEAD4, /* lead byte of a 4-byte UTF-8 character */ + BT_TRAIL, /* trailing unit, e.g. second 16-bit unit of a 4-byte char. */ + BT_CR, /* carriage return = "\r" */ + BT_LF, /* line feed = "\n" */ + BT_GT, /* greater than = ">" */ + BT_QUOT, /* quotation character = "\"" */ + BT_APOS, /* aposthrophe = "'" */ + BT_EQUALS, /* equal sign = "=" */ + BT_QUEST, /* question mark = "?" */ + BT_EXCL, /* exclamation mark = "!" */ + BT_SOL, /* solidus, slash = "/" */ + BT_SEMI, /* semicolon = ";" */ + BT_NUM, /* number sign = "#" */ + BT_LSQB, /* left square bracket = "[" */ + BT_S, /* white space, e.g. "\t", " "[, "\r"] */ + BT_NMSTRT, /* non-hex name start letter = "G".."Z" + "g".."z" + "_" */ + BT_COLON, /* colon = ":" */ + BT_HEX, /* hex letter = "A".."F" + "a".."f" */ + BT_DIGIT, /* digit = "0".."9" */ + BT_NAME, /* dot and middle dot = "." + chr(0xb7) */ + BT_MINUS, /* minus = "-" */ BT_OTHER, /* known not to be a name or name start character */ BT_NONASCII, /* might be a name or name start character */ - BT_PERCNT, - BT_LPAR, - BT_RPAR, - BT_AST, - BT_PLUS, - BT_COMMA, - BT_VERBAR + BT_PERCNT, /* percent sign = "%" */ + BT_LPAR, /* left parenthesis = "(" */ + BT_RPAR, /* right parenthesis = "(" */ + BT_AST, /* asterisk = "*" */ + BT_PLUS, /* plus sign = "+" */ + BT_COMMA, /* comma = "," */ + BT_VERBAR /* vertical bar = "|" */ }; #include -- 2.40.0