Add methods for getting attributes and comparing names.

author James Clark <jjc@jclark.com>

Tue, 11 Nov 1997 05:53:20 +0000 (05:53 +0000)

committer James Clark <jjc@jclark.com>

Tue, 11 Nov 1997 05:53:20 +0000 (05:53 +0000)
author James Clark <jjc@jclark.com>
Tue, 11 Nov 1997 05:53:20 +0000 (05:53 +0000)
committer James Clark <jjc@jclark.com>
Tue, 11 Nov 1997 05:53:20 +0000 (05:53 +0000)
diff --git a/expat/xmltok/xmltok.c b/expat/xmltok/xmltok.c

index f1a0d3518176c3623d4fb9d35e3af15e492fc4ac..b0693c85853cb4846be8e04be25e4f663d466cbe 100755 (executable)
--- a/expat/xmltok/xmltok.c
+++ b/expat/xmltok/xmltok.c
@@ -1,5 +1,12 @@
  /* TODO
  
+method to get name length
+method to extract attribute names (returns number of atts)
+size_t getAttributes(const char *ptr, const char *end, const char **atts, size_t maxAtts)
+
+
+Provide method to count lines/columns.
+
  Provide methods to convert to any of UTF-8, UTF-18, UCS-4.
  
  Better prolog tokenization
@@ -9,6 +16,8 @@ NMTOKEN
  NAME
  PEREF
  
+MatchEndTag(endTagStart, endTagEnd, startTagPtr, startTagEnd)
+
  */
  
  #ifdef _MSC_VER
@@ -78,7 +87,7 @@ struct normal_encoding {
  #undef IS_NMSTRT_CHAR
  
  const struct normal_encoding utf8_encoding = {
-  { { PREFIX(prologTok), PREFIX(contentTok) }, 1 },
+  { { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 1 },
  #include "asciitab.h"
  #include "utf8tab.h"
  };
@@ -127,7 +136,7 @@ static int unicode_byte_type(char hi, char lo)
  #undef IS_NMSTRT_CHAR
  
  const struct encoding little2_encoding = {
- { PREFIX(prologTok), PREFIX(contentTok) }, 2
+ { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 2
  };
  
  #undef PREFIX
@@ -152,7 +161,7 @@ const struct encoding little2_encoding = {
  #undef IS_NMSTRT_CHAR
  
  const struct encoding big2_encoding = {
- { PREFIX(prologTok), PREFIX(contentTok) }, 2
+ { PREFIX(prologTok), PREFIX(contentTok) }, PREFIX(sameName), PREFIX(getAtts), 2
  };
  
  #undef PREFIX
diff --git a/expat/xmltok/xmltok.h b/expat/xmltok/xmltok.h

index 511251e7ec59e4c4bd11305b352812bbca10c5b1..3583e06854ba8f5bd69e6cfdc926ce896ebb0a8f 100755 (executable)
--- a/expat/xmltok/xmltok.h
+++ b/expat/xmltok/xmltok.h
@@ -14,28 +14,32 @@ extern "C" {
  #define XML_TOK_PARTIAL_CHAR -2 /* only part of a multibyte sequence */
  #define XML_TOK_PARTIAL -1 /* only part of a token */
  #define XML_TOK_INVALID 0
-#define XML_TOK_BOM 1     /* Byte order mark */
-#define XML_TOK_COMMENT 2
-#define XML_TOK_PI 3      /* processing instruction */
-
-/* The following tokens are returned only by XmlPrologTok */
-#define XML_TOK_LITERAL 4
-#define XML_TOK_PROLOG_CHARS 5
-#define XML_TOK_PROLOG_S 6
  
  /* The following token is returned by XmlPrologTok when it detects the end
  of the prolog and is also returned by XmlContentTok */
  
-#define XML_TOK_START_TAG 7
+#define XML_TOK_START_TAG_WITH_ATTS 1
+#define XML_TOK_START_TAG_NO_ATTS 2
+#define XML_TOK_EMPTY_ELEMENT_WITH_ATTS 3 /* empty element tag <e/> */
+#define XML_TOK_EMPTY_ELEMENT_NO_ATTS 4
  
  /* The following tokens are returned only by XmlContentTok */
  
-#define XML_TOK_END_TAG 8
-#define XML_TOK_EMPTY_ELEMENT 9 /* empty element tag <e/> */
-#define XML_TOK_DATA_CHARS 10
-#define XML_TOK_CDATA_SECTION 11
-#define XML_TOK_ENTITY_REF 12
-#define XML_TOK_CHAR_REF 13     /* numeric character reference */
+#define XML_TOK_END_TAG 5
+#define XML_TOK_DATA_CHARS 6
+#define XML_TOK_CDATA_SECTION 7
+#define XML_TOK_ENTITY_REF 8
+#define XML_TOK_CHAR_REF 9     /* numeric character reference */
+
+/* The following tokens may be returned by both XmlPrologTok and XmlContentTok */
+#define XML_TOK_PI 10      /* processing instruction */
+#define XML_TOK_COMMENT 11
+#define XML_TOK_BOM 12     /* Byte order mark */
+
+/* The following tokens are returned only by XmlPrologTok */
+#define XML_TOK_LITERAL 13
+#define XML_TOK_PROLOG_CHARS 14
+#define XML_TOK_PROLOG_S 15
  
  #define XML_NSTATES 2
  #define XML_PROLOG_STATE 0
@@ -46,6 +50,10 @@ typedef struct encoding {
                                const char *,
                                const char *,
                                const char **);
+  int (*sameName)(const struct encoding *,
+                 const char *, const char *);
+  int (*getAtts)(const struct encoding *enc, const char *ptr,
+                int attsMax, const char **atts);
    int minBytesPerChar;
  } ENCODING;
  
@@ -79,6 +87,11 @@ literals, comments and processing instructions.
  #define XmlContentTok(enc, ptr, end, nextTokPtr) \
     XmlTok(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr)
  
+#define XmlSameName(enc, ptr1, ptr2) (((enc)->sameName)(enc, ptr1, ptr2)) /* calls enc's sameName member; enc is evaluated twice */
+/* Calls the getAtts member of enc, passing enc itself as the first argument; enc is evaluated twice. */
+#define XmlGetAttributes(enc, ptr, attsMax, atts) \
+  (((enc)->getAtts)(enc, ptr, attsMax, atts))
+
  typedef struct {
    ENCODING initEnc;
    const ENCODING **encPtr;
diff --git a/expat/xmltok/xmltok_impl.c b/expat/xmltok/xmltok_impl.c

index efb0f380252ab1ee19c834152c9e654abfb31c43..a71d76c4c0097bea3ade3826b478cb2cf1abad33 100755 (executable)
--- a/expat/xmltok/xmltok_impl.c
+++ b/expat/xmltok/xmltok_impl.c
@@ -453,7 +453,7 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
             continue;
           case BT_GT:
             *nextTokPtr = ptr + MINBPC;
-           return XML_TOK_START_TAG;
+           return XML_TOK_START_TAG_WITH_ATTS;
           case BT_SOL:
             ptr += MINBPC;
             if (ptr == end)
@@ -463,7 +463,7 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
               return XML_TOK_INVALID;
             }
             *nextTokPtr = ptr + MINBPC;
-           return XML_TOK_EMPTY_ELEMENT;
+           return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
           default:
             *nextTokPtr = ptr;
             return XML_TOK_INVALID;
@@ -537,7 +537,7 @@ int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
      case BT_GT:
      gt:
        *nextTokPtr = ptr + MINBPC;
-      return XML_TOK_START_TAG;
+      return XML_TOK_START_TAG_NO_ATTS;
      case BT_SOL:
      sol:
        ptr += MINBPC;
@@ -548,7 +548,7 @@ int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
         return XML_TOK_INVALID;
        }
        *nextTokPtr = ptr + MINBPC;
-      return XML_TOK_EMPTY_ELEMENT;
+      return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
@@ -728,6 +728,120 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
    return XML_TOK_PROLOG_CHARS;
  }
  
+/* This must only be called for a well-formed start-tag or empty element tag.
+Returns the number of attributes.  Pointers to the names of up to the first
+attsMax attributes are stored in atts. */
+static
+int PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
+                   int attsMax, const char **atts)
+{
+  enum { other, inName, inValue } state = inName;
+  int nAtts = 0;
+  int open;
+
+  for (ptr += MINBPC;; ptr += MINBPC) {
+    switch (BYTE_TYPE(enc, ptr)) {
+#define START_NAME \
+      if (state == other) { \
+       if (nAtts < attsMax) \
+         atts[nAtts] = ptr; \
+       ++nAtts; \
+       state = inName; \
+      }
+#define LEAD_CASE(n) \
+    case BT_LEAD ## n: START_NAME ptr += (n - MINBPC); break;
+    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) LEAD_CASE(5) LEAD_CASE(6)
+#undef LEAD_CASE
+    case BT_NONASCII:
+    case BT_NMSTRT:
+    case BT_HEX:
+      START_NAME
+      break;
+#undef START_NAME
+    case BT_QUOT:
+      if (state == other) {
+        state = inValue;
+        open = BT_QUOT;
+      }
+      else if (open == BT_QUOT)
+        state = other;
+      break;
+    case BT_APOS:
+      if (state == other) {
+        state = inValue;
+        open = BT_APOS;
+      }
+      else if (open == BT_APOS)
+        state = other;
+      break;
+    case BT_S:
+      /* This case ensures that the first attribute name is counted
+         Apart from that we could just change state on the quote. */
+      if (state == inName)
+        state = other;
+      break;
+    case BT_GT:
+    case BT_SOL:
+      if (state != inValue)
+       return nAtts;
+      break;
+    default:
+      break;
+    }
+  }
+  /* not reached */
+}
+
+/* Returns 1 if the names starting at ptr1 and ptr2 are the same, 0 otherwise.
+A name runs up to the first character whose byte type is not a name type.
+All bytes of each character are compared, not only the first one
+(MINBPC is assumed to be 1 or 2). */
+static
+int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
+{
+  for (;;) {
+    switch (BYTE_TYPE(enc, ptr1)) {
+#define LEAD_CASE(n) \
+    case BT_LEAD ## n: \
+      if (*ptr1++ != *ptr2++) \
+       return 0;
+    LEAD_CASE(6) LEAD_CASE(5) LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
+#undef LEAD_CASE
+      /* each LEAD_CASE falls through to the next, so n bytes get compared */
+      if (*ptr1++ != *ptr2++)
+       return 0;
+      break;
+    case BT_NONASCII:
+    case BT_NMSTRT:
+    case BT_HEX:
+    case BT_DIGIT:
+    case BT_NAME:
+    case BT_MINUS:
+      /* compare the whole character, not just its first byte */
+      if (ptr1[0] != ptr2[0] || (MINBPC > 1 && ptr1[1] != ptr2[1]))
+       return 0;
+      ptr1 += MINBPC;
+      ptr2 += MINBPC;
+      break;
+    default:
+      /* The name at ptr1 has ended.  Equal first bytes prove that the name at
+         ptr2 has ended too only when a character is a single byte wide. */
+      if (MINBPC == 1 && *ptr1 == *ptr2)
+       return 1;
+      /* otherwise the names are the same only if the name at ptr2 has ended too */
+      switch (BYTE_TYPE(enc, ptr2)) {
+      case BT_LEAD2: case BT_LEAD3: case BT_LEAD4: case BT_LEAD5: case BT_LEAD6:
+      case BT_NONASCII: case BT_NMSTRT: case BT_HEX:
+      case BT_DIGIT: case BT_NAME: case BT_MINUS:
+       return 0;
+      default:
+       return 1;
+      }
+    }
+  }
+  /* not reached */
+}
+
  #undef DO_LEAD_CASE
  #undef MULTIBYTE_CASES
  #undef INVALID_CASES
author	James Clark <jjc@jclark.com>
	Tue, 11 Nov 1997 05:53:20 +0000 (05:53 +0000)
committer	James Clark <jjc@jclark.com>
	Tue, 11 Nov 1997 05:53:20 +0000 (05:53 +0000)
expat/xmltok/xmltok.c		patch \| blob \| history
expat/xmltok/xmltok.h		patch \| blob \| history
expat/xmltok/xmltok_impl.c		patch \| blob \| history