]> granicus.if.org Git - python/commitdiff
Patch #534304: Implement phase 1 of PEP 263.
authorMartin v. Löwis <martin@v.loewis.de>
Sun, 4 Aug 2002 17:29:52 +0000 (17:29 +0000)
committerMartin v. Löwis <martin@v.loewis.de>
Sun, 4 Aug 2002 17:29:52 +0000 (17:29 +0000)
13 files changed:
Doc/ref/ref2.tex
Grammar/Grammar
Include/errcode.h
Include/graminit.h
Makefile.pre.in
Misc/NEWS
Parser/parsetok.c
Parser/tokenizer.c
Parser/tokenizer.h
Parser/tokenizer_pgen.c [new file with mode: 0644]
Python/compile.c
Python/graminit.c
Python/pythonrun.c

index e9fab58fd0d1da97792dcab00a2b624905756cb5..c8ecb4f6dbfad1db05ae070551e2aaa5cb8cef55 100644 (file)
@@ -7,11 +7,14 @@ chapter describes how the lexical analyzer breaks a file into tokens.
 \index{parser}
 \index{token}
 
-Python uses the 7-bit \ASCII{} character set for program text and string
-literals. 8-bit characters may be used in string literals and comments
-but their interpretation is platform dependent; the proper way to
-insert 8-bit characters in string literals is by using octal or
-hexadecimal escape sequences.
+Python uses the 7-bit \ASCII{} character set for program text.
+\versionadded[An encoding declaration can be used to indicate that 
+string literals and comments use an encoding different from ASCII.]{2.3}
+For compatibility with older versions, Python only warns if it finds
+8-bit characters; those warnings should be corrected by either declaring
+an explicit encoding, or using escape sequences if those bytes are binary
+data, instead of characters.
+
 
 The run-time character set depends on the I/O devices connected to the
 program but is generally a superset of \ASCII.
@@ -69,6 +72,37 @@ Comments are ignored by the syntax; they are not tokens.
 \index{hash character}
 
 
+\subsection{Encoding declarations\label{encodings}}
+
+If a comment in the first or second line of the Python script matches
+the regular expression "coding[=:]\s*([-\w.]+)", this comment is
+processed as an encoding declaration; the first group of this
+expression names the encoding of the source code file. The recommended
+forms of this expression are
+
+\begin{verbatim}
+# -*- coding: <encoding-name> -*-
+\end{verbatim}
+
+which is recognized also by GNU Emacs, and
+
+\begin{verbatim}
+# vim:fileencoding=<encoding-name>
+\end{verbatim}
+
+which is recognized by Bram Moolenaar's VIM. In addition, if the first
+bytes of the file are the UTF-8 signature ($'\xef\xbb\xbf'$), the
+declared file encoding is UTF-8 (this is supported, among others, by
+Microsoft's notepad.exe).
+
+If an encoding is declared, the encoding name must be recognized by
+Python. % XXX there should be a list of supported encodings.
+The encoding is used for all lexical analysis, in particular to find
+the end of a string, and to interpret the contents of Unicode literals.
+String literals are converted to Unicode for syntactical analysis,
+then converted back to their original encoding before interpretation
+starts.
+
 \subsection{Explicit line joining\label{explicit-joining}}
 
 Two or more physical lines may be joined into logical lines using
index 3f2b4ef1c60ed70dc5960a16df107ef74cb11323..e52a544bc8426a68362cbaeed13ce435b1a9d010 100644 (file)
@@ -102,3 +102,6 @@ list_for: 'for' exprlist 'in' testlist_safe [list_iter]
 list_if: 'if' test [list_iter]
 
 testlist1: test (',' test)*
+
+# not used in grammar, but may appear in "node" passed from Parser to Compiler
+encoding_decl: NAME
index daa702f7e177c518bd62529907f244b99ac21d53..a8b1aaab6f7a06e9f7d7230daca17e2d979e216f 100644 (file)
@@ -25,6 +25,7 @@ extern "C" {
 #define E_OVERFLOW      19     /* Node had too many children */
 #define E_TOODEEP      20      /* Too many indentation levels */
 #define E_DEDENT       21      /* No matching outer block for dedent */
+#define E_DECODE       22      /* Error in decoding into Unicode */
 
 #ifdef __cplusplus
 }
index 50abda0d3000a81bf781a1f40f1b5ade769e34b5..1f2ab3eadcc1777f6e8d53c681dc53cafec56b3f 100644 (file)
@@ -65,3 +65,4 @@
 #define list_for 320
 #define list_if 321
 #define testlist1 322
+#define encoding_decl 323
index 7e654789777e2103d92b2196a7b82e557ca35176..5ab5b146ddda7ccffe5751fd111eb86c66cd4879 100644 (file)
@@ -190,15 +190,15 @@ POBJS=            \
                Parser/node.o \
                Parser/parser.o \
                Parser/parsetok.o \
-               Parser/tokenizer.o \
                Parser/bitset.o \
                Parser/metagrammar.o
 
-PARSER_OBJS=   $(POBJS) Parser/myreadline.o
+PARSER_OBJS=   $(POBJS) Parser/myreadline.o Parser/tokenizer.o
 
 PGOBJS=                \
                Objects/obmalloc.o \
                Python/mysnprintf.o \
+               Parser/tokenizer_pgen.o \
                Parser/firstsets.o \
                Parser/grammar.o \
                Parser/pgen.o \
@@ -434,6 +434,8 @@ Parser/grammar.o:   $(srcdir)/Parser/grammar.c \
                                $(srcdir)/Include/grammar.h
 Parser/metagrammar.o:  $(srcdir)/Parser/metagrammar.c
 
+Parser/tokenizer_pgen.o:       $(srcdir)/Parser/tokenizer.c
+
 
 Python/compile.o Python/symtable.o: $(GRAMMAR_H)
 
index 77e8c62f52d8ad3c206987aa039c6cec94909af9..0f06ff450f4d5c8e9c5f015751400a861644d1d6 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -6,6 +6,8 @@ Type/class unification and new-style classes
 
 Core and builtins
 
+- Encoding declarations (PEP 263, phase 1) have been implemented.
+
 - list.sort() has a new implementation.  While cross-platform results
   may vary, and in data-dependent ways, this is much faster on many
   kinds of partially ordered lists than the previous implementation,
index 5758fa749ba1d4332a58420235bb13cf4b4b8148..d70e2d64f832659b007265b0af868689011bdac3 100644 (file)
@@ -8,6 +8,7 @@
 #include "parser.h"
 #include "parsetok.h"
 #include "errcode.h"
+#include "graminit.h"
 
 int Py_TabcheckFlag;
 
@@ -45,8 +46,8 @@ PyParser_ParseStringFlagsFilename(char *s, char *filename,
                return NULL;
        }
 
+        tok->filename = filename ? filename : "<string>";
        if (Py_TabcheckFlag || Py_VerboseFlag) {
-               tok->filename = filename ? filename : "<string>";
                tok->altwarning = (tok->filename != NULL);
                if (Py_TabcheckFlag >= 2)
                        tok->alterror++;
@@ -78,8 +79,8 @@ PyParser_ParseFileFlags(FILE *fp, char *filename, grammar *g, int start,
                err_ret->error = E_NOMEM;
                return NULL;
        }
+       tok->filename = filename;
        if (Py_TabcheckFlag || Py_VerboseFlag) {
-               tok->filename = filename;
                tok->altwarning = (filename != NULL);
                if (Py_TabcheckFlag >= 2)
                        tok->alterror++;
@@ -185,6 +186,13 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
                                err_ret->text[len] = '\0';
                        }
                }
+       } else if (tok->encoding != NULL) {
+               node* r = PyNode_New(encoding_decl);
+               r->n_str = tok->encoding;
+               r->n_nchildren = 1;
+               r->n_child = n;
+               tok->encoding = NULL;
+               n = r;
        }
 
        PyTokenizer_Free(tok);
index b4e0fbf7e58d9446e02d3f5af619b42b0191f2fa..fffc19fa276c32cb47a1b26e5067d34e9b66d877 100644 (file)
@@ -5,10 +5,19 @@
 #include "pgenheaders.h"
 
 #include <ctype.h>
+#include <assert.h>
 
 #include "tokenizer.h"
 #include "errcode.h"
 
+#ifndef PGEN
+#include "unicodeobject.h"
+#include "stringobject.h"
+#include "fileobject.h"
+#include "codecs.h"
+#include "abstract.h"
+#endif /* PGEN */
+
 extern char *PyOS_Readline(char *);
 /* Return malloc'ed string including trailing \n;
    empty malloc'ed string for EOF;
@@ -114,9 +123,416 @@ tok_new(void)
        tok->alterror = 0;
        tok->alttabsize = 1;
        tok->altindstack[0] = 0;
+       tok->decoding_state = 0;
+       tok->decoding_erred = 0;
+       tok->read_coding_spec = 0;
+       tok->issued_encoding_warning = 0;
+       tok->encoding = NULL;
+       tok->decoding_readline = NULL;
+       tok->decoding_buffer = NULL;
        return tok;
 }
 
+#ifdef PGEN
+
+static char *
+decoding_fgets(char *s, int size, struct tok_state *tok)
+{
+       return fgets(s, size, tok->fp);
+}
+
+static int
+decoding_feof(struct tok_state *tok)
+{
+       return feof(tok->fp);
+}
+
+static const char *
+decode_str(const char *str, struct tok_state *tok)
+{
+       return str;
+}
+
+#else /* PGEN */
+
+static char *
+error_ret(struct tok_state *tok) /* XXX */
+{
+       tok->decoding_erred = 1;
+       if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
+               PyMem_DEL(tok->buf);
+       tok->buf = NULL;
+       return NULL;            /* as if it were EOF */
+}
+
+static char *
+new_string(const char *s, int len)
+{
+       char* result = PyMem_NEW(char, len + 1);
+       if (result != NULL) {
+               memcpy(result, s, len);
+               result[len] = '\0';
+       }
+       return result;
+}
+
+static char *
+get_normal_name(char *s)       /* for utf-8 and latin-1 */
+{
+       char buf[13];
+       int i;
+       for (i = 0; i < 12; i++) {
+               int c = s[i];
+               if (c == '\0') break;
+               else if (c == '_') buf[i] = '-';
+               else buf[i] = tolower(c);
+       }
+       buf[i] = '\0';
+       if (strcmp(buf, "utf-8") == 0 ||
+           strncmp(buf, "utf-8-", 6) == 0) return "utf-8";
+       else if (strcmp(buf, "latin-1") == 0 ||
+                strcmp(buf, "iso-8859-1") == 0 ||
+                strcmp(buf, "iso-latin-1") == 0 ||
+                strncmp(buf, "latin-1-", 8) == 0 ||
+                strncmp(buf, "iso-8859-1-", 11) == 0 ||
+                strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1";
+       else return s;
+}
+
+/* Return the coding spec in S, or NULL if none is found.  */
+
+static char *
+get_coding_spec(const char *s, int size)
+{
+       int i;
+       for (i = 0; i < size - 6; i++) { /* XXX inefficient search */
+               const char* t = s + i;
+               if (strncmp(t, "coding", 6) == 0) {
+                       const char* begin = NULL;
+                       t += 6;
+                       if (t[0] != ':' && t[0] != '=')
+                               continue;
+                       do {
+                               t++;
+                       } while (t[0] == '\x20' || t[0] == '\t');
+
+                       begin = t;
+                       while (isalnum(t[0]) || t[0] == '-' || t[0] == '_' ||
+                              t[0] == '.')
+                               t++;
+
+                       if (begin < t) {
+                               char* r = new_string(begin, t - begin);
+                               char* q = get_normal_name(r);
+                               if (r != q) {
+                                       assert(strlen(r) >= strlen(q));
+                                       strcpy(r, q);
+                               }
+                               return r;
+                       }
+               }
+       }
+       return NULL;
+}
+
+/* Check whether the line contains a coding spec. If it does,
+   invoke the set_readline function for the new encoding.
+   This function receives the tok_state and the new encoding.
+   Return 1 on success, 0 on failure.  */
+
+static int
+check_coding_spec(const char* line, int size, struct tok_state *tok,
+                 int set_readline(struct tok_state *, const char *))
+{
+       int r = 1;
+       char* cs = get_coding_spec(line, size);
+       if (cs != NULL) {
+               tok->read_coding_spec = 1;
+               if (tok->encoding == NULL) {
+                       assert(tok->decoding_state == 1); /* raw */
+                       if (strcmp(cs, "utf-8") == 0 ||
+                           strcmp(cs, "iso-8859-1") == 0) {
+                               tok->encoding = cs;
+                       } else {
+                               r = set_readline(tok, cs);
+                               if (r) {
+                                       tok->encoding = cs;
+                                       tok->decoding_state = -1;
+                               }
+                       }
+               } else {        /* then, compare cs with BOM */
+                       r = (strcmp(tok->encoding, cs) == 0);
+                       PyMem_DEL(cs);
+               }
+       }
+       return r;
+}
+
+/* See whether the file starts with a BOM. If it does,
+   invoke the set_readline function with the new encoding.
+   Return 1 on success, 0 on failure.  */
+
+static int
+check_bom(int get_char(struct tok_state *),
+         void unget_char(int, struct tok_state *),
+         int set_readline(struct tok_state *, const char *),
+         struct tok_state *tok)
+{
+       int ch = get_char(tok);
+       tok->decoding_state = 1;
+       if (ch == EOF) {
+               return 1;
+       } else if (ch == 0xEF) {
+               ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
+               ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
+#if 0
+       /* Disable support for UTF-16 BOMs until a decision
+          is made whether this needs to be supported.  */
+       } else if (ch == 0xFE) {
+               ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
+               if (!set_readline(tok, "utf-16-be")) return 0;
+               tok->decoding_state = -1;
+       } else if (ch == 0xFF) {
+               ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
+               if (!set_readline(tok, "utf-16-le")) return 0;
+               tok->decoding_state = -1;
+#endif
+       } else {
+               unget_char(ch, tok);
+               return 1;
+       }
+       tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
+       return 1;
+  NON_BOM:
+       /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
+       unget_char(0xFF, tok);  /* XXX this will cause a syntax error */
+       return 1;
+}
+
+/* Read a line of text from TOK into S, using the stream in TOK.
+   Return NULL on failure, else S.  */
+
+static char *
+fp_readl(char *s, int size, struct tok_state *tok)
+{
+       PyObject* utf8;
+       PyObject* buf = tok->decoding_buffer;
+       if (buf == NULL) {
+               buf = PyObject_CallObject(tok->decoding_readline, NULL);
+               if (buf == NULL) return error_ret(tok);
+       } else {
+               tok->decoding_buffer = NULL;
+       }
+       utf8 = PyUnicode_AsUTF8String(buf);
+       Py_DECREF(buf);
+       if (utf8 == NULL) return error_ret(tok);
+       else {
+               const char* str = PyString_AsString(utf8);
+               assert(strlen(str) < size); /* XXX */
+               strcpy(s, str);
+               Py_DECREF(utf8);
+               if (s[0] == '\0') return NULL; /* EOF */
+               return s;
+       }
+}
+
+/* Set the readline function for TOK to a StreamReader's
+   readline function. The StreamReader is named ENC.
+
+   This function is called from check_bom and check_coding_spec.
+
+   ENC is usually identical to the future value of tok->encoding,
+   except for the (currently unsupported) case of UTF-16.
+
+   Return 1 on success, 0 on failure. */
+
+static int
+fp_setreadl(struct tok_state *tok, const char* enc)
+{
+       PyObject *reader, *stream, *readline;
+
+       stream = PyFile_FromFile(tok->fp, tok->filename, "rb", NULL);
+       if (stream == NULL) return 0;
+
+       reader = PyCodec_StreamReader(enc, stream, NULL);
+       Py_DECREF(stream);
+       if (reader == NULL) return 0;
+
+       readline = PyObject_GetAttrString(reader, "readline");
+       Py_DECREF(reader);
+       if (readline == NULL) return 0;
+
+       tok->decoding_readline = readline;
+       return 1;
+}
+
+/* Fetch the next byte from TOK. */
+
+static int fp_getc(struct tok_state *tok) {
+       return getc(tok->fp);
+}
+
+/* Unfetch the last byte back into TOK.  */
+
+static void fp_ungetc(int c, struct tok_state *tok) {
+       ungetc(c, tok->fp);
+}
+
+/* Read a line of input from TOK. Determine encoding
+   if necessary.  */
+
+static char *
+decoding_fgets(char *s, int size, struct tok_state *tok)
+{
+       char *line;
+       int warn = 0, badchar = 0;
+       for (;;)
+               if (tok->decoding_state < 0) {
+                       /* We already have a codec associated with
+                          this input. */
+                       line = fp_readl(s, size, tok);
+                       break;
+               } else if (tok->decoding_state > 0) {
+                       /* We want a 'raw' read. */
+                       line = Py_UniversalNewlineFgets(s, size, 
+                                                       tok->fp, NULL);
+                       warn = 1;
+                       break;
+               } else {
+                       /* We have not yet determined the encoding.
+                          If an encoding is found, use the file-pointer
+                          reader functions from now on. */
+                       if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
+                               return error_ret(tok);
+                       assert(tok->decoding_state != 0);
+               }
+       if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
+               if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
+                       return error_ret(tok);
+               }
+       }
+#ifndef PGEN
+       if (warn && line && !tok->issued_encoding_warning && !tok->encoding) {
+               unsigned char *c;
+               for (c = line; *c; c++)
+                       if (*c > 127) {
+                               badchar = *c;
+                               break;
+                       }
+       }
+       if (badchar) {
+               char buf[200];
+               sprintf(buf, "Non-ASCII character '\\x%.2x', "
+                       "but no declared encoding", badchar);
+               PyErr_WarnExplicit(PyExc_DeprecationWarning,
+                                  buf, tok->filename, tok->lineno, 
+                                  NULL, NULL);
+               tok->issued_encoding_warning = 1;
+       }
+#endif
+       return line;
+}
+
+static int
+decoding_feof(struct tok_state *tok)
+{
+       if (tok->decoding_state >= 0) {
+               return feof(tok->fp);
+       } else {
+               PyObject* buf = tok->decoding_buffer;
+               if (buf == NULL) {
+                       buf = PyObject_CallObject(tok->decoding_readline, NULL);
+                       if (buf == NULL) {
+                               error_ret(tok);
+                               return 1;
+                       } else {
+                               tok->decoding_buffer = buf;
+                       }
+               }
+               return PyObject_Length(buf) == 0;
+       }
+}
+
+/* Fetch a byte from TOK, using the string buffer. */
+
+static int buf_getc(struct tok_state *tok) {
+       return *tok->str++;
+}
+
+/* Unfetch a byte from TOK, using the string buffer. */
+
+static void buf_ungetc(int c, struct tok_state *tok) {
+       tok->str--;
+       assert(*tok->str == c); /* tok->cur may point to read-only segment */
+}
+
+/* Set the readline function for TOK to ENC. For the string-based
+   tokenizer, this means to just record the encoding. */
+
+static int buf_setreadl(struct tok_state *tok, const char* enc) {
+       tok->enc = enc;
+       return 1;
+}
+
+/* Return a UTF-8 encoding Python string object from the
+   C byte string STR, which is encoded with ENC. */
+
+static PyObject *
+translate_into_utf8(const char* str, const char* enc) {
+       PyObject *utf8;
+       PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
+       if (buf == NULL)
+               return NULL;
+       utf8 = PyUnicode_AsUTF8String(buf);
+       Py_DECREF(buf);
+       return utf8;
+}
+
+/* Decode a byte string STR for use as the buffer of TOK.
+   Look for encoding declarations inside STR, and record them
+   inside TOK.  */
+
+static const char *
+decode_str(const char *str, struct tok_state *tok)
+{
+       PyObject* utf8 = NULL;
+       const char *s;
+       int lineno = 0;
+       tok->enc = NULL;
+       tok->str = str;
+       if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
+               return NULL;
+       str = tok->str;         /* string after BOM if any */
+       assert(str);
+       if (tok->enc != NULL) {
+               utf8 = translate_into_utf8(str, tok->enc);
+               if (utf8 == NULL)
+                       return NULL;
+               str = PyString_AsString(utf8);
+       }
+       for (s = str;; s++) {
+               if (*s == '\0') break;
+               else if (*s == '\n') {
+                       lineno++;
+                       if (lineno == 2) break;
+               }
+       }
+       tok->enc = NULL;
+       if (!check_coding_spec(str, s - str, tok, buf_setreadl))
+               return NULL;
+       if (tok->enc != NULL) {
+               assert(utf8 == NULL);
+               utf8 = translate_into_utf8(str, tok->enc);
+               if (utf8 == NULL)
+                       return NULL;
+               str = PyString_AsString(utf8);
+       }
+       assert(tok->decoding_buffer == NULL);
+       tok->decoding_buffer = utf8; /* CAUTION */
+       return str;
+}
+
+#endif /* PGEN */
 
 /* Set up tokenizer for string */
 
@@ -126,6 +542,9 @@ PyTokenizer_FromString(char *str)
        struct tok_state *tok = tok_new();
        if (tok == NULL)
                return NULL;
+       str = (char *)decode_str(str, tok);
+       if (str == NULL)
+               return NULL;
        tok->buf = tok->cur = tok->end = tok->inp = str;
        return tok;
 }
@@ -157,6 +576,10 @@ PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
 void
 PyTokenizer_Free(struct tok_state *tok)
 {
+       if (tok->encoding != NULL)
+               PyMem_DEL(tok->encoding);
+       Py_XDECREF(tok->decoding_readline);
+       Py_XDECREF(tok->decoding_buffer);
        if (tok->fp != NULL && tok->buf != NULL)
                PyMem_DEL(tok->buf);
        PyMem_DEL(tok);
@@ -246,8 +669,8 @@ tok_nextc(register struct tok_state *tok)
                                        }
                                        tok->end = tok->buf + BUFSIZ;
                                }
-                               if (Py_UniversalNewlineFgets(tok->buf, (int)(tok->end - tok->buf),
-                                         tok->fp, NULL) == NULL) {
+                               if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
+                                         tok) == NULL) {
                                        tok->done = E_EOF;
                                        done = 1;
                                }
@@ -259,7 +682,7 @@ tok_nextc(register struct tok_state *tok)
                        }
                        else {
                                cur = tok->cur - tok->buf;
-                               if (feof(tok->fp)) {
+                               if (decoding_feof(tok)) {
                                        tok->done = E_EOF;
                                        done = 1;
                                }
@@ -285,9 +708,9 @@ tok_nextc(register struct tok_state *tok)
                                tok->end = tok->buf + newsize;
                                tok->start = curstart < 0 ? NULL :
                                             tok->buf + curstart;
-                               if (Py_UniversalNewlineFgets(tok->inp,
+                               if (decoding_fgets(tok->inp,
                                               (int)(tok->end - tok->inp),
-                                              tok->fp, NULL) == NULL) {
+                                              tok) == NULL) {
                                        /* Last line does not end in \n,
                                           fake one */
                                        strcpy(tok->inp, "\n");
@@ -506,9 +929,8 @@ indenterror(struct tok_state *tok)
 
 /* Get next token, after space stripping etc. */
 
-int
-PyTokenizer_Get(register struct tok_state *tok, char **p_start,
-               char **p_end)
+static int
+tok_get(register struct tok_state *tok, char **p_start, char **p_end)
 {
        register int c;
        int blankline;
@@ -915,6 +1337,16 @@ PyTokenizer_Get(register struct tok_state *tok, char **p_start,
        return PyToken_OneChar(c);
 }
 
+int
+PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
+{
+       int result = tok_get(tok, p_start, p_end);
+       if (tok->decoding_erred) {
+               result = ERRORTOKEN;
+               tok->done = E_DECODE;
+       }
+       return result;
+}
 
 #ifdef Py_DEBUG
 
index 8fded37ad998cbe17352458ec6739c5a66c29607..9782666fd82450dfcd2f9e197f7cd9cd2c9c20cd 100644 (file)
@@ -4,6 +4,7 @@
 extern "C" {
 #endif
 
+#include "object.h"
 
 /* Tokenizer interface */
 
@@ -38,6 +39,16 @@ struct tok_state {
        int alterror;   /* Issue error if alternate tabs don't match */
        int alttabsize; /* Alternate tab spacing */
        int altindstack[MAXINDENT];     /* Stack of alternate indents */
+       /* Stuff for PEP 0263 */
+       int decoding_state;     /* -1:decoding, 0:init, 1:raw */
+       int decoding_erred;     /* whether erred in decoding  */
+       int read_coding_spec;   /* whether 'coding:...' has been read  */
+       int issued_encoding_warning; /* whether non-ASCII warning was issued */
+       char *encoding;
+       PyObject *decoding_readline; /* codecs.open(...).readline */
+       PyObject *decoding_buffer;
+       const char* enc;
+       const char* str;
 };
 
 extern struct tok_state *PyTokenizer_FromString(char *);
diff --git a/Parser/tokenizer_pgen.c b/Parser/tokenizer_pgen.c
new file mode 100644 (file)
index 0000000..9cb8492
--- /dev/null
@@ -0,0 +1,2 @@
+#define PGEN
+#include "tokenizer.c"
index 3a0948e0c4ecbd6ea0034bad7a0c5fe48ccd7474..512b5a339dbb394f78230ddf3dd88a9e44eb9b8b 100644 (file)
@@ -485,6 +485,7 @@ struct compiling {
        int c_closure;          /* Is nested w/freevars? */
        struct symtable *c_symtable; /* pointer to module symbol table */
         PyFutureFeatures *c_future; /* pointer to module's __future__ */
+       char *c_encoding;       /* source encoding (a borrowed reference) */
 };
 
 static int
@@ -1181,6 +1182,23 @@ parsenumber(struct compiling *co, char *s)
        }
 }
 
+static PyObject *
+decode_utf8(char **sPtr, char *end, char* encoding)
+{
+       PyObject *u, *v;
+       char *s, *t;
+       t = s = *sPtr;
+       /* while (s < end && *s != '\\') s++; */ /* inefficient for u".." */
+       while (s < end && (*s & 0x80)) s++;
+       *sPtr = s;
+       u = PyUnicode_DecodeUTF8(t, s - t, NULL);
+       if (u == NULL)
+               return NULL;
+       v = PyUnicode_AsEncodedString(u, encoding, NULL);
+       Py_DECREF(u);
+       return v;
+}
+
 static PyObject *
 parsestr(struct compiling *com, char *s)
 {
@@ -1193,6 +1211,8 @@ parsestr(struct compiling *com, char *s)
        int first = *s;
        int quote = first;
        int rawmode = 0;
+       char* encoding = ((com == NULL) ? NULL : com->c_encoding);
+       int need_encoding;
        int unicode = 0;
 
        if (isalpha(quote) || quote == '_') {
@@ -1230,28 +1250,101 @@ parsestr(struct compiling *com, char *s)
        }
 #ifdef Py_USING_UNICODE
        if (unicode || Py_UnicodeFlag) {
+               PyObject *u, *w;
+               if (encoding == NULL) {
+                       buf = s;
+                       u = NULL;
+               } else if (strcmp(encoding, "iso-8859-1") == 0) {
+                       buf = s;
+                       u = NULL;
+               } else {
+                       /* "\XX" may become "\u005c\uHHLL" (12 bytes) */
+                       u = PyString_FromStringAndSize((char *)NULL, len * 4);
+                       if (u == NULL)
+                               return NULL;
+                       p = buf = PyString_AsString(u);
+                       end = s + len;
+                       while (s < end) {
+                               if (*s == '\\') {
+                                       *p++ = *s++;
+                                       if (*s & 0x80) {
+                                               strcpy(p, "u005c");
+                                               p += 5;
+                                       }
+                               }
+                               if (*s & 0x80) { /* XXX inefficient */
+                                       char *r;
+                                       int rn, i;
+                                       w = decode_utf8(&s, end, "utf-16-be");
+                                       if (w == NULL) {
+                                               Py_DECREF(u);
+                                               return NULL;
+                                       }
+                                       r = PyString_AsString(w);
+                                       rn = PyString_Size(w);
+                                       assert(rn % 2 == 0);
+                                       for (i = 0; i < rn; i += 2) {
+                                               sprintf(p, "\\u%02x%02x",
+                                                       r[i + 0] & 0xFF,
+                                                       r[i + 1] & 0xFF);
+                                               p += 6;
+                                       }
+                                       Py_DECREF(w);
+                               } else {
+                                       *p++ = *s++;
+                               }
+                       }
+                       len = p - buf;
+               }
                if (rawmode)
-                       v = PyUnicode_DecodeRawUnicodeEscape(
-                                s, len, NULL);
+                       v = PyUnicode_DecodeRawUnicodeEscape(buf, len, NULL);
                else
-                       v = PyUnicode_DecodeUnicodeEscape(
-                               s, len, NULL);
+                       v = PyUnicode_DecodeUnicodeEscape(buf, len, NULL);
+               Py_XDECREF(u);
                if (v == NULL)
                        PyErr_SyntaxLocation(com->c_filename, com->c_lineno);
                return v;
                        
        }
 #endif
-       if (rawmode || strchr(s, '\\') == NULL)
-               return PyString_FromStringAndSize(s, len);
-       v = PyString_FromStringAndSize((char *)NULL, len);
+       need_encoding = (encoding != NULL &&
+                        strcmp(encoding, "utf-8") != 0 &&
+                        strcmp(encoding, "iso-8859-1") != 0);
+       if (rawmode || strchr(s, '\\') == NULL) {
+               if (need_encoding) {
+                       PyObject* u = PyUnicode_DecodeUTF8(s, len, NULL);
+                       if (u == NULL)
+                               return NULL;
+                       v = PyUnicode_AsEncodedString(u, encoding, NULL);
+                       Py_DECREF(u);
+                       return v;
+               } else {
+                       return PyString_FromStringAndSize(s, len);
+               }
+       }
+       v = PyString_FromStringAndSize((char *)NULL, /* XXX 4 is enough? */
+                                      need_encoding ? len * 4 : len);
        if (v == NULL)
                return NULL;
        p = buf = PyString_AsString(v);
        end = s + len;
        while (s < end) {
                if (*s != '\\') {
-                       *p++ = *s++;
+                 ORDINAL: 
+                       if (need_encoding && (*s & 0x80)) {
+                               char *r;
+                               int rn;
+                               PyObject* w = decode_utf8(&s, end, encoding);
+                               if (w == NULL)
+                                       return NULL;
+                               r = PyString_AsString(w);
+                               rn = PyString_Size(w);
+                               memcpy(p, r, rn);
+                               p += rn;
+                               Py_DECREF(w);
+                       } else {
+                               *p++ = *s++;
+                       }
                        continue;
                }
                s++;
@@ -1320,8 +1413,8 @@ parsestr(struct compiling *com, char *s)
 #endif
                default:
                        *p++ = '\\';
-                       *p++ = s[-1];
-                       break;
+                       s--;
+                       goto ORDINAL;
                }
        }
        _PyString_Resize(&v, (int)(p - buf));
@@ -4149,6 +4242,12 @@ jcompile(node *n, char *filename, struct compiling *base,
        PyCodeObject *co;
        if (!com_init(&sc, filename))
                return NULL;
+       if (TYPE(n) == encoding_decl) {
+               sc.c_encoding = STR(n);
+               n = CHILD(n, 0);
+       } else {
+               sc.c_encoding = NULL;
+       }
        if (base) {
                sc.c_private = base->c_private;
                sc.c_symtable = base->c_symtable;
@@ -4157,6 +4256,10 @@ jcompile(node *n, char *filename, struct compiling *base,
                    || (sc.c_symtable->st_cur->ste_type == TYPE_FUNCTION))
                        sc.c_nested = 1;
                sc.c_flags |= base->c_flags & PyCF_MASK;
+               if (base->c_encoding != NULL) {
+                       assert(sc.c_encoding == NULL);
+                       sc.c_encoding = base->c_encoding;
+               }
        } else {
                sc.c_private = NULL;
                sc.c_future = PyNode_Future(n, filename);
index ef7d4677b5f370290b33a76a3250438d565d9d83..98bad94bc7c6bc832b2133c24bf235d5262e454f 100644 (file)
@@ -1463,7 +1463,17 @@ static state states_66[2] = {
        {1, arcs_66_0},
        {2, arcs_66_1},
 };
-static dfa dfas[67] = {
+static arc arcs_67_0[1] = {
+       {12, 1},
+};
+static arc arcs_67_1[1] = {
+       {0, 1},
+};
+static state states_67[2] = {
+       {1, arcs_67_0},
+       {1, arcs_67_1},
+};
+static dfa dfas[68] = {
        {256, "single_input", 0, 3, states_0,
         "\004\030\001\000\000\000\124\360\213\011\162\000\002\000\140\210\244\005\001"},
        {257, "file_input", 0, 2, states_1,
@@ -1598,8 +1608,10 @@ static dfa dfas[67] = {
         "\000\000\000\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\000"},
        {322, "testlist1", 0, 2, states_66,
         "\000\020\001\000\000\000\000\000\000\000\000\000\002\000\140\210\244\005\000"},
+       {323, "encoding_decl", 0, 2, states_67,
+        "\000\020\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"},
 };
-static label labels[148] = {
+static label labels[149] = {
        {0, "EMPTY"},
        {256, 0},
        {4, 0},
@@ -1748,10 +1760,11 @@ static label labels[148] = {
        {318, 0},
        {319, 0},
        {321, 0},
+       {323, 0},
 };
 grammar _PyParser_Grammar = {
-       67,
+       68,
        dfas,
-       {148, labels},
+       {149, labels},
        256
 };
index b1fde29bd285bee530a92b24997cd32270df434b..006ff083d252746b5f73fd526885afeda5b9c11d 100644 (file)
@@ -1221,6 +1221,7 @@ static void
 err_input(perrdetail *err)
 {
        PyObject *v, *w, *errtype;
+       PyObject* u = NULL;
        char *msg = NULL;
        errtype = PyExc_SyntaxError;
        v = Py_BuildValue("(ziiz)", err->filename,
@@ -1272,12 +1273,24 @@ err_input(perrdetail *err)
                errtype = PyExc_IndentationError;
                msg = "too many levels of indentation";
                break;
+       case E_DECODE: {        /* XXX */
+               PyThreadState* tstate = PyThreadState_Get();
+               PyObject* value = tstate->curexc_value;
+               if (value != NULL) {
+                       u = PyObject_Repr(value);
+                       if (u != NULL) {
+                               msg = PyString_AsString(u);
+                               break;
+                       }
+               }
+       }
        default:
                fprintf(stderr, "error=%d\n", err->error);
                msg = "unknown parsing error";
                break;
        }
        w = Py_BuildValue("(sO)", msg, v);
+       Py_XDECREF(u);
        Py_XDECREF(v);
        PyErr_SetObject(errtype, w);
        Py_XDECREF(w);