Issue #15096: Drop support for the ur string prefix

author Christian Heimes <c.heimes@cheimes.de>

Wed, 20 Jun 2012 09:17:58 +0000 (11:17 +0200)

committer Christian Heimes <c.heimes@cheimes.de>

Wed, 20 Jun 2012 09:17:58 +0000 (11:17 +0200)
author Christian Heimes <c.heimes@cheimes.de>
Wed, 20 Jun 2012 09:17:58 +0000 (11:17 +0200)
committer Christian Heimes <c.heimes@cheimes.de>
Wed, 20 Jun 2012 09:17:58 +0000 (11:17 +0200)
diff --git a/Doc/reference/lexical_analysis.rst b/Doc/reference/lexical_analysis.rst

index c94a47f0897026d66131654389a9a5180581197f..5e5903f7a92589a5ddd9e5182f00ae26ae9795ac 100644 (file)
--- a/Doc/reference/lexical_analysis.rst
+++ b/Doc/reference/lexical_analysis.rst
@@ -401,7 +401,7 @@ String literals are described by the following lexical definitions:
  
  .. productionlist::
     stringliteral: [`stringprefix`](`shortstring` | `longstring`)
-   stringprefix: "r" | "u" | "ur" | "R" | "U" | "UR" | "Ur" | "uR"
+   stringprefix: "r" | "u" | "R" | "U"
     shortstring: "'" `shortstringitem`* "'" | '"' `shortstringitem`* '"'
     longstring: "'''" `longstringitem`* "'''" | '"""' `longstringitem`* '"""'
     shortstringitem: `shortstringchar` | `stringescapeseq`
@@ -444,19 +444,21 @@ must be expressed with escapes.
  As of Python 3.3 it is possible again to prefix unicode strings with a
  ``u`` prefix to simplify maintenance of dual 2.x and 3.x codebases.
  
-Both string and bytes literals may optionally be prefixed with a letter ``'r'``
+Both string and bytes literals may optionally be prefixed with a letter ``'r'``
  or ``'R'``; such strings are called :dfn:`raw strings` and treat backslashes as
  literal characters.  As a result, in string literals, ``'\U'`` and ``'\u'``
-escapes in raw strings are not treated specially.
+escapes in raw strings are not treated specially. Given that Python 2.x's raw
+unicode literals behave differently than Python 3.x's, the ``'ur'`` syntax
+is not supported.
  
     .. versionadded:: 3.3
        The ``'rb'`` prefix of raw bytes literals has been added as a synonym
        of ``'br'``.
  
     .. versionadded:: 3.3
-      Support for the unicode legacy literal (``u'value'``) and other
-      versions were reintroduced to simplify the maintenance of dual
-      Python 2.x and 3.x codebases.  See :pep:`414` for more information.
+      Support for the unicode legacy literal (``u'value'``) was reintroduced
+      to simplify the maintenance of dual Python 2.x and 3.x codebases.
+      See :pep:`414` for more information.
  
  In triple-quoted strings, unescaped newlines and quotes are allowed (and are
  retained), except that three unescaped quotes in a row terminate the string.  (A
diff --git a/Lib/test/test_strlit.py b/Lib/test/test_strlit.py

index 1f041c80abe2c85dd1fe39a0c4d970097f298720..07bc48880a98b979f02ef8befba5c8110bed45ca 100644 (file)
--- a/Lib/test/test_strlit.py
+++ b/Lib/test/test_strlit.py
@@ -123,6 +123,15 @@ class TestLiterals(unittest.TestCase):
          self.assertRaises(SyntaxError, eval, """ rrb'' """)
          self.assertRaises(SyntaxError, eval, """ rbb'' """)
  
+    def test_eval_str_u(self):
+        self.assertEqual(eval(""" u'x' """), 'x')
+        self.assertEqual(eval(""" U'\u00e4' """), 'ä')
+        self.assertEqual(eval(""" u'\N{LATIN SMALL LETTER A WITH DIAERESIS}' """), 'ä')
+        self.assertRaises(SyntaxError, eval, """ ur'' """)
+        self.assertRaises(SyntaxError, eval, """ ru'' """)
+        self.assertRaises(SyntaxError, eval, """ bu'' """)
+        self.assertRaises(SyntaxError, eval, """ ub'' """)
+
      def check_encoding(self, encoding, extra=""):
          modname = "xx_" + encoding.replace("-", "_")
          fn = os.path.join(self.tmpdir, modname + ".py")
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py

index 4c2e4e2b6778d2d827e4f8cb40983190bac963b6..4e798d789f6419e6814a4f9ff61c9a959c665bfc 100644 (file)
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -299,24 +299,6 @@ String literals
      STRING     'u"abc"'      (1, 0) (1, 6)
      OP         '+'           (1, 7) (1, 8)
      STRING     'U"abc"'      (1, 9) (1, 15)
-    >>> dump_tokens("ur'abc' + uR'abc' + Ur'abc' + UR'abc'")
-    ENCODING   'utf-8'       (0, 0) (0, 0)
-    STRING     "ur'abc'"     (1, 0) (1, 7)
-    OP         '+'           (1, 8) (1, 9)
-    STRING     "uR'abc'"     (1, 10) (1, 17)
-    OP         '+'           (1, 18) (1, 19)
-    STRING     "Ur'abc'"     (1, 20) (1, 27)
-    OP         '+'           (1, 28) (1, 29)
-    STRING     "UR'abc'"     (1, 30) (1, 37)
-    >>> dump_tokens('ur"abc" + uR"abc" + Ur"abc" + UR"abc"')
-    ENCODING   'utf-8'       (0, 0) (0, 0)
-    STRING     'ur"abc"'     (1, 0) (1, 7)
-    OP         '+'           (1, 8) (1, 9)
-    STRING     'uR"abc"'     (1, 10) (1, 17)
-    OP         '+'           (1, 18) (1, 19)
-    STRING     'Ur"abc"'     (1, 20) (1, 27)
-    OP         '+'           (1, 28) (1, 29)
-    STRING     'UR"abc"'     (1, 30) (1, 37)
  
      >>> dump_tokens("b'abc' + B'abc'")
      ENCODING   'utf-8'       (0, 0) (0, 0)
@@ -642,7 +624,7 @@ Non-ascii identifiers
  
  Legacy unicode literals:
  
-    >>> dump_tokens("Örter = u'places'\\ngrün = UR'green'")
+    >>> dump_tokens("Örter = u'places'\\ngrün = U'green'")
      ENCODING   'utf-8'       (0, 0) (0, 0)
      NAME       'Örter'       (1, 0) (1, 5)
      OP         '='           (1, 6) (1, 7)
@@ -650,7 +632,7 @@ Legacy unicode literals:
      NEWLINE    '\\n'          (1, 17) (1, 18)
      NAME       'grün'        (2, 0) (2, 4)
      OP         '='           (2, 5) (2, 6)
-    STRING     "UR'green'"   (2, 7) (2, 16)
+    STRING     "U'green'"    (2, 7) (2, 15)
  """
  
  from test import support
diff --git a/Lib/tokenize.py b/Lib/tokenize.py

index e41cd6eea42fd4f54e704cabd839529f54b309da..0a53435583a56bd69171d505bd7580f7d8588389 100644 (file)
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -127,7 +127,7 @@ Floatnumber = group(Pointfloat, Expfloat)
  Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
  Number = group(Imagnumber, Floatnumber, Intnumber)
  
-StringPrefix = r'(?:[uUbB][rR]?|[rR][bB]?)?'
+StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'
  
  # Tail end of ' string.
  Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
@@ -183,12 +183,8 @@ endpats = {"'": Single, '"': Double,
             "rB'''": Single3, 'rB"""': Double3,
             "RB'''": Single3, 'RB"""': Double3,
             "u'''": Single3, 'u"""': Double3,
-           "ur'''": Single3, 'ur"""': Double3,
             "R'''": Single3, 'R"""': Double3,
             "U'''": Single3, 'U"""': Double3,
-           "uR'''": Single3, 'uR"""': Double3,
-           "Ur'''": Single3, 'Ur"""': Double3,
-           "UR'''": Single3, 'UR"""': Double3,
             'r': None, 'R': None, 'b': None, 'B': None,
             'u': None, 'U': None}
  
@@ -201,8 +197,7 @@ for t in ("'''", '"""',
            "rb'''", 'rb"""', "rB'''", 'rB"""',
            "Rb'''", 'Rb"""', "RB'''", 'RB"""',
            "u'''", 'u"""', "U'''", 'U"""',
-          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
-          "uR'''", 'uR"""', "UR'''", 'UR"""'):
+          ):
      triple_quoted[t] = t
  single_quoted = {}
  for t in ("'", '"',
@@ -213,8 +208,7 @@ for t in ("'", '"',
            "rb'", 'rb"', "rB'", 'rB"',
            "Rb'", 'Rb"', "RB'", 'RB"' ,
            "u'", 'u"', "U'", 'U"',
-          "ur'", 'ur"', "Ur'", 'Ur"',
-          "uR'", 'uR"', "UR'", 'UR"' ):
+          ):
      single_quoted[t] = t
  
  tabsize = 8
diff --git a/Misc/NEWS b/Misc/NEWS

index bcf28bd173763e9f11d39e1402e8997387721980..2f9236ff36b8d1eece7eb3c8af3bf5b0c230d03f 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,9 @@ What's New in Python 3.3.0 Beta 1?
  Core and Builtins
  -----------------
  
+- Issue #15096: Removed support for ur'' as the raw notation isn't
+  compatible with Python 2.x's raw unicode strings.
+
  - Issue #13783: Generator objects now use the identifier APIs internally
  
  - Issue #14874: Restore charmap decoding speed to pre-PEP 393 levels.
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c

index 36ca0791cf100fc2559353b63e16e7f4e63ead89..93a4a5ccb479010c8eaeaba22ebab3374c8db4e7 100644 (file)
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1412,7 +1412,7 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end)
      /* Identifier (most frequent token!) */
      nonascii = 0;
      if (is_potential_identifier_start(c)) {
-        /* Process b"", r"", u"", br"", rb"" and ur"" */
+        /* Process b"", r"", u"", br"" and rb"" */
          int saw_b = 0, saw_r = 0, saw_u = 0;
          while (1) {
              if (!(saw_b || saw_u) && (c == 'b' || c == 'B'))
@@ -1421,7 +1421,8 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end)
                 want to support it in arbitrary order like byte literals. */
              else if (!(saw_b || saw_u || saw_r) && (c == 'u' || c == 'U'))
                  saw_u = 1;
-            else if (!saw_r && (c == 'r' || c == 'R'))
+            /* ur"" and ru"" are not supported */
+            else if (!(saw_r || saw_u) && (c == 'r' || c == 'R'))
                  saw_r = 1;
              else
                  break;
author	Christian Heimes <c.heimes@cheimes.de>
	Wed, 20 Jun 2012 09:17:58 +0000 (11:17 +0200)
committer	Christian Heimes <c.heimes@cheimes.de>
	Wed, 20 Jun 2012 09:17:58 +0000 (11:17 +0200)
Doc/reference/lexical_analysis.rst		patch \| blob \| history
Lib/test/test_strlit.py		patch \| blob \| history
Lib/test/test_tokenize.py		patch \| blob \| history
Lib/tokenize.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history
Parser/tokenizer.c		patch \| blob \| history