granicus.if.org Git - python/commitdiff
bpo-30215: Make re.compile() locale agnostic. (#1361)
author: Serhiy Storchaka <storchaka@gmail.com>
Fri, 5 May 2017 05:53:40 +0000 (08:53 +0300)
committer: GitHub <noreply@github.com>
Fri, 5 May 2017 05:53:40 +0000 (08:53 +0300)
Compiled regular expression objects with the re.LOCALE flag no longer
depend on the locale at compile time.  Only the locale at matching
time affects the result of matching.

Doc/library/re.rst
Lib/re.py
Lib/sre_compile.py
Lib/sre_constants.py
Lib/test/test_re.py
Misc/NEWS
Modules/_sre.c
Modules/sre_constants.h
Modules/sre_lib.h

index 0fa7196148dee0fb8c88e643e256be6d0551e657..131f3722c3fa16b91e56fc074693822d7678db09 100644 (file)
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -559,6 +559,11 @@ form.
       :const:`re.LOCALE` can be used only with bytes patterns and is
       not compatible with :const:`re.ASCII`.
 
+   .. versionchanged:: 3.7
+      Compiled regular expression objects with the :const:`re.LOCALE` flag no
+      longer depend on the locale at compile time.  Only the locale at
+      matching time affects the result of matching.
+
 
 .. data:: M
           MULTILINE
index 7053eddbe027e67ac8e0f4232a15c683a6ad250b..d0ee5db175b5faeae16ebebf0c467d9370b9f016 100644 (file)
--- a/Lib/re.py
+++ b/Lib/re.py
@@ -268,9 +268,7 @@ _MAXCACHE = 512
 def _compile(pattern, flags):
     # internal: compile pattern
     try:
-        p, loc = _cache[type(pattern), pattern, flags]
-        if loc is None or loc == _locale.setlocale(_locale.LC_CTYPE):
-            return p
+        return _cache[type(pattern), pattern, flags]
     except KeyError:
         pass
     if isinstance(pattern, _pattern_type):
@@ -284,13 +282,7 @@ def _compile(pattern, flags):
     if not (flags & DEBUG):
         if len(_cache) >= _MAXCACHE:
             _cache.clear()
-        if p.flags & LOCALE:
-            if not _locale:
-                return p
-            loc = _locale.setlocale(_locale.LC_CTYPE)
-        else:
-            loc = None
-        _cache[type(pattern), pattern, flags] = p, loc
+        _cache[type(pattern), pattern, flags] = p
     return p
 
 @functools.lru_cache(_MAXCACHE)
index 2cc39007ac71288d8deb620a65b8d21c32aa6524..d7ee4e8cb6e4ce99e241a3a3a5950741decd6dfa 100644 (file)
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -78,7 +78,13 @@ def _compile(code, pattern, flags):
         fixes = None
     for op, av in pattern:
         if op in LITERAL_CODES:
-            if flags & SRE_FLAG_IGNORECASE:
+            if not flags & SRE_FLAG_IGNORECASE:
+                emit(op)
+                emit(av)
+            elif flags & SRE_FLAG_LOCALE:
+                emit(OP_LOC_IGNORE[op])
+                emit(av)
+            else:
                 lo = _sre.getlower(av, flags)
                 if fixes and lo in fixes:
                     emit(IN_IGNORE)
@@ -93,17 +99,17 @@ def _compile(code, pattern, flags):
                 else:
                     emit(OP_IGNORE[op])
                     emit(lo)
-            else:
-                emit(op)
-                emit(av)
         elif op is IN:
-            if flags & SRE_FLAG_IGNORECASE:
-                emit(OP_IGNORE[op])
-                def fixup(literal, flags=flags):
-                    return _sre.getlower(literal, flags)
-            else:
+            if not flags & SRE_FLAG_IGNORECASE:
                 emit(op)
                 fixup = None
+            elif flags & SRE_FLAG_LOCALE:
+                emit(IN_LOC_IGNORE)
+                fixup = None
+            else:
+                emit(IN_IGNORE)
+                def fixup(literal, flags=flags):
+                    return _sre.getlower(literal, flags)
             skip = _len(code); emit(0)
             _compile_charset(av, flags, code, fixup, fixes)
             code[skip] = _len(code) - skip
index fc684ae96fd30a67ca129628663d769912e0a9d7..b0164312d0357e66608710a97705fbc6824eeb43 100644 (file)
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -13,7 +13,7 @@
 
 # update when constants are added or removed
 
-MAGIC = 20140917
+MAGIC = 20170530
 
 from _sre import MAXREPEAT, MAXGROUPS
 
@@ -87,6 +87,9 @@ OPCODES = _makecodes("""
     SUBPATTERN
     MIN_REPEAT_ONE
     RANGE_IGNORE
+    LITERAL_LOC_IGNORE
+    NOT_LITERAL_LOC_IGNORE
+    IN_LOC_IGNORE
 
     MIN_REPEAT MAX_REPEAT
 """)
@@ -124,6 +127,11 @@ OP_IGNORE = {
     RANGE: RANGE_IGNORE,
 }
 
+OP_LOC_IGNORE = {
+    LITERAL: LITERAL_LOC_IGNORE,
+    NOT_LITERAL: NOT_LITERAL_LOC_IGNORE,
+}
+
 AT_MULTILINE = {
     AT_BEGINNING: AT_BEGINNING_LINE,
     AT_END: AT_END_LINE
index da5c953ced0cd9000f96e7751b65e3e03868ed7f..7601dc88c7afaed3ed1945c8b87e7b7e9f222a80 100644 (file)
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -1730,6 +1730,38 @@ SUBPATTERN None 0 0
         self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
         self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
 
+    def test_locale_compiled(self):
+        oldlocale = locale.setlocale(locale.LC_CTYPE)
+        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
+        for loc in 'en_US.iso88591', 'en_US.utf8':
+            try:
+                locale.setlocale(locale.LC_CTYPE, loc)
+            except locale.Error:
+                # Unsupported locale on this system
+                self.skipTest('test needs %s locale' % loc)
+
+        locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
+        p1 = re.compile(b'\xc5\xe5', re.L|re.I)
+        p2 = re.compile(b'[a\xc5][a\xe5]', re.L|re.I)
+        p3 = re.compile(b'[az\xc5][az\xe5]', re.L|re.I)
+        p4 = re.compile(b'[^\xc5][^\xe5]', re.L|re.I)
+        for p in p1, p2, p3:
+            self.assertTrue(p.match(b'\xc5\xe5'))
+            self.assertTrue(p.match(b'\xe5\xe5'))
+            self.assertTrue(p.match(b'\xc5\xc5'))
+        self.assertIsNone(p4.match(b'\xe5\xc5'))
+        self.assertIsNone(p4.match(b'\xe5\xe5'))
+        self.assertIsNone(p4.match(b'\xc5\xc5'))
+
+        locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
+        for p in p1, p2, p3:
+            self.assertTrue(p.match(b'\xc5\xe5'))
+            self.assertIsNone(p.match(b'\xe5\xe5'))
+            self.assertIsNone(p.match(b'\xc5\xc5'))
+        self.assertTrue(p4.match(b'\xe5\xc5'))
+        self.assertIsNone(p4.match(b'\xe5\xe5'))
+        self.assertIsNone(p4.match(b'\xc5\xc5'))
+
     def test_error(self):
         with self.assertRaises(re.error) as cm:
             re.compile('(\u20ac))')
index d76c76be8d74d26f67da2a4ff6c771d2d33fa7be..f2c1994ccf6939ede1734cac01b451840527607d 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -317,6 +317,10 @@ Extension Modules
 Library
 -------
 
+- bpo-30215: Compiled regular expression objects with the re.LOCALE flag no
+  longer depend on the locale at compile time.  Only the locale at matching
+  time affects the result of matching.
+
 - bpo-30185: Avoid KeyboardInterrupt tracebacks in forkserver helper process
   when Ctrl-C is received.
 
index 03a138ee0150272126355fde44a1a396ff5eed59..afb2bce77b073486afefa90c12dcdb6369f31d63 100644 (file)
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -1588,6 +1588,8 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
         case SRE_OP_NOT_LITERAL:
         case SRE_OP_LITERAL_IGNORE:
         case SRE_OP_NOT_LITERAL_IGNORE:
+        case SRE_OP_LITERAL_LOC_IGNORE:
+        case SRE_OP_NOT_LITERAL_LOC_IGNORE:
             GET_ARG;
             /* The arg is just a character, nothing to check */
             break;
@@ -1625,6 +1627,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
 
         case SRE_OP_IN:
         case SRE_OP_IN_IGNORE:
+        case SRE_OP_IN_LOC_IGNORE:
             GET_SKIP;
             /* Stop 1 before the end; we check the FAILURE below */
             if (!_validate_charset(code, code+skip-2))
index 6632442efe5698bcfe91d4202632ae9f4de35e89..6d6d21efd04bc2c660c5dd5775760a3d1d2c30d8 100644 (file)
--- a/Modules/sre_constants.h
+++ b/Modules/sre_constants.h
@@ -11,7 +11,7 @@
  * See the _sre.c file for information on usage and redistribution.
  */
 
-#define SRE_MAGIC 20140917
+#define SRE_MAGIC 20170530
 #define SRE_OP_FAILURE 0
 #define SRE_OP_SUCCESS 1
 #define SRE_OP_ANY 2
@@ -45,6 +45,9 @@
 #define SRE_OP_SUBPATTERN 30
 #define SRE_OP_MIN_REPEAT_ONE 31
 #define SRE_OP_RANGE_IGNORE 32
+#define SRE_OP_LITERAL_LOC_IGNORE 33
+#define SRE_OP_NOT_LITERAL_LOC_IGNORE 34
+#define SRE_OP_IN_LOC_IGNORE 35
 #define SRE_AT_BEGINNING 0
 #define SRE_AT_BEGINNING_LINE 1
 #define SRE_AT_BEGINNING_STRING 2
index 0865fc63a004d57daba2347470ca2f09d0f17d34..b540d219dde20bab54c12aa5dde848ad50f1f242 100644 (file)
--- a/Modules/sre_lib.h
+++ b/Modules/sre_lib.h
@@ -100,6 +100,14 @@ SRE(at)(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
     return 0;
 }
 
+LOCAL(int)
+SRE(char_loc_ignore)(SRE_STATE* state, SRE_CODE pattern, SRE_CODE ch)
+{
+    return ch == pattern
+        || (SRE_CODE) state->lower(ch) == pattern
+        || (SRE_CODE) state->upper(ch) == pattern;
+}
+
 LOCAL(int)
 SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
 {
@@ -187,6 +195,18 @@ SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
     }
 }
 
+LOCAL(int)
+SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
+{
+    SRE_CODE lo, up;
+    lo = state->lower(ch);
+    if (SRE(charset)(state, set, lo))
+       return 1;
+
+    up = state->upper(ch);
+    return up != lo && SRE(charset)(state, set, up);
+}
+
 LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all);
 
 LOCAL(Py_ssize_t)
@@ -247,6 +267,14 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
             ptr++;
         break;
 
+    case SRE_OP_LITERAL_LOC_IGNORE:
+        /* repeated literal */
+        chr = pattern[1];
+        TRACE(("|%p|%p|COUNT LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
+        while (ptr < end && SRE(char_loc_ignore)(state, chr, *ptr))
+            ptr++;
+        break;
+
     case SRE_OP_NOT_LITERAL:
         /* repeated non-literal */
         chr = pattern[1];
@@ -269,6 +297,14 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
             ptr++;
         break;
 
+    case SRE_OP_NOT_LITERAL_LOC_IGNORE:
+        /* repeated non-literal */
+        chr = pattern[1];
+        TRACE(("|%p|%p|COUNT NOT_LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
+        while (ptr < end && !SRE(char_loc_ignore)(state, chr, *ptr))
+            ptr++;
+        break;
+
     default:
         /* repeated single character pattern */
         TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
@@ -651,7 +687,17 @@ entrance:
             TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
                    ctx->pattern, ctx->ptr, ctx->pattern[0]));
             if (ctx->ptr >= end ||
-                state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
+                state->lower(*ctx->ptr) != *ctx->pattern)
+                RETURN_FAILURE;
+            ctx->pattern++;
+            ctx->ptr++;
+            break;
+
+        case SRE_OP_LITERAL_LOC_IGNORE:
+            TRACE(("|%p|%p|LITERAL_LOC_IGNORE %d\n",
+                   ctx->pattern, ctx->ptr, ctx->pattern[0]));
+            if (ctx->ptr >= end
+                || !SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr))
                 RETURN_FAILURE;
             ctx->pattern++;
             ctx->ptr++;
@@ -661,7 +707,17 @@ entrance:
             TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
                    ctx->pattern, ctx->ptr, *ctx->pattern));
             if (ctx->ptr >= end ||
-                state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
+                state->lower(*ctx->ptr) == *ctx->pattern)
+                RETURN_FAILURE;
+            ctx->pattern++;
+            ctx->ptr++;
+            break;
+
+        case SRE_OP_NOT_LITERAL_LOC_IGNORE:
+            TRACE(("|%p|%p|NOT_LITERAL_LOC_IGNORE %d\n",
+                   ctx->pattern, ctx->ptr, *ctx->pattern));
+            if (ctx->ptr >= end
+                || SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr))
                 RETURN_FAILURE;
             ctx->pattern++;
             ctx->ptr++;
@@ -677,6 +733,15 @@ entrance:
             ctx->ptr++;
             break;
 
+        case SRE_OP_IN_LOC_IGNORE:
+            TRACE(("|%p|%p|IN_LOC_IGNORE\n", ctx->pattern, ctx->ptr));
+            if (ctx->ptr >= end
+                || !SRE(charset_loc_ignore)(state, ctx->pattern+1, *ctx->ptr))
+                RETURN_FAILURE;
+            ctx->pattern += ctx->pattern[0];
+            ctx->ptr++;
+            break;
+
         case SRE_OP_JUMP:
         case SRE_OP_INFO:
             /* jump forward */