granicus.if.org Git - python/commitdiff
Issue #12728: Different Unicode characters having the same uppercase but
author Serhiy Storchaka <storchaka@gmail.com>
Mon, 10 Nov 2014 10:43:14 +0000 (12:43 +0200)
committer Serhiy Storchaka <storchaka@gmail.com>
Mon, 10 Nov 2014 10:43:14 +0000 (12:43 +0200)
different lowercase are now matched in case-insensitive regular expressions.

1  2 
Lib/sre_compile.py
Lib/test/test_re.py
Misc/NEWS

index 7d8f775ac0ad6cd6df7e4ad279232b7d7aea3576,53baa0dc468a8a0f985558e5183447d7e1c7e0da..5ecd33a77bbdfff1ef147972c278bf5e658a3d36
@@@ -21,11 -22,51 +21,51 @@@ if _sre.CODESIZE == 2
  else:
      MAXCODE = 0xFFFFFFFF
  
 -_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
 -_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
 -_SUCCESS_CODES = set([SUCCESS, FAILURE])
 -_ASSERT_CODES = set([ASSERT, ASSERT_NOT])
 +_LITERAL_CODES = {LITERAL, NOT_LITERAL}
 +_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
 +_SUCCESS_CODES = {SUCCESS, FAILURE}
 +_ASSERT_CODES = {ASSERT, ASSERT_NOT}
  
+ # Sets of lowercase characters which have the same uppercase.
+ _equivalences = (
+     # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
+     (0x69, 0x131), # iı
+     # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
+     (0x73, 0x17f), # sſ
+     # MICRO SIGN, GREEK SMALL LETTER MU
+     (0xb5, 0x3bc), # µμ
+     # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
+     (0x345, 0x3b9, 0x1fbe), # \u0345ιι
+     # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
+     (0x390, 0x1fd3), # ΐΐ
+     # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
+     (0x3b0, 0x1fe3), # ΰΰ
+     # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
+     (0x3b2, 0x3d0), # βϐ
+     # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
+     (0x3b5, 0x3f5), # εϵ
+     # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
+     (0x3b8, 0x3d1), # θϑ
+     # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
+     (0x3ba, 0x3f0), # κϰ
+     # GREEK SMALL LETTER PI, GREEK PI SYMBOL
+     (0x3c0, 0x3d6), # πϖ
+     # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
+     (0x3c1, 0x3f1), # ρϱ
+     # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
+     (0x3c2, 0x3c3), # ςσ
+     # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
+     (0x3c6, 0x3d5), # φϕ
+     # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
+     (0x1e61, 0x1e9b), # ṡẛ
+     # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
+     (0xfb05, 0xfb06), # ﬅﬆ
+ )
+ # Maps the lowercase code to lowercase codes which have the same uppercase.
+ _ignorecase_fixes = {i: tuple(j for j in t if i != j)
+                      for t in _equivalences for i in t}
  def _compile(code, pattern, flags):
      # internal: compile a (sub)pattern
      emit = code.append
      for op, av in pattern:
          if op in LITERAL_CODES:
              if flags & SRE_FLAG_IGNORECASE:
-                 emit(OP_IGNORE[op])
-                 emit(_sre.getlower(av, flags))
+                 lo = _sre.getlower(av, flags)
+                 if fixes and lo in fixes:
 -                    emit(OPCODES[IN_IGNORE])
++                    emit(IN_IGNORE)
+                     skip = _len(code); emit(0)
+                     if op is NOT_LITERAL:
 -                        emit(OPCODES[NEGATE])
++                        emit(NEGATE)
+                     for k in (lo,) + fixes[lo]:
 -                        emit(OPCODES[LITERAL])
++                        emit(LITERAL)
+                         emit(k)
 -                    emit(OPCODES[FAILURE])
++                    emit(FAILURE)
+                     code[skip] = _len(code) - skip
+                 else:
 -                    emit(OPCODES[OP_IGNORE[op]])
++                    emit(OP_IGNORE[op])
+                     emit(lo)
              else:
 -                emit(OPCODES[op])
 +                emit(op)
                  emit(av)
          elif op is IN:
              if flags & SRE_FLAG_IGNORECASE:
                  def fixup(literal, flags=flags):
                      return _sre.getlower(literal, flags)
              else:
 -                emit(OPCODES[op])
 +                emit(op)
                  fixup = None
              skip = _len(code); emit(0)
-             _compile_charset(av, flags, code, fixup)
+             _compile_charset(av, flags, code, fixup, fixes)
              code[skip] = _len(code) - skip
          elif op is ANY:
              if flags & SRE_FLAG_DOTALL:
          else:
              raise ValueError("unsupported operand type", op)
  
- def _compile_charset(charset, flags, code, fixup=None):
+ def _compile_charset(charset, flags, code, fixup=None, fixes=None):
      # compile charset subprogram
      emit = code.append
-     for op, av in _optimize_charset(charset, fixup):
 -    for op, av in _optimize_charset(charset, fixup, fixes,
 -                                    flags & SRE_FLAG_UNICODE):
 -        emit(OPCODES[op])
++    for op, av in _optimize_charset(charset, fixup, fixes):
 +        emit(op)
          if op is NEGATE:
              pass
          elif op is LITERAL:
              code.extend(av)
          elif op is CATEGORY:
              if flags & SRE_FLAG_LOCALE:
 -                emit(CHCODES[CH_LOCALE[av]])
 +                emit(CH_LOCALE[av])
              elif flags & SRE_FLAG_UNICODE:
 -                emit(CHCODES[CH_UNICODE[av]])
 +                emit(CH_UNICODE[av])
              else:
 -                emit(CHCODES[av])
 +                emit(av)
          else:
              raise error("internal: unsupported set operator")
 -    emit(OPCODES[FAILURE])
 +    emit(FAILURE)
  
- def _optimize_charset(charset, fixup):
 -def _optimize_charset(charset, fixup, fixes, isunicode):
++def _optimize_charset(charset, fixup, fixes):
      # internal: optimize character set
      out = []
      tail = []
              try:
                  if op is LITERAL:
                      if fixup:
-                         av = fixup(av)
-                     charmap[av] = 1
 -                        i = fixup(av)
 -                        charmap[i] = 1
 -                        if fixes and i in fixes:
 -                            for k in fixes[i]:
++                        lo = fixup(av)
++                        charmap[lo] = 1
++                        if fixes and lo in fixes:
++                            for k in fixes[lo]:
+                                 charmap[k] = 1
+                     else:
+                         charmap[av] = 1
                  elif op is RANGE:
                      r = range(av[0], av[1]+1)
                      if fixup:
Simple merge
diff --cc Misc/NEWS
Simple merge