From: Serhiy Storchaka Date: Mon, 10 Nov 2014 10:43:14 +0000 (+0200) Subject: Issue #12728: Different Unicode characters having the same uppercase but X-Git-Tag: v3.5.0a1~498 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=5619ab926b83b3ce8ed8f24d52bcedc96835cb35;p=python Issue #12728: Different Unicode characters having the same uppercase but different lowercase are now matched in case-insensitive regular expressions. --- 5619ab926b83b3ce8ed8f24d52bcedc96835cb35 diff --cc Lib/sre_compile.py index 7d8f775ac0,53baa0dc46..5ecd33a77b --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@@ -21,11 -22,51 +21,51 @@@ if _sre.CODESIZE == 2 else: MAXCODE = 0xFFFFFFFF -_LITERAL_CODES = set([LITERAL, NOT_LITERAL]) -_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT]) -_SUCCESS_CODES = set([SUCCESS, FAILURE]) -_ASSERT_CODES = set([ASSERT, ASSERT_NOT]) +_LITERAL_CODES = {LITERAL, NOT_LITERAL} +_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT} +_SUCCESS_CODES = {SUCCESS, FAILURE} +_ASSERT_CODES = {ASSERT, ASSERT_NOT} + # Sets of lowercase characters which have the same uppercase. + _equivalences = ( + # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I + (0x69, 0x131), # iı + # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S + (0x73, 0x17f), # sſ + # MICRO SIGN, GREEK SMALL LETTER MU + (0xb5, 0x3bc), # µμ + # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI + (0x345, 0x3b9, 0x1fbe), # \u0345ιι + # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA + (0x390, 0x1fd3), # ΐΐ + # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA + (0x3b0, 0x1fe3), # ΰΰ + # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL + (0x3b2, 0x3d0), # βϐ + # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL + (0x3b5, 0x3f5), # εϵ + # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL + (0x3b8, 0x3d1), # θϑ + # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL + (0x3ba, 0x3f0), # κϰ + # GREEK SMALL LETTER PI, GREEK PI SYMBOL + (0x3c0, 0x3d6), # πϖ + # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL + (0x3c1, 0x3f1), # ρϱ + # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA + (0x3c2, 0x3c3), # ςσ + # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL + (0x3c6, 0x3d5), # φϕ + # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE + (0x1e61, 0x1e9b), # ṡẛ + # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST + (0xfb05, 0xfb06), # ſtst + ) + + # Maps the lowercase code to lowercase codes which have the same uppercase. + _ignorecase_fixes = {i: tuple(j for j in t if i != j) + for t in _equivalences for i in t} + def _compile(code, pattern, flags): # internal: compile a (sub)pattern emit = code.append @@@ -37,10 -84,22 +83,22 @@@ for op, av in pattern: if op in LITERAL_CODES: if flags & SRE_FLAG_IGNORECASE: - emit(OP_IGNORE[op]) - emit(_sre.getlower(av, flags)) + lo = _sre.getlower(av, flags) + if fixes and lo in fixes: - emit(OPCODES[IN_IGNORE]) ++ emit(IN_IGNORE) + skip = _len(code); emit(0) + if op is NOT_LITERAL: - emit(OPCODES[NEGATE]) ++ emit(NEGATE) + for k in (lo,) + fixes[lo]: - emit(OPCODES[LITERAL]) ++ emit(LITERAL) + emit(k) - emit(OPCODES[FAILURE]) ++ emit(FAILURE) + code[skip] = _len(code) - skip + else: - emit(OPCODES[OP_IGNORE[op]]) ++ emit(OP_IGNORE[op]) + emit(lo) else: - emit(OPCODES[op]) + emit(op) emit(av) elif op is IN: if flags & SRE_FLAG_IGNORECASE: @@@ -48,10 -107,10 +106,10 @@@ def fixup(literal, flags=flags): return _sre.getlower(literal, flags) else: - emit(OPCODES[op]) + emit(op) fixup = None skip = _len(code); emit(0) - _compile_charset(av, flags, code, fixup) + _compile_charset(av, flags, code, fixup, fixes) code[skip] = _len(code) - skip elif op is ANY: if flags & SRE_FLAG_DOTALL: @@@ -165,11 -224,12 +223,11 @@@ else: raise ValueError("unsupported operand type", op) - def _compile_charset(charset, flags, code, fixup=None): + def _compile_charset(charset, flags, code, fixup=None, fixes=None): # compile charset subprogram emit = code.append - for op, av in _optimize_charset(charset, fixup): - for op, av in _optimize_charset(charset, fixup, fixes, - flags & SRE_FLAG_UNICODE): - emit(OPCODES[op]) ++ for op, av in _optimize_charset(charset, fixup, fixes): + emit(op) if op is NEGATE: pass elif op is LITERAL: @@@ -183,16 -243,16 +241,16 @@@ code.extend(av) elif op is CATEGORY: if flags & SRE_FLAG_LOCALE: - emit(CHCODES[CH_LOCALE[av]]) + emit(CH_LOCALE[av]) elif flags & SRE_FLAG_UNICODE: - emit(CHCODES[CH_UNICODE[av]]) + emit(CH_UNICODE[av]) else: - emit(CHCODES[av]) + emit(av) else: raise error("internal: unsupported set operator") - emit(OPCODES[FAILURE]) + emit(FAILURE) - def _optimize_charset(charset, fixup): -def _optimize_charset(charset, fixup, fixes, isunicode): ++def _optimize_charset(charset, fixup, fixes): # internal: optimize character set out = [] tail = [] @@@ -202,8 -262,13 +260,13 @@@ try: if op is LITERAL: if fixup: - av = fixup(av) - charmap[av] = 1 - i = fixup(av) - charmap[i] = 1 - if fixes and i in fixes: - for k in fixes[i]: ++ lo = fixup(av) ++ charmap[lo] = 1 ++ if fixes and lo in fixes: ++ for k in fixes[lo]: + charmap[k] = 1 + else: + charmap[av] = 1 elif op is RANGE: r = range(av[0], av[1]+1) if fixup: