Issue 2986: Add autojunk paramater to SequenceMatcher to turn off heuristic. Patch...

author Terry Reedy <tjreedy@udel.edu>

Thu, 25 Nov 2010 06:12:34 +0000 (06:12 +0000)

committer Terry Reedy <tjreedy@udel.edu>

Thu, 25 Nov 2010 06:12:34 +0000 (06:12 +0000)
author Terry Reedy <tjreedy@udel.edu>
Thu, 25 Nov 2010 06:12:34 +0000 (06:12 +0000)
committer Terry Reedy <tjreedy@udel.edu>
Thu, 25 Nov 2010 06:12:34 +0000 (06:12 +0000)
diff --git a/Doc/library/difflib.rst b/Doc/library/difflib.rst

index 58bbe4509c3ddcac6630639356ad2a3ab3bb2e04..c7efa967b00c97ec437ac035100a6c470013967f 100644 (file)
--- a/Doc/library/difflib.rst
+++ b/Doc/library/difflib.rst
@@ -17,6 +17,7 @@ can be used for example, for comparing files, and can produce difference
  information in various formats, including HTML and context and unified
  diffs. For comparing directories and files, see also, the :mod:`filecmp` module.
  
+
  .. class:: SequenceMatcher
  
     This is a flexible class for comparing pairs of sequences of any type, so long
@@ -35,6 +36,14 @@ diffs. For comparing directories and files, see also, the :mod:`filecmp` module.
     complicated way on how many elements the sequences have in common; best case
     time is linear.
  
+   **Automatic junk heuristic:** :class:`SequenceMatcher` supports a heuristic that
+   automatically treats certain sequence items as junk. The heuristic counts how many
+   times each individual item appears in the sequence. If an item's duplicates (after
+   the first one) account for more than 1% of the sequence and the sequence is at least
+   200 items long, this item is marked as "popular" and is treated as junk for
+   the purpose of sequence matching. This heuristic can be turned off by setting
+   the ``autojunk`` argument to ``False`` when creating the :class:`SequenceMatcher`.
+
  
  .. class:: Differ
  
@@ -324,7 +333,7 @@ SequenceMatcher Objects
  The :class:`SequenceMatcher` class has this constructor:
  
  
-.. class:: SequenceMatcher(isjunk=None, a='', b='')
+.. class:: SequenceMatcher(isjunk=None, a='', b='', autojunk=True)
  
     Optional argument *isjunk* must be ``None`` (the default) or a one-argument
     function that takes a sequence element and returns true if and only if the
@@ -340,6 +349,9 @@ The :class:`SequenceMatcher` class has this constructor:
     The optional arguments *a* and *b* are sequences to be compared; both default to
     empty strings.  The elements of both sequences must be :term:`hashable`.
  
+   The optional argument *autojunk* can be used to disable the automatic junk
+   heuristic.
+
     :class:`SequenceMatcher` objects have the following methods:
  
  
diff --git a/Lib/difflib.py b/Lib/difflib.py

index 92d58fab80ae01b4bc8549cabecc0d4b9cbdfd07..fa8c287d4714cf1b093560f55c52cbc34a729471 100644 (file)
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -150,7 +150,7 @@ class SequenceMatcher:
          Return an upper bound on ratio() very quickly.
      """
  
-    def __init__(self, isjunk=None, a='', b=''):
+    def __init__(self, isjunk=None, a='', b='', autojunk=True):
          """Construct a SequenceMatcher.
  
          Optional arg isjunk is None (the default), or a one-argument
@@ -168,6 +168,10 @@ class SequenceMatcher:
          Optional arg b is the second of two sequences to be compared.  By
          default, an empty string.  The elements of b must be hashable. See
          also .set_seqs() and .set_seq2().
+
+        Optional arg autojunk should be set to False to disable the
+        "automatic junk heuristic" that treats popular elements as junk
+        (see module documentation for more information).
          """
  
          # Members:
@@ -206,11 +210,13 @@ class SequenceMatcher:
          #      DOES NOT WORK for x in a!
          # isbpopular
          #      for x in b, isbpopular(x) is true iff b is reasonably long
-        #      (at least 200 elements) and x accounts for more than 1% of
-        #      its elements.  DOES NOT WORK for x in a!
+        #      (at least 200 elements) and x accounts for more than 1 + 1% of
+        #      its elements (when autojunk is enabled).
+        #      DOES NOT WORK for x in a!
  
          self.isjunk = isjunk
          self.a = self.b = None
+        self.autojunk = autojunk
          self.set_seqs(a, b)
  
      def set_seqs(self, a, b):
@@ -287,7 +293,7 @@ class SequenceMatcher:
      # from starting any matching block at a junk element ...
      # also creates the fast isbjunk function ...
      # b2j also does not contain entries for "popular" elements, meaning
-    # elements that account for more than 1% of the total elements, and
+    # elements that account for more than 1 + 1% of the total elements, and
      # when the sequence is reasonably large (>= 200 elements); this can
      # be viewed as an adaptive notion of semi-junk, and yields an enormous
      # speedup when, e.g., comparing program files with hundreds of
@@ -308,44 +314,37 @@ class SequenceMatcher:
          # out the junk later is much cheaper than building b2j "right"
          # from the start.
          b = self.b
-        n = len(b)
          self.b2j = b2j = {}
-        populardict = {}
-        for i, elt in enumerate(b):
-            if elt in b2j:
-                indices = b2j[elt]
-                if n >= 200 and len(indices) * 100 > n:
-                    populardict[elt] = 1
-                    del indices[:]
-                else:
-                    indices.append(i)
-            else:
-                b2j[elt] = [i]
  
-        # Purge leftover indices for popular elements.
-        for elt in populardict:
-            del b2j[elt]
+        for i, elt in enumerate(b):
+            indices = b2j.setdefault(elt, [])
+            indices.append(i)
  
-        # Now b2j.keys() contains elements uniquely, and especially when
-        # the sequence is a string, that's usually a good deal smaller
-        # than len(string).  The difference is the number of isjunk calls
-        # saved.
+        # Purge junk elements
+        junk = set()
          isjunk = self.isjunk
-        junkdict = {}
          if isjunk:
-            for d in populardict, b2j:
-                for elt in list(d.keys()):
-                    if isjunk(elt):
-                        junkdict[elt] = 1
-                        del d[elt]
-
-        # Now for x in b, isjunk(x) == x in junkdict, but the
-        # latter is much faster.  Note too that while there may be a
-        # lot of junk in the sequence, the number of *unique* junk
-        # elements is probably small.  So the memory burden of keeping
-        # this dict alive is likely trivial compared to the size of b2j.
-        self.isbjunk = junkdict.__contains__
-        self.isbpopular = populardict.__contains__
+            for elt in list(b2j.keys()):  # using list() since b2j is modified
+                if isjunk(elt):
+                    junk.add(elt)
+                    del b2j[elt]
+
+        # Purge popular elements that are not junk
+        popular = set()
+        n = len(b)
+        if self.autojunk and n >= 200:
+            ntest = n // 100 + 1
+            for elt, idxs in list(b2j.items()):
+                if len(idxs) > ntest:
+                    popular.add(elt)
+                    del b2j[elt]
+
+        # Now for x in b, isjunk(x) == x in junk, but the latter is much faster.
+        # Since the number of *unique* junk elements is probably small, the
+        # memory burden of keeping this set alive is likely trivial compared to
+        # the size of b2j.
+        self.isbjunk = junk.__contains__
+        self.isbpopular = popular.__contains__
  
      def find_longest_match(self, alo, ahi, blo, bhi):
          """Find longest matching block in a[alo:ahi] and b[blo:bhi].
diff --git a/Lib/test/test_difflib.py b/Lib/test/test_difflib.py

index 6c9bde61a4a5f27f16b4be8fd25011964b9d3907..e72df26c9ad15a535443731462b21de3fa1dc4f8 100644 (file)
--- a/Lib/test/test_difflib.py
+++ b/Lib/test/test_difflib.py
@@ -4,8 +4,47 @@ import unittest
  import doctest
  import sys
  
-class TestSFbugs(unittest.TestCase):
  
+class TestWithAscii(unittest.TestCase):
+    def test_one_insert(self):
+        sm = difflib.SequenceMatcher(None, 'b' * 100, 'a' + 'b' * 100)
+        self.assertAlmostEqual(sm.ratio(), 0.995, places=3)
+        self.assertEqual(list(sm.get_opcodes()),
+            [   ('insert', 0, 0, 0, 1),
+                ('equal', 0, 100, 1, 101)])
+        sm = difflib.SequenceMatcher(None, 'b' * 100, 'b' * 50 + 'a' + 'b' * 50)
+        self.assertAlmostEqual(sm.ratio(), 0.995, places=3)
+        self.assertEqual(list(sm.get_opcodes()),
+            [   ('equal', 0, 50, 0, 50),
+                ('insert', 50, 50, 50, 51),
+                ('equal', 50, 100, 51, 101)])
+
+    def test_one_delete(self):
+        sm = difflib.SequenceMatcher(None, 'a' * 40 + 'c' + 'b' * 40, 'a' * 40 + 'b' * 40)
+        self.assertAlmostEqual(sm.ratio(), 0.994, places=3)
+        self.assertEqual(list(sm.get_opcodes()),
+            [   ('equal', 0, 40, 0, 40),
+                ('delete', 40, 41, 40, 40),
+                ('equal', 41, 81, 40, 80)])
+
+
+class TestAutojunk(unittest.TestCase):
+    """Tests for the autojunk parameter added in 2.7"""
+    def test_one_insert_homogenous_sequence(self):
+        # By default autojunk=True and the heuristic kicks in for a sequence
+        # of length 200+
+        seq1 = 'b' * 200
+        seq2 = 'a' + 'b' * 200
+
+        sm = difflib.SequenceMatcher(None, seq1, seq2)
+        self.assertAlmostEqual(sm.ratio(), 0, places=3)
+
+        # Now turn the heuristic off
+        sm = difflib.SequenceMatcher(None, seq1, seq2, autojunk=False)
+        self.assertAlmostEqual(sm.ratio(), 0.9975, places=3)
+
+
+class TestSFbugs(unittest.TestCase):
      def test_ratio_for_null_seqn(self):
          # Check clearing of SF bug 763023
          s = difflib.SequenceMatcher(None, [], [])
@@ -184,7 +223,9 @@ class TestOutputFormat(unittest.TestCase):
  def test_main():
      difflib.HtmlDiff._default_prefix = 0
      Doctests = doctest.DocTestSuite(difflib)
-    run_unittest(TestSFpatches, TestSFbugs, TestOutputFormat, Doctests)
+    run_unittest(
+        TestWithAscii, TestAutojunk, TestSFpatches, TestSFbugs,
+        TestOutputFormat, Doctests)
  
  if __name__ == '__main__':
      test_main()
diff --git a/Misc/NEWS b/Misc/NEWS

index bd1e731533d4eae8f7134fda2cb20e3b6a832769..6e27e83162e7b1f3cd360b9fe1d31401319ce328 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -37,6 +37,10 @@ Core and Builtins
  Library
  -------
  
+- Issue #2986: difflib.SequenceMatcher gets a new parameter, autojunk, which
+  can be set to False to turn off the previously undocumented 'popularity'
+  heuristic. Patch by Terry Reedy and Eli Bendersky.
+
  - Issue #9846: zipfile is now correctly closing underlying file objects.
  
  - Issue #10459: Update CJK character names to Unicode 6.0.
author	Terry Reedy <tjreedy@udel.edu>
	Thu, 25 Nov 2010 06:12:34 +0000 (06:12 +0000)
committer	Terry Reedy <tjreedy@udel.edu>
	Thu, 25 Nov 2010 06:12:34 +0000 (06:12 +0000)
Doc/library/difflib.rst		patch \| blob \| history
Lib/difflib.py		patch \| blob \| history
Lib/test/test_difflib.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history