From b8baf6360f7e93f5caa2a20292a868f890056215 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 14 Jan 2009 02:20:07 +0000 Subject: [PATCH] Issue #1696199: Add collections.Counter(). Forward port from Py2.7. --- Doc/library/collections.rst | 134 +++++++++++++++++++++++++++++ Lib/collections.py | 159 +++++++++++++++++++++++++++++++++++ Lib/test/test_collections.py | 97 ++++++++++++++++++++- Misc/NEWS | 3 + 4 files changed, 391 insertions(+), 2 deletions(-) diff --git a/Doc/library/collections.rst b/Doc/library/collections.rst index 4045d2e950..816d814d4a 100644 --- a/Doc/library/collections.rst +++ b/Doc/library/collections.rst @@ -140,6 +140,140 @@ Notes on using :class:`Set` and :class:`MutableSet` as a mixin: (For more about ABCs, see the :mod:`abc` module and :pep:`3119`.) +.. _counter-objects: + +:class:`Counter` objects +------------------------ + +A counter tool is provided to support convenient and rapid tallies. +For example:: + + # Tally repeated words in a list + >>> words = ['red', 'blue', 'red', 'green', 'blue', 'blue'] + >>> cnt = Counter() + >>> for word in words: + ... cnt[word] += 1 + >>> cnt + Counter({'blue': 3, 'red': 2, 'green': 1}) + + # Find the ten most common words in Hamlet + >>> import re + >>> words = re.findall('\w+', open('hamlet.txt').read().lower()) + >>> Counter(hamlet_words).most_common(10) + [('the', 1143), ('and', 966), ('to', 762), ('of', 669), ('i', 631), + ('you', 554), ('a', 546), ('my', 514), ('hamlet', 471), ('in', 451)] + +.. class:: Counter([iterable-or-mapping]) + + A :class:`Counter` is a :class:`dict` subclass for counting hashable items. + It is an unordered collection where elements are stored as dictionary keys + and their counts are stored as dictionary values. Counts are allowed to be + any integer value including zero or negative counts. The :class:`Counter` + class is similar to bags or multisets in other languages. + + Elements are counted from an *iterable* or initialized from another + *mapping* (or counter):: + + >>> c = Counter() # a new, empty counter + >>> c = Counter('gallahad') # a new counter from an iterable + >>> c = Counter({'red': 4, 'blue': 2}) # a new counter from a mapping + + The returned object has a dictionary style interface except that it returns + a zero count for missing items (instead of raising a :exc:`KeyError` like a + dictionary would):: + + >>> c = Counter(['egg', 'ham']) + >>> c['bacon'] # count of a missing element is zero + 0 + + Assigning a count of zero or reducing the count to zero leaves the + element in the dictionary. Use ``del`` to remove the entry entirely: + + >>> c = Counter(['arthur', 'gwain']) + >>> c['arthur'] = 0 # set the count of 'arthur' to zero + >>> 'arthur' in c # but 'arthur' is still in the counter + True + >>> del c['arthur'] # del will completely remove the entry + + .. versionadded:: 2.7 + + + Counter objects support two methods beyond those available for all + dictionaries: + + .. method:: elements() + + Return an iterator over elements repeating each as many times as its count. + Elements are returned in arbitrary order. If an element's count has been + set to zero or a negative number, :meth:`elements` will ignore it. + + >>> c = Counter({'a': 4, 'b': 2, 'd': 0, 'e': -2}) + >>> list(c.elements()) + ['a', 'a', 'a', 'a', 'b', 'b'] + + .. method:: most_common([n]) + + Return a list of the *n* most common elements and their counts from + the most common to the least. If *n* is not specified or is ``None``, + return a list of all element counts in decreasing order of frequency. + Elements with equal counts are ordered arbitrarily:: + + >>> Counter('abracadabra').most_common(3) + [('a', 5), ('r', 2), ('b', 2)] + + The usual dictionary methods are available for :class:`Counter` objects. + All of those work the same as they do for dictionaries except for two + which work differently for counters. + + .. method:: fromkeys(iterable) + + There is no equivalent class method for :class:`Counter` objects. + Raises a :exc:`NotImplementedError` when called. + + .. method:: update([iterable-or-mapping]) + + Like :meth:`dict.update` but adds-in counts instead of replacing them. + + Elements are counted from an *iterable* or added-in from another + *mapping* (or counter):: + + >>> c = Counter('which') + >>> c.update('witch') # add elements from another iterable + >>> d = Counter('watch') + >>> c.update(d) # add elements from another counter + >>> c['h'] # four 'h' in which, witch, and watch + 4 + +Common patterns for working with :class:`Counter` objects:: + + sum(c.values()) # total of all counts + c.clear() # reset all counts + list(c) # list unique elements + set(c) # convert to a set + dict(c) # convert to a regular dictionary + c.items() # convert to a list of (elem, cnt) pairs + Counter(dict(list_of_pairs)) # convert from a list of (elem, cnt) pairs + c.most_common()[:-n:-1] # n least common elements + +**References**: + +* Wikipedia entry for `Multisets `_ + +* `Bag class `_ + in Smalltalk +* `C++ multisets `_ + tutorial with standalone examples + +* An early Python `Bag `_ recipe + for Python 2.4 and a `Counter `_ + comformant recipe for Python 2.5 and later + +* Use cases for multisets and mathematical operations on multisets. + Knuth, Donald. The Art of Computer Programming Volume II, + Section 4.6.3, Exercise 19 + + + .. _deque-objects: :class:`deque` objects diff --git a/Lib/collections.py b/Lib/collections.py index c3faa9a44d..6c1abce981 100644 --- a/Lib/collections.py +++ b/Lib/collections.py @@ -10,6 +10,9 @@ from _collections import deque, defaultdict from operator import itemgetter as _itemgetter from keyword import iskeyword as _iskeyword import sys as _sys +import heapq as _heapq +from itertools import repeat as _repeat, chain as _chain, starmap as _starmap + ################################################################################ ### namedtuple @@ -113,6 +116,162 @@ def namedtuple(typename, field_names, verbose=False): return result +######################################################################## +### Counter +######################################################################## + +class Counter(dict): + '''Dict subclass for counting hashable items. Sometimes called a bag + or multiset. Elements are stored as dictionary keys and their counts + are stored as dictionary values. + + >>> c = Counter('abracadabra') # count elements from a string + + >>> c.most_common(3) # three most common elements + [('a', 5), ('r', 2), ('b', 2)] + >>> sorted(c) # list all unique elements + ['a', 'b', 'c', 'd', 'r'] + >>> ''.join(sorted(c.elements())) # list elements with repetitions + 'aaaaabbcdrr' + >>> sum(c.values()) # total of all counts + 11 + + >>> c['a'] # count of letter 'a' + 5 + >>> for elem in 'shazam': # update counts from an iterable + ... c[elem] += 1 # by adding 1 to each element's count + >>> c['a'] # now there are seven 'a' + 7 + >>> del c['r'] # remove all 'r' + >>> c['r'] # now there are zero 'r' + 0 + + >>> d = Counter('simsalabim') # make another counter + >>> c.update(d) # add in the second counter + >>> c['a'] # now there are nine 'a' + 9 + + >>> c.clear() # empty the counter + >>> c + Counter() + + Note: If a count is set to zero or reduced to zero, it will remain + in the counter until the entry is deleted or the counter is cleared: + + >>> c = Counter('aaabbc') + >>> c['b'] -= 2 # reduce the count of 'b' by two + >>> c.most_common() # 'b' is still in, but its count is zero + [('a', 3), ('c', 1), ('b', 0)] + + ''' + # References: + # http://en.wikipedia.org/wiki/Multiset + # http://www.gnu.org/software/smalltalk/manual-base/html_node/Bag.html + # http://www.demo2s.com/Tutorial/Cpp/0380__set-multiset/Catalog0380__set-multiset.htm + # http://code.activestate.com/recipes/259174/ + # Knuth, TAOCP Vol. II section 4.6.3 + + def __init__(self, iterable=None): + '''Create a new, empty Counter object. And if given, count elements + from an input iterable. Or, initialize the count from another mapping + of elements to their counts. + + >>> c = Counter() # a new, empty counter + >>> c = Counter('gallahad') # a new counter from an iterable + >>> c = Counter({'a': 4, 'b': 2}) # a new counter from a mapping + + ''' + self.update(iterable) + + def __missing__(self, key): + 'The count of elements not in the Counter is zero.' + # Needed so that self[missing_item] does not raise KeyError + return 0 + + def most_common(self, n=None): + '''List the n most common elements and their counts from the most + common to the least. If n is None, then list all element counts. + + >>> Counter('abracadabra').most_common(3) + [('a', 5), ('r', 2), ('b', 2)] + + ''' + # Emulate Bag.sortedByCount from Smalltalk + if n is None: + return sorted(self.items(), key=_itemgetter(1), reverse=True) + return _heapq.nlargest(n, self.items(), key=_itemgetter(1)) + + def elements(self): + '''Iterator over elements repeating each as many times as its count. + + >>> c = Counter('ABCABC') + >>> sorted(c.elements()) + ['A', 'A', 'B', 'B', 'C', 'C'] + + # Knuth's example for prime factors of 1836: 2**2 * 3**3 * 17**1 + >>> prime_factors = Counter({2: 2, 3: 3, 17: 1}) + >>> product = 1 + >>> for factor in prime_factors.elements(): # loop over factors + ... product *= factor # and multiply them + >>> product + 1836 + + Note, if an element's count has been set to zero or is a negative + number, elements() will ignore it. + + ''' + # Emulate Bag.do from Smalltalk and Multiset.begin from C++. + return _chain.from_iterable(_starmap(_repeat, self.items())) + + # Override dict methods where necessary + + @classmethod + def fromkeys(cls, iterable, v=None): + # There is no equivalent method for counters because setting v=1 + # means that no element can have a count greater than one. + raise NotImplementedError( + 'Counter.fromkeys() is undefined. Use Counter(iterable) instead.') + + def update(self, iterable=None): + '''Like dict.update() but add counts instead of replacing them. + + Source can be an iterable, a dictionary, or another Counter instance. + + >>> c = Counter('which') + >>> c.update('witch') # add elements from another iterable + >>> d = Counter('watch') + >>> c.update(d) # add elements from another counter + >>> c['h'] # four 'h' in which, witch, and watch + 4 + + ''' + # The regular dict.update() operation makes no sense here because the + # replace behavior results in the some of original untouched counts + # being mixed-in with all of the other counts for a mismash that + # doesn't have a straight-forward interpretation in most counting + # contexts. Instead, we look to Knuth for suggested operations on + # multisets and implement the union-add operation discussed in + # TAOCP Volume II section 4.6.3 exercise 19. The Wikipedia entry for + # multisets calls that operation a sum or join. + + if iterable is not None: + if isinstance(iterable, Mapping): + for elem, count in iterable.items(): + self[elem] += count + else: + for elem in iterable: + self[elem] += 1 + + def copy(self): + 'Like dict.copy() but returns a Counter instance instead of a dict.' + return Counter(self) + + def __repr__(self): + if not self: + return '%s()' % self.__class__.__name__ + items = ', '.join(map('%r: %r'.__mod__, self.most_common())) + return '%s({%s})' % (self.__class__.__name__, items) + ################################################################################ ### UserDict diff --git a/Lib/test/test_collections.py b/Lib/test/test_collections.py index 60072cc770..153059adb0 100644 --- a/Lib/test/test_collections.py +++ b/Lib/test/test_collections.py @@ -2,7 +2,7 @@ import unittest, doctest from test import support -from collections import namedtuple +from collections import namedtuple, Counter, Mapping import pickle, copy from collections import Hashable, Iterable, Iterator from collections import Sized, Container, Callable @@ -357,11 +357,104 @@ class TestCollectionABCs(unittest.TestCase): self.failUnless(issubclass(sample, MutableSequence)) self.failIf(issubclass(str, MutableSequence)) +class TestCounter(unittest.TestCase): + + def test_basics(self): + c = Counter('abcaba') + self.assert_(isinstance(c, dict)) + self.assert_(isinstance(c, Mapping)) + self.assert_(issubclass(Counter, dict)) + self.assert_(issubclass(Counter, Mapping)) + self.assertEqual(len(c), 3) + self.assertEqual(sum(c.values()), 6) + self.assertEqual(sorted(c.values()), [1, 2, 3]) + self.assertEqual(sorted(c.keys()), ['a', 'b', 'c']) + self.assertEqual(sorted(c), ['a', 'b', 'c']) + self.assertEqual(sorted(c.items()), + [('a', 3), ('b', 2), ('c', 1)]) + self.assertEqual(c['b'], 2) + self.assertEqual(c['z'], 0) + self.assertEqual(c.__contains__('c'), True) + self.assertEqual(c.__contains__('z'), False) + self.assertEqual(c.get('b', 10), 2) + self.assertEqual(c.get('z', 10), 10) + self.assertEqual(c, dict(a=3, b=2, c=1)) + self.assertEqual(repr(c), "Counter({'a': 3, 'b': 2, 'c': 1})") + self.assertEqual(c.most_common(), [('a', 3), ('b', 2), ('c', 1)]) + for i in range(5): + self.assertEqual(c.most_common(i), + [('a', 3), ('b', 2), ('c', 1)][:i]) + self.assertEqual(''.join(sorted(c.elements())), 'aaabbc') + c['a'] += 1 # increment an existing value + c['b'] -= 2 # sub existing value to zero + del c['c'] # remove an entry + c['d'] -= 2 # sub from a missing value + c['e'] = -5 # directly assign a missing value + c['f'] += 4 # add to a missing value + self.assertEqual(c, dict(a=4, b=0, d=-2, e=-5, f=4)) + self.assertEqual(''.join(sorted(c.elements())), 'aaaaffff') + self.assertEqual(c.pop('f'), 4) + self.assertEqual('f' in c, False) + for i in range(3): + elem, cnt = c.popitem() + self.assertEqual(elem in c, False) + c.clear() + self.assertEqual(c, {}) + self.assertEqual(repr(c), 'Counter()') + self.assertRaises(NotImplementedError, Counter.fromkeys, 'abc') + self.assertRaises(TypeError, hash, c) + c.update(dict(a=5, b=3, c=1)) + c.update(Counter('a' * 50 + 'b' * 30)) + c.update() # test case with no args + c.__init__('a' * 500 + 'b' * 300) + c.__init__('cdc') + c.__init__() + self.assertEqual(c, dict(a=555, b=333, c=3, d=1)) + self.assertEqual(c.setdefault('d', 5), 1) + self.assertEqual(c['d'], 1) + self.assertEqual(c.setdefault('e', 5), 5) + self.assertEqual(c['e'], 5) + + def test_copying(self): + # Check that counters are copyable, deepcopyable, picklable, and + #have a repr/eval round-trip + words = Counter('which witch had which witches wrist watch'.split()) + update_test = Counter() + update_test.update(words) + for i, dup in enumerate([ + words.copy(), + copy.copy(words), + copy.deepcopy(words), + pickle.loads(pickle.dumps(words, 0)), + pickle.loads(pickle.dumps(words, 1)), + pickle.loads(pickle.dumps(words, 2)), + pickle.loads(pickle.dumps(words, -1)), + eval(repr(words)), + update_test, + Counter(words), + ]): + msg = (i, dup, words) + self.assert_(dup is not words) + self.assertEquals(dup, words) + self.assertEquals(len(dup), len(words)) + self.assertEquals(type(dup), type(words)) + + def test_conversions(self): + # Convert to: set, list, dict + s = 'she sells sea shells by the sea shore' + self.assertEqual(sorted(Counter(s).elements()), sorted(s)) + self.assertEqual(sorted(Counter(s)), sorted(set(s))) + self.assertEqual(dict(Counter(s)), dict(Counter(s).items())) + self.assertEqual(set(Counter(s)), set(s)) + + + import doctest, collections def test_main(verbose=None): NamedTupleDocs = doctest.DocTestSuite(module=collections) - test_classes = [TestNamedTuple, NamedTupleDocs, TestOneTrickPonyABCs, TestCollectionABCs] + test_classes = [TestNamedTuple, NamedTupleDocs, TestOneTrickPonyABCs, + TestCollectionABCs, TestCounter] support.run_unittest(*test_classes) support.run_doctest(collections, verbose) diff --git a/Misc/NEWS b/Misc/NEWS index ac308973a1..e5369a3e40 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -130,6 +130,9 @@ Library appropriately when it is being used via socket.makefile() objects rather than delaying the close by waiting for garbage collection to do it. +- Issue #1696199: Add collections.Counter() for rapid and convenient + counting. + - Issue #3860: GzipFile and BZ2File now support the context manager protocol. - Issue #4867: Fixed a crash in ctypes when passing a string to a -- 2.40.0