SF patch 629637: Add sample(population, k) method to the random module.

author Raymond Hettinger <python@rcn.com>

Tue, 12 Nov 2002 17:41:57 +0000 (17:41 +0000)

committer Raymond Hettinger <python@rcn.com>

Tue, 12 Nov 2002 17:41:57 +0000 (17:41 +0000)
author Raymond Hettinger <python@rcn.com>
Tue, 12 Nov 2002 17:41:57 +0000 (17:41 +0000)
committer Raymond Hettinger <python@rcn.com>
Tue, 12 Nov 2002 17:41:57 +0000 (17:41 +0000)
diff --git a/Doc/lib/librandom.tex b/Doc/lib/librandom.tex

index 846b931008c05706192cdfb28a6ee14a984a1472..15e477b373ea061b04458fb1b47cb09d7a7ef161 100644 (file)
--- a/Doc/lib/librandom.tex
+++ b/Doc/lib/librandom.tex
@@ -179,6 +179,25 @@ Functions for sequences:
    long sequence can never be generated.
  \end{funcdesc}
  
+\begin{funcdesc}{sample}{population, k}
+  Return a \var{k} length list of unique elements chosen from the
+  population sequence.  Used for random sampling without replacement.
+
+  Returns a new list containing elements from the population.  The
+  list itself is in random order so that all sub-slices are also
+  random samples.  The original sequence is left undisturbed.
+
+  If the population has repeated elements, then each occurence is a
+  possible selection in the sample.
+
+  If indices are needed for a large population, use \function{xrange}
+  as an argument:  \code{sample(xrange(10000000), 60)}.
+
+  Optional argument random is a 0-argument function returning a random
+  float in [0.0, 1.0); by default, the standard random.random.                            
+  \versionadded{2.3}
+\end{funcdesc}
+
  
  The following functions generate specific real-valued distributions.
  Function parameters are named after the corresponding variables in the
diff --git a/Lib/random.py b/Lib/random.py

index 4d290804caf614dcec35ecca0d211b877c12369b..e2c675c553643a603de9d0a9f3f05a06237e0415 100644 (file)
--- a/Lib/random.py
+++ b/Lib/random.py
@@ -7,6 +7,7 @@
      sequences
      ---------
             pick random element
+           pick random sample
             generate random permutation
  
      distributions on the real line:
@@ -77,7 +78,7 @@ from math import log as _log, exp as _exp, pi as _pi, e as _e
  from math import sqrt as _sqrt, acos as _acos, cos as _cos, sin as _sin
  from math import floor as _floor
  
-__all__ = ["Random","seed","random","uniform","randint","choice",
+__all__ = ["Random","seed","random","uniform","randint","choice","sample",
             "randrange","shuffle","normalvariate","lognormvariate",
             "cunifvariate","expovariate","vonmisesvariate","gammavariate",
             "stdgamma","gauss","betavariate","paretovariate","weibullvariate",
@@ -373,6 +374,43 @@ class Random:
              j = int(random() * (i+1))
              x[i], x[j] = x[j], x[i]
  
+    def sample(self, population, k, random=None, int=int):
+        """Chooses k unique random elements from a population sequence.
+
+        Returns a new list containing elements from the population.  The
+        list itself is in random order so that all sub-slices are also
+        random samples.  The original sequence is left undisturbed.
+
+        If the population has repeated elements, then each occurence is
+        a possible selection in the sample.
+
+        If indices are needed for a large population, use xrange as an
+        argument:  sample(xrange(10000000), 60)
+
+        Optional arg random is a 0-argument function returning a random
+        float in [0.0, 1.0); by default, the standard random.random.
+        """
+
+        n = len(population)
+        if not 0 <= k <= n:
+            raise ValueError, "sample larger than population"
+        if random is None:
+            random = self.random
+        if n < 6 * k:     # if n len list takes less space than a k len dict
+            pool = list(population)
+            for i in xrange(n-1, n-k-1, -1):
+                j = int(random() * (i+1))
+                pool[i], pool[j] = pool[j], pool[i]
+            return pool[-k:]
+        inorder = [None] * k
+        selections = {}
+        for i in xrange(k):
+            j = int(random() * n)
+            while j in selections:
+                j = int(random() * n)
+            selections[j] = inorder[i] = population[j]
+        return inorder     # return selections in the order they were picked
+
  ## -------------------- real-valued distributions  -------------------
  
  ## -------------------- uniform distribution -------------------
@@ -711,7 +749,19 @@ def _test_generator(n, funccall):
      print 'avg %g, stddev %g, min %g, max %g' % \
                (avg, stddev, smallest, largest)
  
-def _test(N=20000):
+def _test_sample(n):
+    # For the entire allowable range of 0 <= k <= n, validate that
+    # the sample is of the correct length and contains only unique items
+    population = xrange(n)
+    for k in xrange(n+1):
+        s = sample(population, k)
+        assert len(dict([(elem,True) for elem in s])) == len(s) == k
+
+def _sample_generator(n, k):
+    # Return a fixed element from the sample.  Validates random ordering.
+    return sample(xrange(n), k)[k//2]
+
+def _test(N=2000):
      print 'TWOPI         =', TWOPI
      print 'LOG4          =', LOG4
      print 'NV_MAGICCONST =', NV_MAGICCONST
@@ -735,6 +785,9 @@ def _test(N=20000):
      _test_generator(N, 'betavariate(3.0, 3.0)')
      _test_generator(N, 'paretovariate(1.0)')
      _test_generator(N, 'weibullvariate(1.0, 1.0)')
+    _test_generator(N, '_sample_generator(50, 5)')  # expected s.d.: 14.4
+    _test_generator(N, '_sample_generator(50, 45)') # expected s.d.: 14.4
+    _test_sample(1000)
  
      # Test jumpahead.
      s = getstate()
@@ -760,6 +813,7 @@ uniform = _inst.uniform
  randint = _inst.randint
  choice = _inst.choice
  randrange = _inst.randrange
+sample = _inst.sample
  shuffle = _inst.shuffle
  normalvariate = _inst.normalvariate
  lognormvariate = _inst.lognormvariate
diff --git a/Misc/NEWS b/Misc/NEWS

index 89f1e2ff3fc028bbe54d714f8489ce4953f9ff07..addd5ae55f3ece08a63393348a17fc8dd66ca581 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -427,6 +427,9 @@ Library
  
  - Added operator.pow(a,b) which is equivalent to a**b.
  
+- Added random.sample(population,k) for random sampling without replacement.
+  Returns a k length list of unique elements chosen from the population.                                                                   
+
  - random.randrange(-sys.maxint-1, sys.maxint) no longer raises
    OverflowError.  That is, it now accepts any combination of 'start'
    and 'stop' arguments so long as each is in the range of Python's
author	Raymond Hettinger <python@rcn.com>
	Tue, 12 Nov 2002 17:41:57 +0000 (17:41 +0000)
committer	Raymond Hettinger <python@rcn.com>
	Tue, 12 Nov 2002 17:41:57 +0000 (17:41 +0000)
Doc/lib/librandom.tex		patch \| blob \| history
Lib/random.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history