bpo-36546: Add more tests and expand docs (#13406)

author Raymond Hettinger <rhettinger@users.noreply.github.com>

Sat, 18 May 2019 17:18:29 +0000 (10:18 -0700)

committer GitHub <noreply@github.com>

Sat, 18 May 2019 17:18:29 +0000 (10:18 -0700)
author Raymond Hettinger <rhettinger@users.noreply.github.com>
Sat, 18 May 2019 17:18:29 +0000 (10:18 -0700)
committer GitHub <noreply@github.com>
Sat, 18 May 2019 17:18:29 +0000 (10:18 -0700)
diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst

index fb7df4e7188a077cd06f99da5ddf9a618e3fee2a..bc841fda72f887fb8cd40a7d9fe8d3bdac68f68f 100644 (file)
--- a/Doc/library/statistics.rst
+++ b/Doc/library/statistics.rst
@@ -511,22 +511,33 @@ However, for reading convenience, most of the examples show sorted sequences.
     is not at least 1.
  
     The *dist* can be any iterable containing sample data or it can be an
-   instance of a class that defines an :meth:`~inv_cdf` method.
+   instance of a class that defines an :meth:`~inv_cdf` method.  For meaningful
+   results, the number of data points in *dist* should be larger than *n*.
     Raises :exc:`StatisticsError` if there are not at least two data points.
  
     For sample data, the cut points are linearly interpolated from the
     two nearest data points.  For example, if a cut point falls one-third
     of the distance between two sample values, ``100`` and ``112``, the
-   cut-point will evaluate to ``104``.  Other selection methods may be
-   offered in the future (for example choose ``100`` as the nearest
-   value or compute ``106`` as the midpoint).  This might matter if
-   there are too few samples for a given number of cut points.
-
-   If *method* is set to *inclusive*, *dist* is treated as population data.
-   The minimum value is treated as the 0th percentile and the maximum
-   value is treated as the 100th percentile.  If *dist* is an instance of
-   a class that defines an :meth:`~inv_cdf` method, setting *method*
-   has no effect.
+   cut-point will evaluate to ``104``.
+
+   The *method* for computing quantiles can be varied depending on
+   whether the data in *dist* includes or excludes the lowest and
+   highest possible values from the population.
+
+   The default *method* is "exclusive" and is used for data sampled from
+   a population that can have more extreme values than found in the
+   samples.  The portion of the population falling below the *i-th* of
+   *m* data points is computed as ``i / (m + 1)``.
+
+   Setting the *method* to "inclusive" is used for describing population
+   data or for samples that include the extreme points.  The minimum
+   value in *dist* is treated as the 0th percentile and the maximum
+   value is treated as the 100th percentile.  The portion of the
+   population falling below the *i-th* of *m* data points is computed as
+   ``(i - 1) / (m - 1)``.
+
+   If *dist* is an instance of a class that defines an
+   :meth:`~inv_cdf` method, setting *method* has no effect.
  
     .. doctest::
  
diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py

index 1922de5df4b0c5ebb0e85f58022bf13d61232c7a..946c7428c61311b0847dfc923fb4b18b25c98486 100644 (file)
--- a/Lib/test/test_statistics.py
+++ b/Lib/test/test_statistics.py
@@ -2161,17 +2161,18 @@ class TestQuantiles(unittest.TestCase):
              # Quantiles should be idempotent
              if len(expected) >= 2:
                  self.assertEqual(quantiles(expected, n=n), expected)
-            # Cross-check against other methods
-            if len(data) >= n:
-                # After end caps are added, method='inclusive' should
-                # give the same result as method='exclusive' whenever
-                # there are more data points than desired cut points.
-                padded_data = [min(data) - 1000] + data + [max(data) + 1000]
-                self.assertEqual(
-                    quantiles(data, n=n),
-                    quantiles(padded_data, n=n, method='inclusive'),
-                    (n, data),
-                )
+            # Cross-check against method='inclusive' which should give
+            # the same result after adding in minimum and maximum values
+            # extrapolated from the two lowest and two highest points.
+            sdata = sorted(data)
+            lo = 2 * sdata[0] - sdata[1]
+            hi = 2 * sdata[-1] - sdata[-2]
+            padded_data = data + [lo, hi]
+            self.assertEqual(
+                quantiles(data, n=n),
+                quantiles(padded_data, n=n, method='inclusive'),
+                (n, data),
+            )
              # Invariant under translation and scaling
              def f(x):
                  return 3.5 * x - 1234.675
@@ -2188,6 +2189,11 @@ class TestQuantiles(unittest.TestCase):
              actual = quantiles(statistics.NormalDist(), n=n)
              self.assertTrue(all(math.isclose(e, a, abs_tol=0.0001)
                              for e, a in zip(expected, actual)))
+        # Q2 agrees with median()
+        for k in range(2, 60):
+            data = random.choices(range(100), k=k)
+            q1, q2, q3 = quantiles(data)
+            self.assertEqual(q2, statistics.median(data))
  
      def test_specific_cases_inclusive(self):
          # Match results computed by hand and cross-checked
@@ -2233,6 +2239,11 @@ class TestQuantiles(unittest.TestCase):
              actual = quantiles(statistics.NormalDist(), n=n, method="inclusive")
              self.assertTrue(all(math.isclose(e, a, abs_tol=0.0001)
                              for e, a in zip(expected, actual)))
+        # Natural deciles
+        self.assertEqual(quantiles([0, 100], n=10, method='inclusive'),
+                         [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0])
+        self.assertEqual(quantiles(range(0, 101), n=10, method='inclusive'),
+                         [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0])
          # Whenever n is smaller than the number of data points, running
          # method='inclusive' should give the same result as method='exclusive'
          # after the two included extreme points are removed.
@@ -2242,6 +2253,11 @@ class TestQuantiles(unittest.TestCase):
          data.remove(max(data))
          expected = quantiles(data, n=32)
          self.assertEqual(expected, actual)
+        # Q2 agrees with median()
+        for k in range(2, 60):
+            data = random.choices(range(100), k=k)
+            q1, q2, q3 = quantiles(data, method='inclusive')
+            self.assertEqual(q2, statistics.median(data))
  
      def test_equal_inputs(self):
          quantiles = statistics.quantiles
author	Raymond Hettinger <rhettinger@users.noreply.github.com>
	Sat, 18 May 2019 17:18:29 +0000 (10:18 -0700)
committer	GitHub <noreply@github.com>
	Sat, 18 May 2019 17:18:29 +0000 (10:18 -0700)
Doc/library/statistics.rst		patch \| blob \| history
Lib/test/test_statistics.py		patch \| blob \| history