From 9013ccf6d8037f6ae78145a42d194141cb10d332 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 23 Apr 2019 00:06:35 -0700 Subject: [PATCH] bpo-36546: Add statistics.quantiles() (#12710) --- Doc/library/statistics.rst | 54 ++++++- Doc/whatsnew/3.8.rst | 4 + Lib/statistics.py | 57 ++++++- Lib/test/test_statistics.py | 142 ++++++++++++++++++ .../2019-04-06-14-23-00.bpo-36546.YXjbyY.rst | 1 + 5 files changed, 251 insertions(+), 7 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2019-04-06-14-23-00.bpo-36546.YXjbyY.rst diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 8bb2bdf7b6..b62bcfdffd 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -48,6 +48,7 @@ or sample. :func:`median_grouped` Median, or 50th percentile, of grouped data. :func:`mode` Single mode (most common value) of discrete or nominal data. :func:`multimode` List of modes (most common values) of discrete or nomimal data. +:func:`quantiles` Divide data into intervals with equal probability. ======================= =============================================================== Measures of spread @@ -499,6 +500,53 @@ However, for reading convenience, most of the examples show sorted sequences. :func:`pvariance` function as the *mu* parameter to get the variance of a sample. +.. function:: quantiles(dist, *, n=4, method='exclusive') + + Divide *dist* into *n* continuous intervals with equal probability. + Returns a list of ``n - 1`` cut points separating the intervals. + + Set *n* to 4 for quartiles (the default). Set *n* to 10 for deciles. Set + *n* to 100 for percentiles which gives the 99 cuts points that separate + *dist* in to 100 equal sized groups. Raises :exc:`StatisticsError` if *n* + is not least 1. + + The *dist* can be any iterable containing sample data or it can be an + instance of a class that defines an :meth:`~inv_cdf` method. + Raises :exc:`StatisticsError` if there are not at least two data points. + + For sample data, the cut points are linearly interpolated from the + two nearest data points. For example, if a cut point falls one-third + of the distance between two sample values, ``100`` and ``112``, the + cut-point will evaluate to ``104``. Other selection methods may be + offered in the future (for example choose ``100`` as the nearest + value or compute ``106`` as the midpoint). This might matter if + there are too few samples for a given number of cut points. + + If *method* is set to *inclusive*, *dist* is treated as population data. + The minimum value is treated as the 0th percentile and the maximum + value is treated as the 100th percentile. If *dist* is an instance of + a class that defines an :meth:`~inv_cdf` method, setting *method* + has no effect. + + .. doctest:: + + # Decile cut points for empirically sampled data + >>> data = [105, 129, 87, 86, 111, 111, 89, 81, 108, 92, 110, + ... 100, 75, 105, 103, 109, 76, 119, 99, 91, 103, 129, + ... 106, 101, 84, 111, 74, 87, 86, 103, 103, 106, 86, + ... 111, 75, 87, 102, 121, 111, 88, 89, 101, 106, 95, + ... 103, 107, 101, 81, 109, 104] + >>> [round(q, 1) for q in quantiles(data, n=10)] + [81.0, 86.2, 89.0, 99.4, 102.5, 103.6, 106.0, 109.8, 111.0] + + >>> # Quartile cut points for the standard normal distibution + >>> Z = NormalDist() + >>> [round(q, 4) for q in quantiles(Z, n=4)] + [-0.6745, 0.0, 0.6745] + + .. versionadded:: 3.8 + + Exceptions ---------- @@ -606,7 +654,7 @@ of applications in statistics. `_ between two normal distributions, giving a measure of agreement. Returns a value between 0.0 and 1.0 giving `the overlapping area for - two probability density functions + the two probability density functions `_. Instances of :class:`NormalDist` support addition, subtraction, @@ -649,8 +697,8 @@ of applications in statistics. For example, given `historical data for SAT exams `_ showing that scores are normally distributed with a mean of 1060 and a standard deviation of 192, -determine the percentage of students with scores between 1100 and 1200, after -rounding to the nearest whole number: +determine the percentage of students with test scores between 1100 and +1200, after rounding to the nearest whole number: .. doctest:: diff --git a/Doc/whatsnew/3.8.rst b/Doc/whatsnew/3.8.rst index f866f9ccb8..bd7ad3f87c 100644 --- a/Doc/whatsnew/3.8.rst +++ b/Doc/whatsnew/3.8.rst @@ -337,6 +337,10 @@ Added :func:`statistics.geometric_mean()` Added :func:`statistics.multimode` that returns a list of the most common values. (Contributed by Raymond Hettinger in :issue:`35892`.) +Added :func:`statistics.quantiles` that divides data or a distribution +in to equiprobable intervals (e.g. quartiles, deciles, or percentiles). +(Contributed by Raymond Hettinger in :issue:`36546`.) + Added :class:`statistics.NormalDist`, a tool for creating and manipulating normal distributions of a random variable. (Contributed by Raymond Hettinger in :issue:`36018`.) diff --git a/Lib/statistics.py b/Lib/statistics.py index 262ad976b6..05edfdf98e 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -7,9 +7,9 @@ averages, variance, and standard deviation. Calculating averages -------------------- -================== ============================================= +================== ================================================== Function Description -================== ============================================= +================== ================================================== mean Arithmetic mean (average) of data. geometric_mean Geometric mean of data. harmonic_mean Harmonic mean of data. @@ -19,7 +19,8 @@ median_high High median of data. median_grouped Median, or 50th percentile, of grouped data. mode Mode (most common value) of data. multimode List of modes (most common values of data). -================== ============================================= +quantiles Divide data into intervals with equal probability. +================== ================================================== Calculate the arithmetic mean ("the average") of data: @@ -78,7 +79,7 @@ A single exception is defined: StatisticsError is a subclass of ValueError. """ -__all__ = [ 'StatisticsError', 'NormalDist', +__all__ = [ 'StatisticsError', 'NormalDist', 'quantiles', 'pstdev', 'pvariance', 'stdev', 'variance', 'median', 'median_low', 'median_high', 'median_grouped', 'mean', 'mode', 'multimode', 'harmonic_mean', 'fmean', @@ -562,6 +563,54 @@ def multimode(data): maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, [])) return list(map(itemgetter(0), mode_items)) +def quantiles(dist, *, n=4, method='exclusive'): + '''Divide *dist* into *n* continuous intervals with equal probability. + + Returns a list of (n - 1) cut points separating the intervals. + + Set *n* to 4 for quartiles (the default). Set *n* to 10 for deciles. + Set *n* to 100 for percentiles which gives the 99 cuts points that + separate *dist* in to 100 equal sized groups. + + The *dist* can be any iterable containing sample data or it can be + an instance of a class that defines an inv_cdf() method. For sample + data, the cut points are linearly interpolated between data points. + + If *method* is set to *inclusive*, *dist* is treated as population + data. The minimum value is treated as the 0th percentile and the + maximum value is treated as the 100th percentile. + ''' + # Possible future API extensions: + # quantiles(data, already_sorted=True) + # quantiles(data, cut_points=[0.02, 0.25, 0.50, 0.75, 0.98]) + if n < 1: + raise StatisticsError('n must be at least 1') + if hasattr(dist, 'inv_cdf'): + return [dist.inv_cdf(i / n) for i in range(1, n)] + data = sorted(dist) + ld = len(data) + if ld < 2: + raise StatisticsError('must have at least two data points') + if method == 'inclusive': + m = ld - 1 + result = [] + for i in range(1, n): + j = i * m // n + delta = i*m - j*n + interpolated = (data[j] * (n - delta) + data[j+1] * delta) / n + result.append(interpolated) + return result + if method == 'exclusive': + m = ld + 1 + result = [] + for i in range(1, n): + j = i * m // n # rescale i to m/n + j = 1 if j < 1 else ld-1 if j > ld-1 else j # clamp to 1 .. ld-1 + delta = i*m - j*n # exact integer math + interpolated = (data[j-1] * (n - delta) + data[j] * delta) / n + result.append(interpolated) + return result + raise ValueError(f'Unknown method: {method!r}') # === Measures of spread === diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 4d397eb126..c988d7fd8b 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -3,6 +3,7 @@ approx_equal function. """ +import bisect import collections import collections.abc import copy @@ -2038,6 +2039,7 @@ class TestStdev(VarianceStdevMixin, NumericTestCase): expected = math.sqrt(statistics.variance(data)) self.assertEqual(self.func(data), expected) + class TestGeometricMean(unittest.TestCase): def test_basics(self): @@ -2126,6 +2128,146 @@ class TestGeometricMean(unittest.TestCase): with self.assertRaises(ValueError): geometric_mean([Inf, -Inf]) + +class TestQuantiles(unittest.TestCase): + + def test_specific_cases(self): + # Match results computed by hand and cross-checked + # against the PERCENTILE.EXC function in MS Excel. + quantiles = statistics.quantiles + data = [120, 200, 250, 320, 350] + random.shuffle(data) + for n, expected in [ + (1, []), + (2, [250.0]), + (3, [200.0, 320.0]), + (4, [160.0, 250.0, 335.0]), + (5, [136.0, 220.0, 292.0, 344.0]), + (6, [120.0, 200.0, 250.0, 320.0, 350.0]), + (8, [100.0, 160.0, 212.5, 250.0, 302.5, 335.0, 357.5]), + (10, [88.0, 136.0, 184.0, 220.0, 250.0, 292.0, 326.0, 344.0, 362.0]), + (12, [80.0, 120.0, 160.0, 200.0, 225.0, 250.0, 285.0, 320.0, 335.0, + 350.0, 365.0]), + (15, [72.0, 104.0, 136.0, 168.0, 200.0, 220.0, 240.0, 264.0, 292.0, + 320.0, 332.0, 344.0, 356.0, 368.0]), + ]: + self.assertEqual(expected, quantiles(data, n=n)) + self.assertEqual(len(quantiles(data, n=n)), n - 1) + self.assertEqual(list(map(float, expected)), + quantiles(map(Decimal, data), n=n)) + self.assertEqual(list(map(Decimal, expected)), + quantiles(map(Decimal, data), n=n)) + self.assertEqual(list(map(Fraction, expected)), + quantiles(map(Fraction, data), n=n)) + # Invariant under tranlation and scaling + def f(x): + return 3.5 * x - 1234.675 + exp = list(map(f, expected)) + act = quantiles(map(f, data), n=n) + self.assertTrue(all(math.isclose(e, a) for e, a in zip(exp, act))) + # Quartiles of a standard normal distribution + for n, expected in [ + (1, []), + (2, [0.0]), + (3, [-0.4307, 0.4307]), + (4 ,[-0.6745, 0.0, 0.6745]), + ]: + actual = quantiles(statistics.NormalDist(), n=n) + self.assertTrue(all(math.isclose(e, a, abs_tol=0.0001) + for e, a in zip(expected, actual))) + + def test_specific_cases_inclusive(self): + # Match results computed by hand and cross-checked + # against the PERCENTILE.INC function in MS Excel + # and against the quaatile() function in SciPy. + quantiles = statistics.quantiles + data = [100, 200, 400, 800] + random.shuffle(data) + for n, expected in [ + (1, []), + (2, [300.0]), + (3, [200.0, 400.0]), + (4, [175.0, 300.0, 500.0]), + (5, [160.0, 240.0, 360.0, 560.0]), + (6, [150.0, 200.0, 300.0, 400.0, 600.0]), + (8, [137.5, 175, 225.0, 300.0, 375.0, 500.0,650.0]), + (10, [130.0, 160.0, 190.0, 240.0, 300.0, 360.0, 440.0, 560.0, 680.0]), + (12, [125.0, 150.0, 175.0, 200.0, 250.0, 300.0, 350.0, 400.0, + 500.0, 600.0, 700.0]), + (15, [120.0, 140.0, 160.0, 180.0, 200.0, 240.0, 280.0, 320.0, 360.0, + 400.0, 480.0, 560.0, 640.0, 720.0]), + ]: + self.assertEqual(expected, quantiles(data, n=n, method="inclusive")) + self.assertEqual(len(quantiles(data, n=n, method="inclusive")), n - 1) + self.assertEqual(list(map(float, expected)), + quantiles(map(Decimal, data), n=n, method="inclusive")) + self.assertEqual(list(map(Decimal, expected)), + quantiles(map(Decimal, data), n=n, method="inclusive")) + self.assertEqual(list(map(Fraction, expected)), + quantiles(map(Fraction, data), n=n, method="inclusive")) + # Invariant under tranlation and scaling + def f(x): + return 3.5 * x - 1234.675 + exp = list(map(f, expected)) + act = quantiles(map(f, data), n=n, method="inclusive") + self.assertTrue(all(math.isclose(e, a) for e, a in zip(exp, act))) + # Quartiles of a standard normal distribution + for n, expected in [ + (1, []), + (2, [0.0]), + (3, [-0.4307, 0.4307]), + (4 ,[-0.6745, 0.0, 0.6745]), + ]: + actual = quantiles(statistics.NormalDist(), n=n, method="inclusive") + self.assertTrue(all(math.isclose(e, a, abs_tol=0.0001) + for e, a in zip(expected, actual))) + + def test_equal_sized_groups(self): + quantiles = statistics.quantiles + total = 10_000 + data = [random.expovariate(0.2) for i in range(total)] + while len(set(data)) != total: + data.append(random.expovariate(0.2)) + data.sort() + + # Cases where the group size exactly divides the total + for n in (1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000): + group_size = total // n + self.assertEqual( + [bisect.bisect(data, q) for q in quantiles(data, n=n)], + list(range(group_size, total, group_size))) + + # When the group sizes can't be exactly equal, they should + # differ by no more than one + for n in (13, 19, 59, 109, 211, 571, 1019, 1907, 5261, 9769): + group_sizes = {total // n, total // n + 1} + pos = [bisect.bisect(data, q) for q in quantiles(data, n=n)] + sizes = {q - p for p, q in zip(pos, pos[1:])} + self.assertTrue(sizes <= group_sizes) + + def test_error_cases(self): + quantiles = statistics.quantiles + StatisticsError = statistics.StatisticsError + with self.assertRaises(TypeError): + quantiles() # Missing arguments + with self.assertRaises(TypeError): + quantiles([10, 20, 30], 13, n=4) # Too many arguments + with self.assertRaises(TypeError): + quantiles([10, 20, 30], 4) # n is a positional argument + with self.assertRaises(StatisticsError): + quantiles([10, 20, 30], n=0) # n is zero + with self.assertRaises(StatisticsError): + quantiles([10, 20, 30], n=-1) # n is negative + with self.assertRaises(TypeError): + quantiles([10, 20, 30], n=1.5) # n is not an integer + with self.assertRaises(ValueError): + quantiles([10, 20, 30], method='X') # method is unknown + with self.assertRaises(StatisticsError): + quantiles([10], n=4) # not enough data points + with self.assertRaises(TypeError): + quantiles([10, None, 30], n=4) # data is non-numeric + + class TestNormalDist(unittest.TestCase): # General note on precision: The pdf(), cdf(), and overlap() methods diff --git a/Misc/NEWS.d/next/Library/2019-04-06-14-23-00.bpo-36546.YXjbyY.rst b/Misc/NEWS.d/next/Library/2019-04-06-14-23-00.bpo-36546.YXjbyY.rst new file mode 100644 index 0000000000..c69aadf3b6 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2019-04-06-14-23-00.bpo-36546.YXjbyY.rst @@ -0,0 +1 @@ +Add statistics.quantiles() -- 2.40.0