#!/usr/bin/env pypy3 # TASK: Look for gross error in given data use q-dixon and grubbs tests, compare results import math # [95% confidence] Critical Q values for Dixon's Q test Q95 = { 3: 0.970, 4: 0.829, 5: 0.710, 6: 0.625, 7: 0.568, 8: 0.526, 9: 0.493, 10: 0.466, 11: 0.444, 12: 0.426, 13: 0.410, 14: 0.396, 15: 0.384, 16: 0.374, 17: 0.365, 18: 0.356, 19: 0.349, 20: 0.342, 21: 0.337, 22: 0.331, 23: 0.326, 24: 0.321, 25: 0.317, 26: 0.312, 27: 0.308, 28: 0.305, 29: 0.301, 30: 0.290 } # [95% confidence] t-distribution critical values GRUBBS_CRITICAL = { 3: 1.155, 4: 1.481, 5: 1.715, 6: 1.887, 7: 2.020, 8: 2.126, 9: 2.215, 10: 2.290, 11: 2.355, 12: 2.412, 13: 2.462, 14: 2.507, 15: 2.549, 16: 2.585, 17: 2.620, 18: 2.651, 19: 2.681, 20: 2.709, 21: 2.733, 22: 2.758, 23: 2.781, 24: 2.802, 25: 2.822, 26: 2.841, 27: 2.859, 28: 2.876, 29: 2.893, 30: 2.908, 31: 2.924, 32: 2.938, 33: 2.952, 34: 2.965, 35: 2.979, 36: 2.991, 37: 3.003, 38: 3.014, 39: 3.025, 40: 3.036 } def dixon_test(data, left=True, right=True, q_dict=Q95): """ Keyword arguments: data = A ordered or unordered list of data points (int or float). left = Q-test of minimum value in the ordered list if True. right = Q-test of maximum value in the ordered list if True. q_dict = A dictionary of Q-values for a given confidence level, where the dict. keys are sample sizes N, and the associated values are the corresponding critical Q values. E.g., {3: 0.97, 4: 0.829, 5: 0.71, 6: 0.625, ...} Returns a list of 2 values for the outliers, or None. E.g., for [1,1,1] -> [None, None] for [5,1,1] -> [None, 5] for [5,1,5] -> [1, None] """ assert(left or right), 'At least one of the variables, `left` or `right`, must be True.' assert(len(data) >= 3), 'At least 3 data points are required' assert(len(data) <= max(q_dict.keys())), 'Sample size too large' sdata = sorted(data) Q_mindiff, Q_maxdiff = (0,0), (0,0) if left: Q_min = (sdata[1] - sdata[0]) try: Q_min /= (sdata[-1] - sdata[0]) except ZeroDivisionError: pass Q_mindiff = (Q_min - q_dict[len(data)], sdata[0]) if right: Q_max = abs((sdata[-2] - sdata[-1])) try: Q_max /= abs((sdata[0] - sdata[-1])) except ZeroDivisionError: pass Q_maxdiff = (Q_max - q_dict[len(data)], sdata[-1]) if not Q_mindiff[0] > 0 and not Q_maxdiff[0] > 0: outliers = [None, None] elif Q_mindiff[0] == Q_maxdiff[0]: outliers = [Q_mindiff[1], Q_maxdiff[1]] elif Q_mindiff[0] > Q_maxdiff[0]: outliers = [Q_mindiff[1], None] else: outliers = [None, Q_maxdiff[1]] return outliers def grubbs_test(data: list[float], alpha=0.05): """ Grubbs' test (ESD method) for detecting a single outlier. Keyword arguments: data = A list of data points (int or float). alpha = Significance level (default 0.05 for 95% confidence). Returns a tuple: (is_outlier, outlier_value, G_statistic, G_critical) """ assert len(data) >= 3, 'Grubbs test requires at least 3 data points' assert len(data) <= max(GRUBBS_CRITICAL.keys()), 'Sample size too large' n = len(data) mean = sum(data) / n std_dev = math.sqrt(sum((x - mean) ** 2 for x in data) / (n - 1)) if std_dev == 0: return (False, None, 0, 0) g_values = [(abs(x - mean) / std_dev, x) for x in data] g_max, extreme_value = max(g_values) g_critical = GRUBBS_CRITICAL[n] is_outlier = g_max > g_critical return (is_outlier, extreme_value if is_outlier else None, g_max, g_critical) def solve(data: list[float]): print(f"Data: {data}") mean = sum(data) / len(data) std_dev = math.sqrt(sum((x - mean) ** 2 for x in data) / (len(data) - 1)) print(f"Mean: {mean:.4f}") print(f"Standard deviation: {std_dev:.4f}\n") # Grubbs print("=" * 60) print("GRUBBS' TEST") print("=" * 60) is_outlier, outlier_value, g_stat, g_crit = grubbs_test(data) print(f"G statistic: {g_stat:.4f}") print(f"G critical (α=0.05): {g_crit:.4f}") print(f"Most extreme value: {outlier_value if outlier_value else 'N/A'}") print(f"Is outlier? {is_outlier}") if is_outlier: print(f"Detected outlier: {outlier_value}") else: print("No outlier detected") # Dixon print("\n" + "=" * 60) print("DIXON'S Q TEST") print("=" * 60) dixon_outliers = dixon_test(data) print(f"Q critical (α=0.05, n={len(data)}): {Q95[len(data)]:.4f}") print(f"Left outlier (minimum): {dixon_outliers[0] if dixon_outliers[0] else 'None'}") print(f"Right outlier (maximum): {dixon_outliers[1] if dixon_outliers[1] else 'None'}") print("\n" + "=" * 60) print("Comparison") print("=" * 60) grubbs_outliers = {outlier_value} if is_outlier else set() q_outliers = {x for x in dixon_outliers if x is not None} print(f"Grubbs outlier: {grubbs_outliers if grubbs_outliers else 'None'}") print(f"Dixon outliers: {q_outliers if q_outliers else 'None'}") if grubbs_outliers == q_outliers: print("\nBoth agree! ^^") elif len(grubbs_outliers & q_outliers) > 0: print(f"\nPartially agree on: {grubbs_outliers & q_outliers}") else: print("\nTests disagree:") if grubbs_outliers: print(f" Only Grubbs detected: {grubbs_outliers}") if q_outliers - grubbs_outliers: print(f" Only Dixon detected: {q_outliers - grubbs_outliers}") return {"grubbs": (is_outlier, outlier_value, g_stat, g_crit), "dixon": dixon_outliers} if __name__ == "__main__": data = [0.51, 0.49, 0.45, 0.44, 0.45, 0.50, 0.57, 0.47, 0.45, 0.48] solve(data)