#!/usr/bin/env pypy3

# TASK: Look for a gross error in the given data using Dixon's Q test and
# Grubbs' test, then compare the results.

import math

# [95% confidence] Critical Q values for Dixon's Q test
Q95 = {
    3: 0.970, 4: 0.829, 5: 0.710, 6: 0.625, 7: 0.568,
    8: 0.526, 9: 0.493, 10: 0.466, 11: 0.444, 12: 0.426,
    13: 0.410, 14: 0.396, 15: 0.384, 16: 0.374, 17: 0.365,
    18: 0.356, 19: 0.349, 20: 0.342, 21: 0.337, 22: 0.331,
    23: 0.326, 24: 0.321, 25: 0.317, 26: 0.312, 27: 0.308,
    28: 0.305, 29: 0.301, 30: 0.290
}
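
# Dixon's Q statistic for a suspect value is the gap between it and its
# nearest neighbour, divided by the full range of the sample:
#
#     Q = |x_suspect - x_nearest| / (x_max - x_min)
#
# The suspect value is rejected at 95% confidence when Q exceeds Q95[n].
# (This is the r10 form of the statistic, which is what dixon_test() below
# computes.)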

# [95% confidence] Critical G values for Grubbs' test (two-sided), derived
# from the t-distribution
GRUBBS_CRITICAL = {
    3: 1.155, 4: 1.481, 5: 1.715, 6: 1.887, 7: 2.020, 8: 2.126,
    9: 2.215, 10: 2.290, 11: 2.355, 12: 2.412, 13: 2.462, 14: 2.507,
    15: 2.549, 16: 2.585, 17: 2.620, 18: 2.651, 19: 2.681, 20: 2.709,
    21: 2.733, 22: 2.758, 23: 2.781, 24: 2.802, 25: 2.822, 26: 2.841,
    27: 2.859, 28: 2.876, 29: 2.893, 30: 2.908, 31: 2.924, 32: 2.938,
    33: 2.952, 34: 2.965, 35: 2.979, 36: 2.991, 37: 3.003, 38: 3.014,
    39: 3.025, 40: 3.036
}
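

# The tabulated values above can be reproduced (to within rounding) from the
# t-distribution via the standard two-sided Grubbs formula
#
#     G_crit = ((n - 1) / sqrt(n)) * sqrt(t^2 / (n - 2 + t^2)),
#
# where t is the upper alpha/(2n) critical point of Student's t with n - 2
# degrees of freedom. The helper below is an optional sketch of that formula;
# it is not used by the rest of the script and needs SciPy, an extra
# dependency this file does not otherwise require.
def grubbs_critical_value(n: int, alpha: float = 0.05) -> float:
    """Two-sided Grubbs critical value for sample size n (requires SciPy)."""
    from scipy.stats import t as t_dist  # local import: optional dependency
    t_crit = t_dist.ppf(1 - alpha / (2 * n), n - 2)
    return (n - 1) / math.sqrt(n) * math.sqrt(t_crit ** 2 / (n - 2 + t_crit ** 2))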


def dixon_test(data, left=True, right=True, q_dict=Q95):
    """
    Keyword arguments:
        data = An ordered or unordered list of data points (int or float).
        left = Q-test of the minimum value in the ordered list if True.
        right = Q-test of the maximum value in the ordered list if True.
        q_dict = A dictionary of critical Q values for a given confidence
            level, where the dict keys are sample sizes N and the associated
            values are the corresponding critical Q values, e.g.,
            {3: 0.97, 4: 0.829, 5: 0.71, 6: 0.625, ...}

    Returns a list of two values [min outlier, max outlier], with None in
    place of any value that is not an outlier. E.g.,
        for [1, 1, 1] -> [None, None]
        for [5, 1, 1] -> [None, 5]
        for [5, 1, 5] -> [1, None]
    """
    assert left or right, 'At least one of the variables, `left` or `right`, must be True.'
    assert len(data) >= 3, 'At least 3 data points are required'
    assert len(data) <= max(q_dict.keys()), 'Sample size too large'

    sdata = sorted(data)
    Q_mindiff, Q_maxdiff = (0, 0), (0, 0)

    if left:
        # Gap between the suspect minimum and its nearest neighbour,
        # divided by the full range of the sample.
        Q_min = sdata[1] - sdata[0]
        try:
            Q_min /= (sdata[-1] - sdata[0])
        except ZeroDivisionError:
            pass
        Q_mindiff = (Q_min - q_dict[len(data)], sdata[0])

    if right:
        # Same ratio for the suspect maximum.
        Q_max = abs(sdata[-2] - sdata[-1])
        try:
            Q_max /= abs(sdata[0] - sdata[-1])
        except ZeroDivisionError:
            pass
        Q_maxdiff = (Q_max - q_dict[len(data)], sdata[-1])

    # Report whichever suspect value exceeds its critical Q by the larger
    # margin; a tie reports both ends, and nothing is reported when neither
    # exceeds the critical value.
    if not Q_mindiff[0] > 0 and not Q_maxdiff[0] > 0:
        outliers = [None, None]
    elif Q_mindiff[0] == Q_maxdiff[0]:
        outliers = [Q_mindiff[1], Q_maxdiff[1]]
    elif Q_mindiff[0] > Q_maxdiff[0]:
        outliers = [Q_mindiff[1], None]
    else:
        outliers = [None, Q_maxdiff[1]]

    return outliers
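
# Minimal usage sketch for dixon_test (hand-checked against the Q95 table; the
# dataset here is made up for illustration):
#
#     >>> dixon_test([0.20, 0.45, 0.44, 0.45, 0.50])
#     [0.2, None]
#
# Sorted, the suspect minimum gives Q = (0.44 - 0.20) / (0.50 - 0.20) = 0.80,
# which exceeds Q95[5] = 0.710, so 0.20 is flagged; the suspect maximum gives
# Q = (0.50 - 0.45) / (0.50 - 0.20) ≈ 0.17 and is kept.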


def grubbs_test(data: list[float], alpha=0.05):
    """
    Grubbs' test (extreme studentized deviate) for detecting a single outlier.

    Keyword arguments:
        data = A list of data points (int or float).
        alpha = Significance level. Only 0.05 (95% confidence) is supported,
            since the GRUBBS_CRITICAL table is tabulated for that level.

    Returns a tuple: (is_outlier, outlier_value, G_statistic, G_critical)
    """
    assert len(data) >= 3, 'Grubbs test requires at least 3 data points'
    assert len(data) <= max(GRUBBS_CRITICAL.keys()), 'Sample size too large'

    n = len(data)
    mean = sum(data) / n
    # Sample standard deviation (n - 1 in the denominator)
    std_dev = math.sqrt(sum((x - mean) ** 2 for x in data) / (n - 1))

    # All values identical: no spread, no outlier
    if std_dev == 0:
        return (False, None, 0, 0)

    # G = max |x - mean| / s; the most extreme value is the candidate outlier
    g_values = [(abs(x - mean) / std_dev, x) for x in data]
    g_max, extreme_value = max(g_values)

    g_critical = GRUBBS_CRITICAL[n]
    is_outlier = g_max > g_critical

    return (is_outlier, extreme_value if is_outlier else None, g_max, g_critical)
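
# Minimal usage sketch for grubbs_test (hand-worked, values rounded):
#
#     >>> grubbs_test([1, 1, 1, 1, 10])
#     (True, 10, 1.788..., 1.715)
#
# mean = 2.8, s = sqrt(64.8 / 4) ≈ 4.025, G = |10 - 2.8| / 4.025 ≈ 1.789,
# which exceeds the n = 5 critical value 1.715, so 10 is flagged.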


def solve(data: list[float]):
    print(f"Data: {data}")

    mean = sum(data) / len(data)
    std_dev = math.sqrt(sum((x - mean) ** 2 for x in data) / (len(data) - 1))
    print(f"Mean: {mean:.4f}")
    print(f"Standard deviation: {std_dev:.4f}\n")

    # Grubbs
    print("=" * 60)
    print("GRUBBS' TEST")
    print("=" * 60)
    is_outlier, outlier_value, g_stat, g_crit = grubbs_test(data)
    print(f"G statistic: {g_stat:.4f}")
    print(f"G critical (α=0.05): {g_crit:.4f}")
    print(f"Most extreme value: {outlier_value if outlier_value is not None else 'N/A'}")
    print(f"Is outlier? {is_outlier}")
    if is_outlier:
        print(f"Detected outlier: {outlier_value}")
    else:
        print("No outlier detected")

    # Dixon
    print("\n" + "=" * 60)
    print("DIXON'S Q TEST")
    print("=" * 60)
    dixon_outliers = dixon_test(data)
    print(f"Q critical (α=0.05, n={len(data)}): {Q95[len(data)]:.4f}")
    print(f"Left outlier (minimum): {dixon_outliers[0] if dixon_outliers[0] is not None else 'None'}")
    print(f"Right outlier (maximum): {dixon_outliers[1] if dixon_outliers[1] is not None else 'None'}")

    print("\n" + "=" * 60)
    print("Comparison")
    print("=" * 60)

    # Collect each test's detections as sets so they are easy to compare
    grubbs_outliers = {outlier_value} if is_outlier else set()
    q_outliers = {x for x in dixon_outliers if x is not None}

    print(f"Grubbs outlier: {grubbs_outliers if grubbs_outliers else 'None'}")
    print(f"Dixon outliers: {q_outliers if q_outliers else 'None'}")

    if grubbs_outliers == q_outliers:
        print("\nBoth agree! ^^")
    elif len(grubbs_outliers & q_outliers) > 0:
        print(f"\nPartially agree on: {grubbs_outliers & q_outliers}")
    else:
        print("\nTests disagree:")
        if grubbs_outliers:
            print(f" Only Grubbs detected: {grubbs_outliers}")
        if q_outliers - grubbs_outliers:
            print(f" Only Dixon detected: {q_outliers - grubbs_outliers}")

    return {"grubbs": (is_outlier, outlier_value, g_stat, g_crit), "dixon": dixon_outliers}


if __name__ == "__main__":
    data = [0.51, 0.49, 0.45, 0.44, 0.45, 0.50, 0.57, 0.47, 0.45, 0.48]
    solve(data)
|