from __future__ import division, print_function, absolute_import import numpy as np from numpy.testing import (run_module_suite, assert_equal, assert_array_equal, assert_array_almost_equal, assert_approx_equal, assert_raises, assert_allclose) from scipy.special import xlogy from scipy.stats.contingency import margins, expected_freq, chi2_contingency def test_margins(): a = np.array([1]) m = margins(a) assert_equal(len(m), 1) m0 = m[0] assert_array_equal(m0, np.array([1])) a = np.array([[1]]) m0, m1 = margins(a) expected0 = np.array([[1]]) expected1 = np.array([[1]]) assert_array_equal(m0, expected0) assert_array_equal(m1, expected1) a = np.arange(12).reshape(2, 6) m0, m1 = margins(a) expected0 = np.array([[15], [51]]) expected1 = np.array([[6, 8, 10, 12, 14, 16]]) assert_array_equal(m0, expected0) assert_array_equal(m1, expected1) a = np.arange(24).reshape(2, 3, 4) m0, m1, m2 = margins(a) expected0 = np.array([[[66]], [[210]]]) expected1 = np.array([[[60], [92], [124]]]) expected2 = np.array([[[60, 66, 72, 78]]]) assert_array_equal(m0, expected0) assert_array_equal(m1, expected1) assert_array_equal(m2, expected2) def test_expected_freq(): assert_array_equal(expected_freq([1]), np.array([1.0])) observed = np.array([[[2, 0], [0, 2]], [[0, 2], [2, 0]], [[1, 1], [1, 1]]]) e = expected_freq(observed) assert_array_equal(e, np.ones_like(observed)) observed = np.array([[10, 10, 20], [20, 20, 20]]) e = expected_freq(observed) correct = np.array([[12., 12., 16.], [18., 18., 24.]]) assert_array_almost_equal(e, correct) def test_chi2_contingency_trivial(): # Some very simple tests for chi2_contingency. # A trivial case obs = np.array([[1, 2], [1, 2]]) chi2, p, dof, expected = chi2_contingency(obs, correction=False) assert_equal(chi2, 0.0) assert_equal(p, 1.0) assert_equal(dof, 1) assert_array_equal(obs, expected) # A *really* trivial case: 1-D data. obs = np.array([1, 2, 3]) chi2, p, dof, expected = chi2_contingency(obs, correction=False) assert_equal(chi2, 0.0) assert_equal(p, 1.0) assert_equal(dof, 0) assert_array_equal(obs, expected) def test_chi2_contingency_R(): # Some test cases that were computed independently, using R. Rcode = \ """ # Data vector. data <- c( 12, 34, 23, 4, 47, 11, 35, 31, 11, 34, 10, 18, 12, 32, 9, 18, 13, 19, 12, 12, 14, 9, 33, 25 ) # Create factor tags:r=rows, c=columns, t=tiers r <- factor(gl(4, 2*3, 2*3*4, labels=c("r1", "r2", "r3", "r4"))) c <- factor(gl(3, 1, 2*3*4, labels=c("c1", "c2", "c3"))) t <- factor(gl(2, 3, 2*3*4, labels=c("t1", "t2"))) # 3-way Chi squared test of independence s = summary(xtabs(data~r+c+t)) print(s) """ Routput = \ """ Call: xtabs(formula = data ~ r + c + t) Number of cases in table: 478 Number of factors: 3 Test for independence of all factors: Chisq = 102.17, df = 17, p-value = 3.514e-14 """ obs = np.array( [[[12, 34, 23], [35, 31, 11], [12, 32, 9], [12, 12, 14]], [[4, 47, 11], [34, 10, 18], [18, 13, 19], [9, 33, 25]]]) chi2, p, dof, expected = chi2_contingency(obs) assert_approx_equal(chi2, 102.17, significant=5) assert_approx_equal(p, 3.514e-14, significant=4) assert_equal(dof, 17) Rcode = \ """ # Data vector. data <- c( # 12, 17, 11, 16, # 11, 12, 15, 16, # 23, 15, 30, 22, # 14, 17, 15, 16 ) # Create factor tags:r=rows, c=columns, d=depths(?), t=tiers r <- factor(gl(2, 2, 2*2*2*2, labels=c("r1", "r2"))) c <- factor(gl(2, 1, 2*2*2*2, labels=c("c1", "c2"))) d <- factor(gl(2, 4, 2*2*2*2, labels=c("d1", "d2"))) t <- factor(gl(2, 8, 2*2*2*2, labels=c("t1", "t2"))) # 4-way Chi squared test of independence s = summary(xtabs(data~r+c+d+t)) print(s) """ Routput = \ """ Call: xtabs(formula = data ~ r + c + d + t) Number of cases in table: 262 Number of factors: 4 Test for independence of all factors: Chisq = 8.758, df = 11, p-value = 0.6442 """ obs = np.array( [[[[12, 17], [11, 16]], [[11, 12], [15, 16]]], [[[23, 15], [30, 22]], [[14, 17], [15, 16]]]]) chi2, p, dof, expected = chi2_contingency(obs) assert_approx_equal(chi2, 8.758, significant=4) assert_approx_equal(p, 0.6442, significant=4) assert_equal(dof, 11) def test_chi2_contingency_g(): c = np.array([[15, 60], [15, 90]]) g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood', correction=False) assert_allclose(g, 2*xlogy(c, c/e).sum()) g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood', correction=True) c_corr = c + np.array([[-0.5, 0.5], [0.5, -0.5]]) assert_allclose(g, 2*xlogy(c_corr, c_corr/e).sum()) c = np.array([[10, 12, 10], [12, 10, 10]]) g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood') assert_allclose(g, 2*xlogy(c, c/e).sum()) def test_chi2_contingency_bad_args(): # Test that "bad" inputs raise a ValueError. # Negative value in the array of observed frequencies. obs = np.array([[-1, 10], [1, 2]]) assert_raises(ValueError, chi2_contingency, obs) # The zeros in this will result in zeros in the array # of expected frequencies. obs = np.array([[0, 1], [0, 1]]) assert_raises(ValueError, chi2_contingency, obs) # A degenerate case: `observed` has size 0. obs = np.empty((0, 8)) assert_raises(ValueError, chi2_contingency, obs) if __name__ == "__main__": run_module_suite()