""" Testing for the nearest centroid module. """ import numpy as np from scipy import sparse as sp from numpy.testing import assert_array_equal from numpy.testing import assert_equal from sklearn.neighbors import NearestCentroid from sklearn import datasets from sklearn.metrics.pairwise import pairwise_distances # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] X_csr = sp.csr_matrix(X) # Sparse matrix y = [-1, -1, -1, 1, 1, 1] T = [[-1, -1], [2, 2], [3, 2]] T_csr = sp.csr_matrix(T) true_result = [-1, 1, 1] # also load the iris dataset # and randomly permute it iris = datasets.load_iris() rng = np.random.RandomState(1) perm = rng.permutation(iris.target.size) iris.data = iris.data[perm] iris.target = iris.target[perm] def test_classification_toy(): # Check classification on a toy dataset, including sparse versions. clf = NearestCentroid() clf.fit(X, y) assert_array_equal(clf.predict(T), true_result) # Same test, but with a sparse matrix to fit and test. clf = NearestCentroid() clf.fit(X_csr, y) assert_array_equal(clf.predict(T_csr), true_result) # Fit with sparse, test with non-sparse clf = NearestCentroid() clf.fit(X_csr, y) assert_array_equal(clf.predict(T), true_result) # Fit with non-sparse, test with sparse clf = NearestCentroid() clf.fit(X, y) assert_array_equal(clf.predict(T_csr), true_result) # Fit and predict with non-CSR sparse matrices clf = NearestCentroid() clf.fit(X_csr.tocoo(), y) assert_array_equal(clf.predict(T_csr.tolil()), true_result) def test_precomputed(): clf = NearestCentroid(metric="precomputed") clf.fit(X, y) S = pairwise_distances(T, clf.centroids_) assert_array_equal(clf.predict(S), true_result) def test_iris(): # Check consistency on dataset iris. for metric in ('euclidean', 'cosine'): clf = NearestCentroid(metric=metric).fit(iris.data, iris.target) score = np.mean(clf.predict(iris.data) == iris.target) assert score > 0.9, "Failed with score = " + str(score) def test_iris_shrinkage(): # Check consistency on dataset iris, when using shrinkage. for metric in ('euclidean', 'cosine'): for shrink_threshold in [None, 0.1, 0.5]: clf = NearestCentroid(metric=metric, shrink_threshold=shrink_threshold) clf = clf.fit(iris.data, iris.target) score = np.mean(clf.predict(iris.data) == iris.target) assert score > 0.8, "Failed with score = " + str(score) def test_pickle(): import pickle # classification obj = NearestCentroid() obj.fit(iris.data, iris.target) score = obj.score(iris.data, iris.target) s = pickle.dumps(obj) obj2 = pickle.loads(s) assert_equal(type(obj2), obj.__class__) score2 = obj2.score(iris.data, iris.target) assert_array_equal(score, score2, "Failed to generate same score" " after pickling (classification).") def test_shrinkage_threshold_decoded_y(): clf = NearestCentroid(shrink_threshold=0.01) y_ind = np.asarray(y) y_ind[y_ind == -1] = 0 clf.fit(X, y_ind) centroid_encoded = clf.centroids_ clf.fit(X, y) assert_array_equal(centroid_encoded, clf.centroids_) def test_predict_translated_data(): # Test that NearestCentroid gives same results on translated data rng = np.random.RandomState(0) X = rng.rand(50, 50) y = rng.randint(0, 3, 50) noise = rng.rand(50) clf = NearestCentroid(shrink_threshold=0.1) clf.fit(X, y) y_init = clf.predict(X) clf = NearestCentroid(shrink_threshold=0.1) X_noise = X + noise clf.fit(X_noise, y) y_translate = clf.predict(X_noise) assert_array_equal(y_init, y_translate) def test_manhattan_metric(): # Test the manhattan metric. clf = NearestCentroid(metric='manhattan') clf.fit(X, y) dense_centroid = clf.centroids_ clf.fit(X_csr, y) assert_array_equal(clf.centroids_, dense_centroid) assert_array_equal(dense_centroid, [[-1, -1], [1, 1]])