"""Affinity Propagation clustering algorithm.""" # Author: Alexandre Gramfort alexandre.gramfort@inria.fr # Gael Varoquaux gael.varoquaux@normalesup.org # License: BSD 3 clause import numpy as np from ..base import BaseEstimator, ClusterMixin from ..utils import as_float_array, check_array from ..utils.validation import check_is_fitted from ..metrics import euclidean_distances from ..metrics import pairwise_distances_argmin def affinity_propagation(S, preference=None, convergence_iter=15, max_iter=200, damping=0.5, copy=True, verbose=False, return_n_iter=False): """Perform Affinity Propagation Clustering of data Read more in the :ref:`User Guide `. Parameters ---------- S : array-like, shape (n_samples, n_samples) Matrix of similarities between points preference : array-like, shape (n_samples,) or float, optional Preferences for each point - points with larger values of preferences are more likely to be chosen as exemplars. The number of exemplars, i.e. of clusters, is influenced by the input preferences value. If the preferences are not passed as arguments, they will be set to the median of the input similarities (resulting in a moderate number of clusters). For a smaller amount of clusters, this can be set to the minimum value of the similarities. convergence_iter : int, optional, default: 15 Number of iterations with no change in the number of estimated clusters that stops the convergence. max_iter : int, optional, default: 200 Maximum number of iterations damping : float, optional, default: 0.5 Damping factor between 0.5 and 1. copy : boolean, optional, default: True If copy is False, the affinity matrix is modified inplace by the algorithm, for memory efficiency verbose : boolean, optional, default: False The verbosity level return_n_iter : bool, default False Whether or not to return the number of iterations. Returns ------- cluster_centers_indices : array, shape (n_clusters,) index of clusters centers labels : array, shape (n_samples,) cluster labels for each point n_iter : int number of iterations run. Returned only if `return_n_iter` is set to True. Notes ----- See examples/cluster/plot_affinity_propagation.py for an example. References ---------- Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages Between Data Points", Science Feb. 2007 """ S = as_float_array(S, copy=copy) n_samples = S.shape[0] if S.shape[0] != S.shape[1]: raise ValueError("S must be a square array (shape=%s)" % repr(S.shape)) if preference is None: preference = np.median(S) if damping < 0.5 or damping >= 1: raise ValueError('damping must be >= 0.5 and < 1') random_state = np.random.RandomState(0) # Place preference on the diagonal of S S.flat[::(n_samples + 1)] = preference A = np.zeros((n_samples, n_samples)) R = np.zeros((n_samples, n_samples)) # Initialize messages # Intermediate results tmp = np.zeros((n_samples, n_samples)) # Remove degeneracies S += ((np.finfo(np.double).eps * S + np.finfo(np.double).tiny * 100) * random_state.randn(n_samples, n_samples)) # Execute parallel affinity propagation updates e = np.zeros((n_samples, convergence_iter)) ind = np.arange(n_samples) for it in range(max_iter): # tmp = A + S; compute responsibilities np.add(A, S, tmp) I = np.argmax(tmp, axis=1) Y = tmp[ind, I] # np.max(A + S, axis=1) tmp[ind, I] = -np.inf Y2 = np.max(tmp, axis=1) # tmp = Rnew np.subtract(S, Y[:, None], tmp) tmp[ind, I] = S[ind, I] - Y2 # Damping tmp *= 1 - damping R *= damping R += tmp # tmp = Rp; compute availabilities np.maximum(R, 0, tmp) tmp.flat[::n_samples + 1] = R.flat[::n_samples + 1] # tmp = -Anew tmp -= np.sum(tmp, axis=0) dA = np.diag(tmp).copy() tmp.clip(0, np.inf, tmp) tmp.flat[::n_samples + 1] = dA # Damping tmp *= 1 - damping A *= damping A -= tmp # Check for convergence E = (np.diag(A) + np.diag(R)) > 0 e[:, it % convergence_iter] = E K = np.sum(E, axis=0) if it >= convergence_iter: se = np.sum(e, axis=1) unconverged = (np.sum((se == convergence_iter) + (se == 0)) != n_samples) if (not unconverged and (K > 0)) or (it == max_iter): if verbose: print("Converged after %d iterations." % it) break else: if verbose: print("Did not converge") I = np.where(np.diag(A + R) > 0)[0] K = I.size # Identify exemplars if K > 0: c = np.argmax(S[:, I], axis=1) c[I] = np.arange(K) # Identify clusters # Refine the final set of exemplars and clusters and return results for k in range(K): ii = np.where(c == k)[0] j = np.argmax(np.sum(S[ii[:, np.newaxis], ii], axis=0)) I[k] = ii[j] c = np.argmax(S[:, I], axis=1) c[I] = np.arange(K) labels = I[c] # Reduce labels to a sorted, gapless, list cluster_centers_indices = np.unique(labels) labels = np.searchsorted(cluster_centers_indices, labels) else: labels = np.empty((n_samples, 1)) cluster_centers_indices = None labels.fill(np.nan) if return_n_iter: return cluster_centers_indices, labels, it + 1 else: return cluster_centers_indices, labels ############################################################################### class AffinityPropagation(BaseEstimator, ClusterMixin): """Perform Affinity Propagation Clustering of data. Read more in the :ref:`User Guide `. Parameters ---------- damping : float, optional, default: 0.5 Damping factor between 0.5 and 1. convergence_iter : int, optional, default: 15 Number of iterations with no change in the number of estimated clusters that stops the convergence. max_iter : int, optional, default: 200 Maximum number of iterations. copy : boolean, optional, default: True Make a copy of input data. preference : array-like, shape (n_samples,) or float, optional Preferences for each point - points with larger values of preferences are more likely to be chosen as exemplars. The number of exemplars, ie of clusters, is influenced by the input preferences value. If the preferences are not passed as arguments, they will be set to the median of the input similarities. affinity : string, optional, default=``euclidean`` Which affinity to use. At the moment ``precomputed`` and ``euclidean`` are supported. ``euclidean`` uses the negative squared euclidean distance between points. verbose : boolean, optional, default: False Whether to be verbose. Attributes ---------- cluster_centers_indices_ : array, shape (n_clusters,) Indices of cluster centers cluster_centers_ : array, shape (n_clusters, n_features) Cluster centers (if affinity != ``precomputed``). labels_ : array, shape (n_samples,) Labels of each point affinity_matrix_ : array, shape (n_samples, n_samples) Stores the affinity matrix used in ``fit``. n_iter_ : int Number of iterations taken to converge. Notes ----- See examples/cluster/plot_affinity_propagation.py for an example. The algorithmic complexity of affinity propagation is quadratic in the number of points. References ---------- Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages Between Data Points", Science Feb. 2007 """ def __init__(self, damping=.5, max_iter=200, convergence_iter=15, copy=True, preference=None, affinity='euclidean', verbose=False): self.damping = damping self.max_iter = max_iter self.convergence_iter = convergence_iter self.copy = copy self.verbose = verbose self.preference = preference self.affinity = affinity @property def _pairwise(self): return self.affinity == "precomputed" def fit(self, X, y=None): """ Create affinity matrix from negative euclidean distances, then apply affinity propagation clustering. Parameters ---------- X: array-like, shape (n_samples, n_features) or (n_samples, n_samples) Data matrix or, if affinity is ``precomputed``, matrix of similarities / affinities. """ X = check_array(X, accept_sparse='csr') if self.affinity == "precomputed": self.affinity_matrix_ = X elif self.affinity == "euclidean": self.affinity_matrix_ = -euclidean_distances(X, squared=True) else: raise ValueError("Affinity must be 'precomputed' or " "'euclidean'. Got %s instead" % str(self.affinity)) self.cluster_centers_indices_, self.labels_, self.n_iter_ = \ affinity_propagation( self.affinity_matrix_, self.preference, max_iter=self.max_iter, convergence_iter=self.convergence_iter, damping=self.damping, copy=self.copy, verbose=self.verbose, return_n_iter=True) if self.affinity != "precomputed": self.cluster_centers_ = X[self.cluster_centers_indices_].copy() return self def predict(self, X): """Predict the closest cluster each sample in X belongs to. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) New data to predict. Returns ------- labels : array, shape (n_samples,) Index of the cluster each sample belongs to. """ check_is_fitted(self, "cluster_centers_indices_") if not hasattr(self, "cluster_centers_"): raise ValueError("Predict method is not supported when " "affinity='precomputed'.") return pairwise_distances_argmin(X, self.cluster_centers_)