Source code for validclust.validclust

import re

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize

from validclust.indices import (
    _dunn, cop, _davies_bouldin_score2, _silhouette_score2,
    _calinski_harabaz_score2
)


class ValidClust:
    """Validate clustering results

    Parameters
    ----------
    k : int or list of int
        The number of clusters to partition your data into.
    indices : str or list of str, optional
        The cluster validity indices to calculate. Acceptable values
        include 'silhouette', 'calinski', 'davies', 'dunn', and 'cop'. You
        can use a three-character abbreviation for these values as well.
        For example, you could specify ``indices=['cal', 'dav', 'dun']``.
    methods : str or list of str, optional
        The clustering algorithm(s) to use. Acceptable values are
        'hierarchical' and 'kmeans'.
    linkage : {'ward', 'complete', 'average', 'single'}, optional
        Which linkage criterion to use for hierarchical clustering. See
        the `sklearn docs
        <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html#sklearn.cluster.AgglomerativeClustering>`_
        for more details.
    affinity : {'euclidean', 'l1', 'l2', 'manhattan', 'cosine'}, optional
        The metric used to compute the linkage for hierarchical
        clustering. Note, you must specify ``affinity='euclidean'`` when
        ``linkage='ward'``. See the sklearn docs linked above for more
        details.

    Attributes
    ----------
    score_df : DataFrame
        A Pandas DataFrame with the computed cluster validity index
        values.
    """
    def __init__(self, k,
                 # No big deal that these are lists (i.e., mutable), given
                 # that we don't mutate them inside the class.
                 indices=['silhouette', 'calinski', 'davies', 'dunn'],
                 methods=['hierarchical', 'kmeans'],
                 linkage='ward', affinity='euclidean'):
        # Accept either scalars or lists for k, indices, and methods
        k, indices, methods = (
            [i] if type(i) in [int, str] else i
            for i in [k, indices, methods]
        )

        if linkage == 'ward' and affinity != 'euclidean':
            raise ValueError(
                "You must specify `affinity='euclidean'` when using the "
                "ward linkage type"
            )

        # Expand three-character aliases (e.g., 'cal') to full index names
        ok_indices = ['silhouette', 'calinski', 'davies', 'dunn', 'cop']
        ind_aliases = {i[0:3]: i for i in ok_indices}
        indices = [
            ind_aliases[i] if i in ind_aliases else i for i in indices
        ]
        for i in indices:
            if i not in ok_indices:
                raise ValueError('{0} is not a valid index value'.format(i))

        self.k = k
        self.indices = indices
        self.methods = methods
        self.linkage = linkage
        self.affinity = affinity

        self.score_df = None

    def __repr__(self):
        argspec = [
            '{}={}'.format(' ' + key, value)
            for key, value in self.__dict__.items() if key != 'score_df'
        ]
        argspec = ',\n'.join(argspec)
        argspec = re.sub('(linkage|affinity)=(\\w*)', "\\1='\\2'", argspec)
        return 'ValidClust(\n' + argspec + '\n)'

    def _get_method_objs(self):
        method_switcher = {
            'hierarchical': AgglomerativeClustering(),
            'kmeans': KMeans()
        }
        objs = {i: method_switcher[i] for i in self.methods}
        for key, value in objs.items():
            if key == 'hierarchical':
                value.set_params(linkage=self.linkage, affinity=self.affinity)
        return objs

    def _get_index_funs(self):
        index_fun_switcher = {
            'silhouette': _silhouette_score2,
            'calinski': _calinski_harabaz_score2,
            'davies': _davies_bouldin_score2,
            'dunn': _dunn,
            'cop': cop
        }
        return {i: index_fun_switcher[i] for i in self.indices}
    def fit(self, data):
        """Fit the clustering algorithm(s) to the data and calculate the
        CVI scores

        Parameters
        ----------
        data : array-like, shape = [n_samples, n_features]
            The data to cluster.

        Returns
        -------
        self
            A ``ValidClust`` object whose ``score_df`` attribute contains
            the calculated CVI scores.
        """
        method_objs = self._get_method_objs()
        index_funs = self._get_index_funs()

        dist_inds = ['silhouette', 'dunn']
        d_overlap = [i for i in self.indices if i in dist_inds]
        if d_overlap:
            dist = pairwise_distances(data)
            np.fill_diagonal(dist, 0)
        else:
            dist = None

        index = pd.MultiIndex.from_product(
            [self.methods, self.indices],
            names=['method', 'index']
        )
        output_df = pd.DataFrame(
            index=index, columns=self.k, dtype=np.float64
        )

        for k in self.k:
            for alg_name, alg_obj in method_objs.items():
                alg_obj.set_params(n_clusters=k)
                labels = alg_obj.fit_predict(data)
                # have to iterate over self.indices here so that ordering
                # of validity indices is same in scores list as it is in
                # output_df
                scores = [
                    index_funs[key](data, dist, labels)
                    for key in self.indices
                ]
                output_df.loc[(alg_name, self.indices), k] = scores

        self.score_df = output_df
        return self
    def fit_predict(self, data):
        """Fit the clustering algorithm(s) to the data and calculate the
        CVI scores

        Parameters
        ----------
        data : array-like, shape = [n_samples, n_features]
            The data to cluster.

        Returns
        -------
        DataFrame
            A Pandas DataFrame with the computed cluster validity index
            values (``self.score_df``).
        """
        return self.fit(data).score_df
    def _normalize(self):
        score_df_norm = self.score_df.copy()
        # Davies-Bouldin and COP are "lower is better" indices, so flip
        # them before max-normalizing each row
        for i in ['davies', 'cop']:
            if i in self.indices:
                score_df_norm.loc[(slice(None), i), :] = \
                    1 - score_df_norm.loc[(slice(None), i), :]
        normalize(score_df_norm, norm='max', copy=False)
        return score_df_norm
    def plot(self):
        """Plot normalized CVI scores in a heatmap

        The CVI scores are normalized along each method/index pair using
        the `max norm
        <https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.normalize.html>`_.
        Note that, because the scores are normalized along each
        method/index pair, you should compare the colors of the cells in
        the heatmap only within a given row. You should not, for
        instance, compare the color of the cells in the "kmeans, dunn"
        row with those in the "kmeans, silhouette" row.

        Returns
        -------
        None
            Nothing is returned. Instead, a plot is rendered using a
            graphics backend.
        """
        norm_df = self._normalize()
        yticklabels = [',\n'.join(i) for i in norm_df.index.values]
        hmap = sns.heatmap(
            norm_df, cmap='Blues', cbar=False, yticklabels=yticklabels
        )
        hmap.set_xlabel('\nNumber of clusters')
        hmap.set_ylabel('Method, index\n')
        plt.tight_layout()
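
# A minimal usage sketch, not part of the library source: it assumes
# synthetic input from sklearn's `make_blobs` (purely illustrative) and
# shows the typical flow through the class above -- construct ValidClust
# with a range of k, call fit_predict() to get the CVI table, then plot()
# the row-normalized heatmap.
if __name__ == '__main__':
    from sklearn.datasets import make_blobs

    # Toy data with 4 well-separated clusters
    X, _ = make_blobs(n_samples=300, centers=4, random_state=0)

    vclust = ValidClust(
        k=list(range(2, 7)),
        methods=['hierarchical', 'kmeans']
    )
    score_df = vclust.fit_predict(X)  # DataFrame indexed by (method, index)
    print(score_df)

    vclust.plot()  # heatmap of max-normalized scores
    plt.show()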