Source code for clustering_metrics.metrics

# -*- coding: utf-8 -*-

"""
"""

import warnings
import numpy as np
from math import log, sqrt, copysign
from collections import Set, namedtuple
from pymaptools.containers import CrossTab, OrderedCrossTab
from pymaptools.iter import iter_items, isiterable
from pymaptools.sample import randround
from clustering_metrics.utils import _div, _log
from clustering_metrics.entropy import fentropy, fnum_pairs, fsum_pairs, \
    emi_from_margins, assignment_cost
from scipy.stats import fisher_exact


[docs]def jaccard_similarity(iterable1, iterable2): """Jaccard similarity between two sets Parameters ---------- iterable1 : collections.Iterable first bag of items (order irrelevant) iterable2 : collections.Iterable second bag of items (order irrelevant) Returns ------- jaccard_similarity : float """ t = ConfusionMatrix2.from_sets(iterable1, iterable2) return t.jaccard_coeff()
[docs]def ratio2weights(ratio): """Numerically accurate conversion of ratio of two weights to weights """ if ratio <= 1.0: lweight = ratio / (1.0 + ratio) else: lweight = 1.0 / (1.0 + 1.0 / ratio) return lweight, 1.0 - lweight
[docs]def geometric_mean(x, y): """Geometric mean of two numbers. Always returns a float Although geometric mean is defined for negative numbers, Scipy function doesn't allow it. Hence this function """ prod = x * y if prod < 0.0: raise ValueError("x and y have different signs") return copysign(1, x) * sqrt(prod)
[docs]def geometric_mean_weighted(x, y, ratio=1.0): """Geometric mean of two numbers with a weight ratio. Returns a float :: >>> geometric_mean_weighted(1, 4, ratio=1.0) 2.0 >>> geometric_mean_weighted(1, 4, ratio=0.0) 1.0 >>> geometric_mean_weighted(1, 4, ratio=float('inf')) 4.0 """ lweight, rweight = ratio2weights(ratio) lsign = copysign(1, x) rsign = copysign(1, y) if lsign != rsign and x != y: raise ValueError("x and y have different signs") return lsign * (abs(x) ** rweight) * (abs(y) ** lweight)
[docs]def unitsq_sigmoid(x, s=0.5): """Unit square sigmoid (for transforming P-like scales) :: >>> round(unitsq_sigmoid(0.1), 4) 0.25 >>> round(unitsq_sigmoid(0.9), 4) 0.75 """ a = x ** s b = (1 - x) ** s return a / (a + b)
[docs]def harmonic_mean(x, y): """Harmonic mean of two numbers. Always returns a float """ return float(x) if x == y else 2.0 * (x * y) / (x + y)
[docs]def harmonic_mean_weighted(x, y, ratio=1.0): """Harmonic mean of two numbers with a weight ratio. Returns a float :: >>> harmonic_mean_weighted(1, 3, ratio=1.0) 1.5 >>> harmonic_mean_weighted(1, 3, ratio=0.0) 1.0 >>> harmonic_mean_weighted(1, 3, ratio=float('inf')) 3.0 """ lweight, rweight = ratio2weights(ratio) return float(x) if x == y else (x * y) / (lweight * x + rweight * y)
[docs]class ContingencyTable(CrossTab): # Note: not subclassing Pandas DataFrame because the goal is to specifically # optimize for sparse use cases when >90% of the table consists of zeros. # As of today, Pandas 'crosstab' implementation of frequency tables forces # one to iterate on all the zeros, which is horrible... def __init__(self, *args, **kwargs): CrossTab.__init__(self, *args, **kwargs) self._assignment_cost = None self._expected_freqs_ = {} self._expected_freqs_discrete_ = {}
[docs] def to_array(self, default=0, cpad=False, rpad=False): """Convert to NumPy array """ rows = self.to_rows(default=default, cpad=cpad, rpad=rpad) return np.array(rows)
# Factory methods
[docs] def expected_freqs_(self, model='m3'): table = self._expected_freqs_.get(model) if table is not None: return table N = float(self.grand_total) row_margin = self.row_totals col_margin = self.col_totals R, C = self.shape if model == 'm1': # no fixed margin rows = self._row_type_2d() expected = N / float(len(row_margin) * len(col_margin)) for (ri, ci), _ in self.iter_all(): rows[ri][ci] = expected elif model == 'm2r': # fixed row margin rows = self._row_type_2d() cm = N / float(len(col_margin)) for (ri, ci), _ in self.iter_all(): rm = row_margin[ri] numer = rm * cm if numer != 0: rows[ri][ci] = numer / N elif model == 'm2c': # fixed column margin rows = self._row_type_2d() rm = N / float(len(row_margin)) for (ri, ci), _ in self.iter_all(): cm = col_margin[ci] numer = rm * cm if numer != 0: rows[ri][ci] = numer / N elif model == 'm3': # fixed row *and* column margin rows = self._row_type_2d() for (ri, ci), _ in self.iter_all(): rm = row_margin[ri] cm = col_margin[ci] numer = rm * cm if numer != 0: rows[ri][ci] = numer / N elif model == 'x1': # no fixed margin rows = np.zeros((R, C), dtype=int) rows[0][0] = self.grand_total elif model == 'x2r': # fixed row margin rows = np.zeros((R, C), dtype=int) for idx, rm in enumerate(self.iter_row_totals()): rows[idx, 0] = rm elif model == 'x2c': # fixed column margin rows = np.zeros((R, C), dtype=int) for idx, cm in enumerate(self.iter_col_totals()): rows[0, idx] = cm else: raise NotImplementedError(model) self._expected_freqs_[model] = table = self.__class__(rows=rows) return table
[docs] def expected(self, model='m3', discrete=False, redraw=False): """Factory creating expected table given current margins """ if discrete: table = self._expected_freqs_discrete_.get(model) else: table = self.expected_freqs_(model) # continuous if not discrete: if table is None: raise RuntimeError("line should be unreachable") return table # discrete if table is None or redraw: continuous = self.expected_freqs_(model) rows = self._row_type_2d() # create a sparse instance for (ri, ci), expected in continuous.iteritems(): expected = randround(expected) if expected != 0: rows[ri][ci] = expected self._expected_freqs_discrete_[model] = table = self.__class__(rows=rows) return table
@staticmethod def _normalize_measure(value, maximum=1.0, center=0.0): """Normalize to maximum with optional centering """ if isiterable(value): value = np.asarray(value) if isiterable(center): center = np.asarray(center) if isiterable(maximum): maximum = np.asarray(maximum) return np.divide(value - center, maximum - center)
[docs] def adjust_to_null(self, measure, model='m3', with_warnings=False): """Adjust a measure to null model The general formula for chance correction of an association measure :math:`M` is: .. math:: M_{adj} = \\frac{M - E(M)}{M_{max} - E(M)}, where :math:`M_{max}` is the maximum value a measure :math:`M` can achieve, and :math:`E(M)` is the expected value of :math:`M` under statistical independence given fixed table margins. In simple cases, the expected value of a measure is the same as the value of the measure given a null model. This is not always the case, however, and, to properly adjust for chance, sometimes one has average over all possible contingency tables using hypergeometric distribution for example. The method returns a tuple for two different measure ceilings: row- diagonal and column-diagonal. For symmetric measures, the two values will be the same. """ if callable(measure): measure = measure.__name__ actual = getattr(self, measure)() model = getattr(self.expected(model=model), measure)() if with_warnings and np.isclose(np.sum(model), 0.0): warnings.warn("'%s' is already centered" % measure) max_row = getattr(self.row_diag(), measure)() if with_warnings and np.isclose(np.average(max_row), 1.0): warnings.warn("'%s' is already row-normalized" % measure) max_col = getattr(self.col_diag(), measure)() if with_warnings and np.isclose(np.average(max_col), 1.0): warnings.warn("'%s' is already column-normalized" % measure) row_adjusted = self._normalize_measure(actual, max_row, model) col_adjusted = self._normalize_measure(actual, max_col, model) return row_adjusted, col_adjusted
[docs] def row_diag(self): """Factory creating diagonal table given current row margin """ rows = self._row_type_2d() for i, m in iter_items(self.row_totals): rows[i][i] = m return self.__class__(rows=rows)
[docs] def col_diag(self): """Factory creating diagonal table given current column margin """ rows = self._row_type_2d() for i, m in iter_items(self.col_totals): rows[i][i] = m return self.__class__(rows=rows)
# Misc metrics
[docs] def chisq_score(self): """Pearson's chi-square statistic >>> r = {1: {1: 16, 3: 2}, 2: {1: 1, 2: 3}, 3: {1: 4, 2: 5, 3: 5}} >>> cm = ContingencyTable(rows=r) >>> round(cm.chisq_score(), 3) 19.256 """ N = float(self.grand_total) score = 0.0 for rm, cm, observed in self.iter_all_with_margins(): numer = rm * cm if numer != 0: expected = numer / N score += (observed - expected) ** 2 / expected return score
[docs] def g_score(self): """G-statistic for RxC contingency table This method does not perform any corrections to this statistic (e.g. Williams', Yates' corrections). The statistic is equivalent to the negative of Mutual Information times two. Mututal Information on a contingency table is defined as the difference between the information in the table and the information in an independent table with the same margins. For application of mutual information (in the form of G-score) to search for collocated words in NLP, see [1]_ and [2]_. References ---------- .. [1] `Dunning, T. (1993). Accurate methods for the statistics of surprise and coincidence. Computational linguistics, 19(1), 61-74. <http://dl.acm.org/citation.cfm?id=972454>`_ .. [2] `Ted Dunning's personal blog entry and the discussion under it. <http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html>`_ """ _, _, I_CK = self._entropies() return 2.0 * I_CK
def _entropies(self): """Return H_C, H_K, and mutual information Not normalized by N """ H_C = fentropy(self.row_totals) H_K = fentropy(self.col_totals) H_actual = fentropy(self.itervalues()) H_expected = H_C + H_K I_CK = H_expected - H_actual return H_C, H_K, I_CK
[docs] def mutual_info_score(self): """Mutual Information Score Mutual Information (divided by N). The metric is equal to the Kullback-Leibler divergence of the joint distribution with the product distribution of the marginals. """ _, _, I_CK = self._entropies() return I_CK / self.grand_total
[docs] def entropy_scores(self, mean='harmonic'): """Gives three entropy-based metrics for a RxC table The metrics are: Homogeneity, Completeness, and V-measure The V-measure metric is also known as Normalized Mutual Information (NMI), and is calculated here as the harmonic mean of Homogeneity and Completeness (:math:`NMI_{sum}`). There exist other definitions of NMI (see Table 2 in [1]_ for a good review). Homogeneity and Completeness are duals of each other and can be thought of (although this is not technically accurate) as squared regression coefficients of a given clustering vs true labels (homogeneity) and of the dual problem of true labels vs given clustering (completeness). Because of the dual property, in a symmetric matrix, all three scores are the same. Homogeneity has an overall profile similar to that of precision in information retrieval. Completeness roughly corresponds to recall. This method replaces ``homogeneity_completeness_v_measure`` method in Scikit-Learn. The Scikit-Learn version takes up :math:`O(n^2)` space because it stores data in a dense NumPy array, while the given version is sub-quadratic because of sparse underlying storage. Note that the entropy variables H in the code below are improperly defined because they ought to be divided by N (the grand total for the contingency table). However, the N variable cancels out during normalization. References ---------- .. [1] `Vinh, N. X., Epps, J., & Bailey, J. (2010). Information theoretic measures for clusterings comparison: Variants, properties, normalization and correction for chance. The Journal of Machine Learning Research, 11, 2837-2854. <http://www.jmlr.org/papers/v11/vinh10a.html>`_ """ # ensure non-negative values by taking max of 0 and given value H_C, H_K, I_CK = self._entropies() h = 1.0 if H_C == 0.0 else max(0.0, I_CK / H_C) c = 1.0 if H_K == 0.0 else max(0.0, I_CK / H_K) if mean == 'harmonic': rsquare = harmonic_mean(h, c) elif mean == 'geometric': rsquare = geometric_mean(h, c) else: raise NotImplementedError(mean) return h, c, rsquare
[docs] def adjusted_mutual_info(self): """Adjusted Mutual Information for two partitions For a mathematical definition, see [1]_, [2]_, and [2]_. References ---------- .. [1] `Vinh, N. X., Epps, J., & Bailey, J. (2009, June). Information theoretic measures for clusterings comparison: is a correction for chance necessary?. In Proceedings of the 26th Annual International Conference on Machine Learning (pp. 1073-1080). ACM. <https://doi.org/10.1145/1553374.1553511>`_ .. [2] `Vinh, N. X., & Epps, J. (2009, June). A novel approach for automatic number of clusters detection in microarray data based on consensus clustering. In Bioinformatics and BioEngineering, 2009. BIBE'09. Ninth IEEE International Conference on (pp. 84-91). IEEE. <http://dx.doi.org/10.1109/BIBE.2009.19>`_ .. [3] `Vinh, N. X., Epps, J., & Bailey, J. (2010). Information theoretic measures for clusterings comparison: Variants, properties, normalization and correction for chance. The Journal of Machine Learning Research, 11, 2837-2854. <http://www.jmlr.org/papers/v11/vinh10a.html>`_ """ # Prepare row totals and check for special cases row_totals = np.fromiter(self.iter_row_totals(), dtype=np.int64) col_totals = np.fromiter(self.iter_col_totals(), dtype=np.int64) R = len(row_totals) C = len(col_totals) if R == C == 1 or R == C == 0: # No clustering since the data is not split. This is a perfect match # hence return 1.0. return 1.0 # In one step, calculate entropy for each labeling and mutual # information h_true, h_pred, mi = self._entropies() mi_max = max(h_true, h_pred) # Calculate the expected value for the MI emi = emi_from_margins(row_totals, col_totals) # Calculate the adjusted MI score ami = (mi - emi) / (mi_max - emi) return ami
[docs] def assignment_score_m1(self, normalize=True, redraw=False): return self.assignment_score( normalize=normalize, model='m1', discrete=False, redraw=redraw)
[docs] def assignment_score_m2r(self, normalize=True, redraw=False): return self.assignment_score( normalize=normalize, model='m2r', discrete=False, redraw=redraw)
[docs] def assignment_score_m2c(self, normalize=True, redraw=False): return self.assignment_score( normalize=normalize, model='m2c', discrete=False, redraw=redraw)
[docs] def assignment_score_m3(self, normalize=True, redraw=False): return self.assignment_score( normalize=normalize, model='m3', discrete=False, redraw=redraw)
[docs] def assignment_score(self, normalize=True, model='m1', discrete=False, redraw=False): """Similarity score by solving the Linear Sum Assignment Problem This metric is uniformly more powerful than the similarly behaved ``split_join_similarity`` which relies on an approximation to the optimal solution evaluated here. The split-join approximation asymptotically approaches the optimal solution as the clustering quality improves. On the ``model`` parameter: adjusting assignment cost for chance by relying on the hypergeometric distribution is extremely computationally expensive, but one way to get a better behaved metric is to just subtract the cost of a null model from the obtained score (in case of normalization, the null cost also has to be subtracted from the maximum cost). Note that on large tables even finding the null cost is too expensive, since expected tables have a lot less sparsity. Hence the parameter is off by default. Alternatively this problem can be recast as that of finding a *maximum weighted bipartite match* [1]_. This method of partition comparison was first mentioned in [2]_, given an approximation in [3]_, formally elaborated in [4]_ and empirically compared with other measures in [5]_. See Also -------- split_join_similarity References ---------- .. [1] `Wikipedia entry on weighted bipartite graph matching <https://en.wikipedia.org/wiki/Matching_%28graph_theory%28#In_weighted_bipartite_graphs>`_ .. [2] `Almudevar, A., & Field, C. (1999). Estimation of single-generation sibling relationships based on DNA markers. Journal of agricultural, biological, and environmental statistics, 136-165. <http://www.jstor.org/stable/1400594>`_ .. [3] `Ben-Hur, A., & Guyon, I. (2003). Detecting stable clusters using principal component analysis. In Functional Genomics (pp. 159-182). Humana press. <http://doi.org/10.1385/1-59259-364-X:159>`_ .. [4] `Gusfield, D. (2002). Partition-distance: A problem and class of perfect graphs arising in clustering. Information Processing Letters, 82(3), 159-164. <http://doi.org/10.1016/S0020-0190%2801%2900263-0>`_ .. [5] `Giurcaneanu, C. D., & Tabus, I. (2004). Cluster structure inference based on clustering stability with applications to microarray data analysis. EURASIP Journal on Applied Signal Processing, 2004, 64-80. <http://dl.acm.org/citation.cfm?id=1289345>`_ """ # computing assignment cost is expensive so we cache it cost = self._assignment_cost if cost is None: # guess matrix dtype cost = assignment_cost(self.to_rows(), maximize=True) self._assignment_cost = cost N = self.grand_total R, C = self.shape if model is None: null_cost = 0 elif (not discrete) and model == 'm1': # No margin is fixed, assignment doesn't matter (all cells are # equal under this assumption), so we can calculate expected cost # directly null_cost = N / float(max(R, C)) elif (not discrete) and model == 'm2r': # fixed row margin, assignment also doesn't matter sum_top_rows = N if R <= C else \ sum(sorted(self.row_totals.itervalues(), reverse=True)[:C]) null_cost = sum_top_rows / float(C) elif (not discrete) and model == 'm2c': # fixed column margin, assignment also doesn't matter sum_top_cols = N if C <= R else \ sum(sorted(self.col_totals.itervalues(), reverse=True)[:R]) null_cost = sum_top_cols / float(R) else: # all margins fixed, assignment matters expected = self.expected( model=model, discrete=discrete, redraw=redraw) null_cost = expected.assignment_score( model=None, normalize=False) cost -= null_cost if normalize: max_cost = N - null_cost cost = 1.0 if cost == max_cost else _div(cost, max_cost) return cost
[docs] def vi_distance(self, normalize=True): """Variation of Information distance Defined in [1]_. This is one of several possible entropy-based distance measures that could be defined on a RxC matrix. The given measure is equivalent to :math:`2 D_{sum}` as listed in Table 2 in [2]_. Note that the entropy variables H below are calculated using natural logs, so a base correction may be necessary if you need your result in base 2 for example. References ---------- .. [1] `Meila, M. (2007). Comparing clusterings -- an information based distance. Journal of multivariate analysis, 98(5), 873-895. <https://doi.org/10.1016/j.jmva.2006.11.013>`_ .. [2] `Vinh, N. X., Epps, J., & Bailey, J. (2010). Information theoretic measures for clusterings comparison: Variants, properties, normalization and correction for chance. The Journal of Machine Learning Research, 11, 2837-2854. <http://www.jmlr.org/papers/v11/vinh10a.html>`_ """ H_C, H_K, I_CK = self._entropies() N = self.grand_total score = (H_C + H_K - 2 * I_CK) / N if normalize: score = _div(score, log(N)) return score
[docs] def vi_similarity_m1(self, normalize=True): return self.vi_similarity(normalize=normalize, model='m1')
[docs] def vi_similarity_m2r(self, normalize=True): return self.vi_similarity(normalize=normalize, model='m2r')
[docs] def vi_similarity_m2c(self, normalize=True): return self.vi_similarity(normalize=normalize, model='m2c')
[docs] def vi_similarity_m3(self, normalize=True): return self.vi_similarity(normalize=normalize, model='m3')
[docs] def vi_similarity(self, normalize=True, model='m1'): """Inverse of ``vi_distance`` The m1 adjustment is monotonic for tables of fixed size. The m3 adjustment turns this measure into Normalized Mutual Information (NMI) """ R, C = self.shape N = self.grand_total max_dist = log(N) dist = self.vi_distance(normalize=False) score = max_dist - dist if model is None: null_score = 0 elif model == 'm1': # only N is fixed null_dist = log(R) + log(C) null_score = max_dist - null_dist elif model == 'm2r': # fixed row margin null_dist = log(C) + fentropy(self.row_totals) / N null_score = max_dist - null_dist elif model == 'm2c': # fixed column margin null_dist = log(R) + fentropy(self.col_totals) / N null_score = max_dist - null_dist elif model == 'm3': # both row and column margins fixed null_dist = (fentropy(self.row_totals) + fentropy(self.col_totals)) / N null_score = max_dist - null_dist else: expected = self.expected(model) null_score = expected.vi_similarity(normalize=False, model=None) score -= null_score if normalize: max_score = max_dist - null_score score = 1.0 if score == max_score else _div(score, max_score) return score
[docs] def split_join_distance(self, normalize=True): """Distance metric based on ``split_join_similarity`` """ sim = self.split_join_similarity(normalize=False, model=None) max_sim = 2 * self.grand_total score = max_sim - sim if normalize: score = _div(score, max_sim) return score
[docs] def split_join_similarity_m1(self, normalize=True): return self.split_join_similarity(normalize=normalize, model='m1')
[docs] def split_join_similarity_m2r(self, normalize=True): return self.split_join_similarity(normalize=normalize, model='m2r')
[docs] def split_join_similarity_m2c(self, normalize=True): return self.split_join_similarity(normalize=normalize, model='m2c')
[docs] def split_join_similarity_m3(self, normalize=True): return self.split_join_similarity(normalize=normalize, model='m3')
[docs] def split_join_similarity(self, normalize=True, model='m1'): """Split-join similarity score Split-join similarity is a two-way assignment-based score first proposed in [1]_. The distance variant of this measure has metric properties. Like the better known purity score (a one-way coefficient), this measure implicitly performs class-cluster assignment, except the assignment is performed twice: based on the corresponding maximum frequency in the contingency table, each class is given a cluster with the assignment weighted according to the frequency, then the procedure is inversed to assign a class to each cluster. The final unnormalized distance score comprises of a simple sum of the two one-way assignment scores. By default, ``m1`` null model is subtracted, to make the final score independent of the number of clusters:: >>> t2 = ClusteringMetrics(rows=10 * np.ones((2, 2), dtype=int)) >>> t2.split_join_similarity(model=None) 0.5 >>> t2.split_join_similarity(model='m1') 0.0 >>> t8 = ClusteringMetrics(rows=10 * np.ones((8, 8), dtype=int)) >>> t8.split_join_similarity(model=None) 0.125 >>> t8.split_join_similarity(model='m1') 0.0 See Also -------- assignment_score References ---------- .. [1] `Dongen, S. V. (2000). Performance criteria for graph clustering and Markov cluster experiments. Information Systems [INS], (R 0012), 1-36. <http://dl.acm.org/citation.cfm?id=868979>`_ """ pa_B = sum(max(row) for row in self.iter_rows()) pb_A = sum(max(col) for col in self.iter_cols()) score = pa_B + pb_A N = self.grand_total R, C = self.shape if model is None: null_score = 0 elif model == 'm1': # only N is fixed null_score = N / float(R) + N / float(C) elif model == 'm2r': # fixed row margin null_score = max(self.row_totals.itervalues()) + N / float(C) elif model == 'm2c': # fixed column margin null_score = N / float(R) + max(self.col_totals.itervalues()) elif model == 'm3': # both row and column margins fixed null_score = \ max(self.row_totals.itervalues()) + \ max(self.col_totals.itervalues()) else: expected = self.expected(model) null_score = expected.split_join_similarity(normalize=False, model=None) score -= null_score if normalize: max_score = 2 * N - null_score score = 1.0 if score == max_score else _div(score, max_score) return score
[docs] def talburt_wang_index(self): """Talburt-Wang index of similarity of two partitions On sparse matrices, the resolving power of this measure asymptotically approaches that of assignment-based scores such as ``assignment_score`` and ``split_join_similarity``, however on dense matrices this measure will not perform well due to its reliance on category cardinalities (how many types were seen) rather than on observation counts (how many instances of each type were seen). A relatively decent clustering:: >>> a = [ 1, 1, 1, 2, 2, 2, 2, 3, 3, 4] >>> b = [43, 56, 56, 5, 36, 36, 36, 74, 74, 66] >>> t = ContingencyTable.from_labels(a, b) >>> round(t.talburt_wang_index(), 3) 0.816 Less good clustering (example from [1]_):: >>> clusters = [[1, 1], [1, 1, 1, 1], [2, 3], [2, 2, 3, 3], ... [3, 3, 4], [3, 4, 4, 4, 4, 4, 4, 4, 4, 4]] >>> t = ContingencyTable.from_clusters(clusters) >>> round(t.talburt_wang_index(), 2) 0.49 References ---------- .. [1] `Talburt, J., Wang, R., Hess, K., & Kuo, E. (2007). An algebraic approach to data quality metrics for entity resolution over large datasets. Information quality management: Theory and applications, 1-22. <http://www.igi-global.com/chapter/algebraic-approach-data-quality-metrics/23022>`_ """ A_card, B_card = self.shape V_card = len(self) return _div(sqrt(A_card * B_card), V_card)
[docs] def muc_scores(self): """MUC similarity indices for coreference scoring Implemented after description in [1]_. The compound fscore-like metric has good resolving power on sparse models, similar to ``fowlkes_mallows`` (pairwise ``ochiai_coeff``), however it becomes useless on dense matrices as it relies on category cardinalities (how many types were seen) rather than on observation counts (how many instances of each type were seen). :: >>> p1 = [x.split() for x in ["A B C", "D E F G"]] >>> p2 = [x.split() for x in ["A B", "C", "D", "E", "F G"]] >>> cm = ClusteringMetrics.from_partitions(p1, p2) >>> cm.muc_scores()[:2] (1.0, 0.4) Elements that are part of neither partition (in this case, E) are excluded from consideration:: >>> p1 = [x.split() for x in ["A B", "C", "D", "F G", "H"]] >>> p2 = [x.split() for x in ["A B", "C D", "F G H"]] >>> cm = ClusteringMetrics.from_partitions(p1, p2) >>> cm.muc_scores()[:2] (0.5, 1.0) References ---------- .. [1] `Vilain, M., Burger, J., Aberdeen, J., Connolly, D., & Hirschman, L. (1995, November). A model-theoretic coreference scoring scheme. In Proceedings of the 6th conference on Message understanding (pp. 45-52). Association for Computational Linguistics. <http://www.aclweb.org/anthology/M/M95/M95-1005.pdf>`_ """ A_card, B_card = self.shape V_card = len(self) N = self.grand_total recall = _div(N - V_card, N - A_card) precision = _div(N - V_card, N - B_card) fscore = harmonic_mean(recall, precision) return precision, recall, fscore
[docs] def bc_metrics(self): """'B-cubed' precision, recall, and fscore As described in [1]_ and [2]_. Was extended to overlapping clusters in [3]_. These metrics perform very similarly to normalized entropy metrics (homogeneity, completeness, V-measure). References ---------- .. [1] `Bagga, A., & Baldwin, B. (1998, August). Entity-based cross- document coreferencing using the vector space model. In Proceedings of the 36th Annual Meeting of the Association for Computational Linguistics and 17th International Conference on Computational Linguistics-Volume 1 (pp. 79-85). Association for Computational Linguistics. <https://aclweb.org/anthology/P/P98/P98-1012.pdf>`_ .. [2] `Bagga, A., & Baldwin, B. (1998, May). Algorithms for scoring coreference chains. In The first international conference on language resources and evaluation workshop on linguistics coreference (Vol. 1, pp. 563-566). <http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.47.5848>`_ .. [3] `Amigó, E., Gonzalo, J., Artiles, J., & Verdejo, F. (2009). A comparison of extrinsic clustering evaluation metrics based on formal constraints. Information retrieval, 12(4), 461-486. <http://doi.org/10.1007/s10791-008-9066-8>`_ """ precision = 0.0 recall = 0.0 for rm, cm, observed in self.iter_vals_with_margins(): precision += (observed ** 2) / float(cm) recall += (observed ** 2) / float(rm) N = self.grand_total precision = _div(precision, N) recall = _div(recall, N) fscore = harmonic_mean(recall, precision) return precision, recall, fscore
[docs]class ClusteringMetrics(ContingencyTable): """Provides external clustering evaluation metrics A subclass of ContingencyTable that builds a pairwise co-association matrix for clustering comparisons. :: >>> Y1 = {(1, 2, 3), (4, 5, 6)} >>> Y2 = {(1, 2), (3, 4, 5), (6,)} >>> cm = ClusteringMetrics.from_partitions(Y1, Y2) >>> cm.split_join_similarity(model=None) 0.75 """ def __init__(self, *args, **kwargs): ContingencyTable.__init__(self, *args, **kwargs) self._pairwise = None self._pairwise_models = {} @property def pairwise(self): """Confusion matrix on all pair assignments from two partitions A partition of N is a set of disjoint clusters s.t. every point in N belongs to one and only one cluster and every cluster consists of at least one point. Given two partitions A and B and a co-occurrence matrix of point pairs, == ============================================================= TP count of pairs found in the same partition in both A and B FP count of pairs found in the same partition in A but not in B FN count of pairs found in the same partition in B but not in A TN count of pairs in different partitions in both A and B == ============================================================= Note that although the resulting confusion matrix has the form of a correlation table for two binary variables, it is not symmetric if the original partitions are not symmetric. """ pairwise = self._pairwise if pairwise is None: actual_positives = fsum_pairs(self.iter_row_totals()) called_positives = fsum_pairs(self.iter_col_totals()) TP = fsum_pairs(self.itervalues()) FN = actual_positives - TP FP = called_positives - TP TN = fnum_pairs(self.grand_total) - TP - FP - FN pairwise = self._pairwise = ConfusionMatrix2.from_ccw(TP, FP, TN, FN) return pairwise
[docs] def get_score(self, scoring_method, *args, **kwargs): """Evaluate specified scoring method """ try: method = getattr(self, scoring_method) except AttributeError: method = getattr(self.pairwise, scoring_method) return method(*args, **kwargs)
[docs] def adjusted_rand_index(self): """Rand score (accuracy) corrected for chance This is a memory-efficient replacement for a similar Scikit-Learn function. """ return self.pairwise.kappa()
[docs] def rand_index(self): """Pairwise accuracy (uncorrected for chance) Don't use this metric; it is only added here as the "worst reference" """ return self.pairwise.accuracy()
[docs] def fowlkes_mallows(self): """Fowlkes-Mallows index for partition comparison Defined as the Ochiai coefficient on the pairwise matrix """ return self.pairwise.ochiai_coeff()
[docs] def adjusted_fowlkes_mallows(self): """Fowlkes-Mallows index adjusted for chance Adjustmend for chance done by subtracting the expected (Model 3) pairwise matrix from the actual one. This coefficient appears to be uniformly more powerful than the unadjusted version. Compared to ARI and product-moment correlation coefficients, this index is generally less powerful except in particularly poorly specified cases, e.g. clusters of unequal size sampled with high error rate from a large population. """ return self.pairwise.ochiai_coeff_adj()
[docs] def mirkin_match_coeff(self, normalize=True): """Equivalence match (similarity) coefficient Derivation of distance variant described in [1]_. This measure is nearly identical to pairwise unadjusted Rand index, as can be seen from the definition (Mirkin match formula uses square while pairwise accuracy uses n choose 2). :: >>> C3 = [{1, 2, 3, 4}, {5, 6, 7, 8, 9, 10}, {11, 12, 13, 14, 15, 16}] >>> C4 = [{1, 2, 3, 4}, {5, 6, 7, 8, 9, 10, 11, 12}, {13, 14, 15, 16}] >>> t = ClusteringMetrics.from_partitions(C3, C4) >>> t.mirkin_match_coeff(normalize=False) 216.0 References ---------- .. [1] `Mirkin, B (1996). Mathematical Classification and Clustering. Kluwer Academic Press: Boston-Dordrecht. <http://www.amazon.com/dp/0792341597>`_ """ max_score = self.grand_total ** 2 score = max_score - self.mirkin_mismatch_coeff(normalize=False) if normalize: score = _div(score, max_score) return score
[docs] def mirkin_mismatch_coeff(self, normalize=True): """Equivalence mismatch (distance) coefficient Direct formulation (without the pairwise abstraction): .. math:: M = \\sum_{i=1}^{R} r_{i}^2 + \\sum_{j=1}^{C} c_{j}^2 - \\sum_{i=1}^{R}\\sum_{j=1}^{C} n_{ij}^2, where :math:`r` and :math:`c` are row and column margins, respectively, with :math:`R` and :math:`C` cardinalities. :: >>> C1 = [{1, 2, 3, 4, 5, 6, 7, 8}, {9, 10, 11, 12, 13, 14, 15, 16}] >>> C2 = [{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {11, 12, 13, 14, 15, 16}] >>> t = ClusteringMetrics.from_partitions(C1, C2) >>> t.mirkin_mismatch_coeff(normalize=False) 56.0 """ score = 2 * (self.pairwise.FN + self.pairwise.FP) if normalize: score = _div(score, self.grand_total ** 2) return score
confmat2_type = namedtuple("ConfusionMatrix2", "TP FP TN FN")
[docs]class ConfusionMatrix2(ContingencyTable, OrderedCrossTab): """A confusion matrix (2x2 contingency table) For a binary variable (where one is measuring either presence vs absence of a particular feature), a confusion matrix where the ground truth levels are rows looks like this:: >>> cm = ConfusionMatrix2(TP=20, FN=31, FP=14, TN=156) >>> cm ConfusionMatrix2(rows=[[20, 31], [14, 156]]) >>> cm.to_array() array([[ 20, 31], [ 14, 156]]) For a nominal variable, the negative class becomes a distinct label, and TP/FP/FN/TN terminology does not apply, although the algorithms should work the same way (with the obvious distinction that different assumptions will be made). For a convenient reference about some of the attributes and methods defined here see [1]_. Attributes ---------- TP : True positive count FP : False positive count TN : True negative count FN : False negative count References ---------- .. [1] `Wikipedia entry for Confusion Matrix <https://en.wikipedia.org/wiki/Confusion_matrix>`_ """ def __repr__(self): return "ConfusionMatrix2(rows=%s)" % repr(self.to_rows()) def __init__(self, TP=None, FN=None, FP=None, TN=None, rows=None): if rows is None: rows = ((TP, FN), (FP, TN)) ContingencyTable.__init__(self, rows=rows)
[docs] def lform(self): """Factory creating L-form version of current table """ a, c, d, b = self.to_ccw() if b < c: a += b b -= b c -= b d += b else: a += c b -= c c -= c d += c return self.__class__.from_ccw(a, c, d, b)
@classmethod
[docs] def from_sets(cls, set1, set2, universe_size=None): """Instantiate from two sets Accepts an optional universe_size parameter which allows us to take into account TN class and use probability-based similarity metrics. Most of the time, however, set comparisons are performed ignoring this parameter and relying instead on non-probabilistic indices such as Jaccard's or Dice. """ if not isinstance(set1, Set): set1 = set(set1) if not isinstance(set2, Set): set2 = set(set2) TP = len(set1 & set2) FP = len(set2) - TP FN = len(set1) - TP if universe_size is None: TN = 0 else: TN = universe_size - TP - FP - FN if TN < 0: raise ValueError( "universe_size must be at least as large as set union") return cls(TP, FN, FP, TN)
@classmethod
[docs] def from_random_counts(cls, low=0, high=100): """Instantiate from random values """ return cls(*np.random.randint(low=low, high=high, size=(4,)))
@classmethod
[docs] def from_ccw(cls, TP, FP, TN, FN): """Instantiate from counter-clockwise form of TP FP TN FN """ return cls(TP, FN, FP, TN)
[docs] def to_ccw(self): """Convert to counter-clockwise form of TP FP TN FN """ return confmat2_type(TP=self.TP, FP=self.FP, TN=self.TN, FN=self.FN)
[docs] def get_score(self, scoring_method, *args, **kwargs): """Evaluate specified scoring method """ method = getattr(self, scoring_method) return method(*args, **kwargs)
@property def TP(self): return self.rows[0][0] @property def FN(self): return self.rows[0][1] @property def FP(self): return self.rows[1][0] @property def TN(self): return self.rows[1][1]
[docs] def hypergeometric(self): """Hypergeometric association score """ covsign = copysign(1, self.covar()) _, pvalue = fisher_exact(self.to_array()) return covsign * (-_log(pvalue))
[docs] def ACC(self): """Accuracy (Rand Index) Synonyms: Simple Matching Coefficient, Rand Index """ return _div(self.TP + self.TN, self.grand_total)
[docs] def PPV(self): """Positive Predictive Value (Precision) Synonyms: precision, frequency of hits, post agreement, success ratio, correct alarm ratio """ return _div(self.TP, self.TP + self.FP)
[docs] def NPV(self): """Negative Predictive Value Synonyms: frequency of correct null forecasts """ return _div(self.TN, self.TN + self.FN)
[docs] def TPR(self): """True Positive Rate (Recall, Sensitivity) Synonyms: recall, sensitivity, hit rate, probability of detection, prefigurance """ return _div(self.TP, self.TP + self.FN)
[docs] def FPR(self): """False Positive Rate Synonyms: fallout """ return _div(self.FP, self.TN + self.FP)
[docs] def TNR(self): """True Negative Rate (Specificity) Synonyms: specificity """ return _div(self.TN, self.FP + self.TN)
[docs] def FNR(self): """False Negative Rate Synonyms: miss rate, frequency of misses """ return _div(self.FN, self.TP + self.FN)
[docs] def FDR(self): """False discovery rate Synonyms: false alarm ratio, probability of false alarm """ return _div(self.FP, self.TP + self.FP)
[docs] def FOR(self): """False omission rate Synonyms: detection failure ratio, miss ratio """ return _div(self.FN, self.TN + self.FN)
[docs] def PLL(self): """Positive likelihood ratio """ return _div(self.TPR(), self.FPR())
[docs] def NLL(self): """Negative likelihood ratio """ return _div(self.FNR(), self.TNR())
[docs] def DOR(self): """Diagnostics odds ratio Defined as .. math:: DOR = \\frac{PLL}{NLL}. Odds ratio has a number of interesting/desirable properties, however its one peculiarity that leaves us looking for an alternative measure is that on L-shaped matrices like, .. math:: \\begin{matrix} 77 & 0 \\\\ 5 & 26 \\end{matrix} its value will be infinity. Also known as: crude odds ratio, Mantel-Haenszel estimate. """ a, c, d, b = self.to_ccw() ad, bc = a * d, b * c return _div(ad, bc)
[docs] def fscore(self, beta=1.0): """F-score As beta tends to infinity, F-score will approach recall. As beta tends to zero, F-score will approach precision. A similarity coefficient that uses a similar definition is called Dice coefficient. See Also -------- dice_coeff """ return harmonic_mean_weighted(self.precision(), self.recall(), beta ** 2)
[docs] def dice_coeff(self): """Dice similarity (Nei-Li coefficient) This is the same as F1-score, but calculated slightly differently here. Note that Dice can be zero if total number of positives is zero, but F-score is undefined in that case (because recall is undefined). When adjusted for chance, this coefficient becomes identical to ``kappa`` [1]_. Since this coefficient is monotonic with respect to Jaccard and Sokal Sneath coefficients, its resolving power is identical to that of the other two. See Also -------- jaccard_coeff, sokal_sneath_coeff References ---------- .. [1] `Albatineh, A. N., Niewiadomska-Bugaj, M., & Mihalko, D. (2006). On similarity indices and correction for chance agreement. Journal of Classification, 23(2), 301-313. <http://doi.org/10.1007/s00357-006-0017-z>`_ """ a, c, _, b = self.to_ccw() return _div(2 * a, 2 * a + b + c)
[docs] def overlap_coeff(self): """Overlap coefficient (Szymkiewicz-Simpson coefficient) Can be obtained by standardizing Dice or Ochiai coefficients by their maximum possible value given fixed marginals. Not corrected for chance. Note that :math:`min(p_1, p_2)` is equal to the maximum value of :math:`a` given fixed marginals. When adjusted for chance, this coefficient turns into Loevinger's H. See Also -------- loevinger_coeff """ a, c, _, b = self.to_ccw() p1, p2 = a + b, a + c a_max = min(p1, p2) return 0.0 if a_max == 0 else _div(a, a_max)
[docs] def jaccard_coeff(self): """Jaccard similarity coefficient Jaccard coefficient has an interesting property in that in L-shaped matrices where either FP or FN are close to zero, its scale becomes equivalent to the scale of either recall or precision respectively. Since this coefficient is monotonic with respect to Dice (F-score) and Sokal Sneath coefficients, its resolving power is identical to that of the other two. Jaccard index does not belong to the L-family of association indices and thus cannot be adjusted for chance by subtracting the its value under fixed-margin null model. Instead, its expectation must be calculated, for which no analytical solution exists [1]_. Synonyms: critical success index See Also -------- dice_coeff, sokal_sneath_coeff References ---------- .. [1] `Albatineh, A. N., & Niewiadomska-Bugaj, M. (2011). Correcting Jaccard and other similarity indices for chance agreement in cluster analysis. Advances in Data Analysis and Classification, 5(3), 179-200. <http://doi.org/10.1007/s11634-011-0090-y>`_ """ a, c, _, b = self.to_ccw() return _div(a, a + b + c)
[docs] def ochiai_coeff_adj(self): """Ochiai coefficient adjusted for chance This index is nearly identical to Mattthews' Correlation Coefficient, which should be used instead. See Also -------- matthews_corr, ochiai_coeff """ a, c, d, b = self.to_ccw() p1, p2 = a + b, a + c n = a + b + c + d if n == 0: return np.nan elif a == n or d == n: # only one (diagonal) cell is non-zero return 0.5 elif p1 == 0 or p2 == 0: # first row or column is zero, second non-zero return 0.0 p1_p2 = p1 * p2 numer = n * a - p1_p2 denom = n * sqrt(p1_p2) - p1_p2 return _div(numer, denom)
[docs] def ochiai_coeff(self): """Ochiai similarity coefficient (Fowlkes-Mallows) One interpretation of this coefficient that it is equal to the geometric mean of the conditional probability of an element (in the case of pairwise clustering comparison, a pair of elements) belonging to the same cluster given that they belong to the same class [1]_. This coefficient is in the L-family, and thus it can be corrected for chance by subtracting its value under fixed-margin null model. The resulting adjusted index is very close to, but not the same as, Matthews Correlation Coefficient. Empirically, the discriminating power of the adjusted coefficient is equal to that of Matthews' Correlation Coefficient to within rounding error. Synonyms: Cosine Similarity, Fowlkes-Mallows Index See Also -------- jaccard_coeff, dice_coeff, ochiai_coeff_adj References ---------- .. [1] `Ramirez, E. H., Brena, R., Magatti, D., & Stella, F. (2012). Topic model validation. Neurocomputing, 76(1), 125-133. <http://dx.doi.org/10.1016/j.neucom.2011.04.032>`_ """ a, c, _, b = self.to_ccw() p1, p2 = a + b, a + c if a == b == c == 0: return np.nan elif a == 0: return 0.0 return _div(a, sqrt(p1 * p2))
[docs] def sokal_sneath_coeff(self): """Sokal and Sneath similarity index In a 2x2 matrix .. math:: \\begin{matrix} a & b \\\\ c & d \\end{matrix} Dice places more weight on :math:`a` component, Jaccard places equal weight on :math:`a` and :math:`b + c`, while Sokal and Sneath places more weight on :math:`b + c`. See Also -------- dice_coeff, jaccard_coeff """ a, c, _, b = self.to_ccw() return _div(a, a + 2 * (b + c))
[docs] def prevalence_index(self): """Prevalence In interrater agreement studies, prevalence is high when the proportion of agreements on the positive classification differs from that of the negative classification. Example of a confusion matrix with high prevalence of negative response (note that this happens regardless of which rater we look at): .. math:: \\begin{matrix} 3 & 27 \\\\ 28 & 132 \\end{matrix} See Also -------- bias_index """ return _div(abs(self.TP - self.TN), self.grand_total)
[docs] def frequency_bias(self): """Frequency bias How much more often is rater B is predicting TP """ return _div(self.TP + self.FP, self.TP + self.FN)
[docs] def bias_index(self): """Bias Index In interrater agreement studies, bias is the extent to which the raters disagree on the positive-negative ratio of the binary variable studied. Example of a confusion matrix with high bias of rater A (represented by rows) towards negative rating: .. math:: \\begin{matrix} 17 & 14 \\\\ 78 & 81 \\end{matrix} See Also -------- prevalence_index """ return _div(abs(self.FN - self.FP), self.grand_total)
[docs] def informedness(self): """Informedness (recall corrected for chance) A complement to markedness. Can be thought of as recall corrected for chance. Alternative formulations: .. math:: Informedness &= Sensitivity + Specificity - 1.0 \\\\ &= TPR - FPR In the case of ranked predictions, TPR can be plotted on the y-axis with FPR on the x-axis. The resulting plot is known as Receiver Operating Characteristic (ROC) curve [1]_. The delta between a point on the ROC curve and the diagonal is equal to the value of informedness at the given FPR threshold. This measure was first proposed for evaluating medical diagnostics tests in [2]_, and was also used in meteorology under the name "True Skill Score" [3]_. Synonyms: Youden's J, True Skill Score, Hannssen-Kuiper Score, Attributable Risk, DeltaP. See Also -------- markedness References ---------- .. [1] `Fawcett, T. (2006). An introduction to ROC analysis. Pattern recognition letters, 27(8), 861-874. <http://doi.org/10.1016/j.patrec.2005.10.010>`_ .. [2] `Youden, W. J. (1950). Index for rating diagnostic tests. Cancer, 3(1), 32-35. <http://www.ncbi.nlm.nih.gov/pubmed/15405679>`_ .. [3] `Doswell III, C. A., Davies-Jones, R., & Keller, D. L. (1990). On summary measures of skill in rare event forecasting based on contingency tables. Weather and Forecasting, 5(4), 576-585. <http://journals.ametsoc.org/doi/abs/10.1175/1520-0434%281990%29005%3C0576%3AOSMOSI%3E2.0.CO%3B2>`_ """ a, c, d, b = self.to_ccw() p1, q1 = a + b, c + d n = p1 + q1 if n == 0: return np.nan elif p1 == n: return 0.0 # return _div(a - b, 2 * (a + b)) elif q1 == n: return 0.0 # return _div(d - c, 2 * (d + c)) else: return _div(self.covar(), p1 * q1)
[docs] def markedness(self): """Markedness (precision corrected for chance) A complement to informedness. Can be thought of as precision corrected for chance. Alternative formulations: .. math:: Markedness &= PPV + NPV - 1.0 \\\\ &= PPV - FOR In the case of ranked predictions, PPV can be plotted on the y-axis with FOR on the x-axis. The resulting plot is known as Relative Operating Level (ROL) curve [1]_. The delta between a point on the ROL curve and the diagonal is equal to the value of markedness at the given FOR threshold. Synonyms: DeltaP′ See Also -------- informedness References ---------- .. [1] `Mason, S. J., & Graham, N. E. (2002). Areas beneath the relative operating characteristics (ROC) and relative operating levels (ROL) curves: Statistical significance and interpretation. Quarterly Journal of the Royal Meteorological Society, 128(584), 2145-2166. <https://doi.org/10.1256/003590002320603584>`_ """ a, c, d, b = self.to_ccw() p2, q2 = a + c, b + d n = p2 + q2 if n == 0: return np.nan elif p2 == n: return 0.0 # return _div(a - c, 2 * (a + c)) elif q2 == n: return 0.0 # return _div(d - b, 2 * (d + b)) else: return _div(self.covar(), p2 * q2)
[docs] def xcoeff(self): """Alternative to ``loevinger_coeff`` but with -1 lower bound """ a, c, d, b = self.to_ccw() p1, q1 = a + b, c + d p2, q2 = a + c, b + d n = p1 + q1 cov = self.covar() if n == 0: return np.nan elif a == n or d == n: return 0.5 elif b == n or c == n: return -1.0 elif cov > 0.0: return _div(cov, min(p1 * q2, p2 * q1)) elif cov < 0.0: return _div(cov, min(n * c, n * b)) else: return 0.0
[docs] def pairwise_hcv(self): """Pairwise homogeneity, completeness, and their geometric mean Each of the two one-sided measures is defined as follows: .. math:: \\hat{M}_{adj} = \\frac{M - E[M]}{M_{max} - min(E[M], M)}. It is clear from the definition above that *iff* :math:`M < E[M]` and :math:`M \\leq M_{max}`, the denominator will switch from the standard normalization interval to a larger one, thereby ensuring that :math:`-1.0 \\leq \\hat{M}_{adj} \\leq 1.0`. The definition for the bottom half of the range can also be expressed in terms of the standard adjusted value: .. math:: \\hat{M}_{adj} = \\frac{M_{adj}}{(1 + |M_{adj}|^n)^{1/n}}, \\quad M_{adj} < 0, n = 1. The resulting measure is not symmetric over its range (negative values are scaled differently from positive values), however this should not matter for applications where negative correlation does not carry any special meaning other than being additional evidence for absence of positive correlation. Such as a situation occurs in pairwise confusion matrices used in cluster analysis. Nevertheless, if more symmetric behavior near zero is desired, the upper part of the negative range can be linearized either by increasing :math:`n` in the definition above or by replacing it with :math:`\\hat{M}_{adj} = tanh(M_{adj})` transform. For the compound measure, the geometric mean was chosen over the harmonic after the results of a Monte Carlo power analysis, due to slightly better discriminating performance. For positive matrices, the geometric mean is equal to ``matthews_corr``, while the harmonic mean would have been equal to ``kappa``. For negative matrices, the harmonic mean would have remained monotonic (though not equal) to Kappa, while the geometric mean is neither monotonic nor equal to MCC, despite the two being closely correlated. The discriminating performance indices of the geometric mean and of MCC are empirically the same (equal to within rounding error). For matrices with negative covariance, it is possible to switch to ``markedness`` and ``informedness`` as one-sided components (homogeneity and completeness, respectively). However, the desirable property of measure orthogonality will not be preserved then, since markedness and informedness exhibit strong correlation under the assumed null model. """ a, c, d, b = self.to_ccw() p1, q1 = a + b, c + d p2, q2 = a + c, b + d n = a + b + c + d cov = self.covar() if n == 0.0: k0, k1, k2 = np.nan, np.nan, np.nan elif a == n or d == n: k0, k1, k2 = 0.5, 0.5, 0.5 elif b == n: k0, k1, k2 = -1.0, -0.0, -0.0 elif c == n: k0, k1, k2 = -0.0, -1.0, -0.0 elif p1 == n or q2 == n: k0, k1, k2 = 0.0, 0.0, 0.0 elif p2 == n or q1 == n: k0, k1, k2 = 0.0, 0.0, 0.0 elif cov > 0.0: k0 = _div(cov, p2 * q1) k1 = _div(cov, p1 * q2) k2 = _div(cov, sqrt(p1 * q1 * p2 * q2)) elif cov < 0.0: k0 = _div(cov, n * c) k1 = _div(cov, n * b) k2 = _div(cov, n * sqrt(b * c)) else: k0, k1, k2 = 0.0, 0.0, 0.0 return k0, k1, k2
[docs] def kappas(self): """Pairwise precision and recall corrected for chance Kappa decomposes into a pair of components (regression coefficients), :math:`\\kappa_0` (precision-like) and :math:`\\kappa_1` (recall-like), of which it is a harmonic mean: .. math:: \\kappa_0 = \\frac{cov}{p_2 q_1}, \\quad \\kappa_1 = \\frac{cov}{p_1 q_2}. These coefficients are interesting because they represent precision and recall, respectively, corrected for chance by subtracting the fixed-margin null model. In clustering context, :math:`\\kappa_0` corresponds to pairwise homogeneity, while :math:`\\kappa_1` corresponds to pairwise completeness. The geometric mean of the two components is equal to Matthews' Correlation Coefficient, while their maximum is equal to Loevinger's H when :math:`ad \\geq bc`. See Also -------- kappa, loevinger_coeff, matthews_corr """ a, c, d, b = self.to_ccw() p1, q1 = a + b, c + d p2, q2 = a + c, b + d n = a + b + c + d if a == n or d == n: k0, k1 = np.nan, np.nan elif b == n: k0, k1 = np.NINF, -0.0 elif c == n: k0, k1 = -0.0, np.NINF elif p1 == n or q2 == n: k0, k1 = np.nan, 0.0 elif p2 == n or q1 == n: k0, k1 = 0.0, np.nan else: cov = self.covar() p2_q1, p1_q2 = p2 * q1, p1 * q2 k0, k1 = _div(cov, p2_q1), _div(cov, p1_q2) return k0, k1, self.kappa()
[docs] def loevinger_coeff(self): """Loevinger's Index of Homogeneity (Loevinger's H) Given a clustering (numbers correspond to class labels, inner groups to clusters) with perfect homogeneity but imperfect completeness, Loevinger coefficient returns a perfect score on the corresponding pairwise co-association matrix:: >>> clusters = [[0, 0], [0, 0, 0, 0], [1, 1, 1, 1]] >>> t = ClusteringMetrics.from_clusters(clusters) >>> t.pairwise.loevinger_coeff() 1.0 At the same time, kappa and Matthews coefficients are 0.63 and 0.68, respectively. Loevinger coefficient will also return a perfect score for the dual situation:: >>> clusters = [[0, 2, 2, 0, 0, 0], [1, 1, 1, 1]] >>> t = ClusteringMetrics.from_clusters(clusters) >>> t.pairwise.loevinger_coeff() 1.0 Loevinger's coefficient has a unique property: all two-way correlation coefficients on a 2x2 table that are in L-family (including Kappa and Matthews' correlation coefficient) become Loevinger's coefficient after normalization by maximum value [1]_. However, this measure is not symmetric: when :math:`ad < bc`, it does not have a lower bound. For an equivalent symmetric measure, use Cole coefficient. See Also -------- cole_coeff References ---------- .. [1] `Warrens, M. J. (2008). On association coefficients for 2x2 tables and properties that do not depend on the marginal distributions. Psychometrika, 73(4), 777-789. <https://doi.org/10.1007/s11336-008-9070-3>`_ """ a, c, d, b = self.to_ccw() p1, q1 = a + b, c + d p2, q2 = a + c, b + d n = p1 + q1 cov = self.covar() if n == 0: return np.nan elif a == n or d == n: # only one (diagonal) cell is non-zero return 0.5 elif cov == 0.0: return 0.0 else: return _div(cov, min(p1 * q2, p2 * q1))
[docs] def kappa(self): """Cohen's Kappa (Interrater Agreement) Kappa coefficient is best known in the psychology field where it was introduced to measure interrater agreement [1]_. It has also been used in replication studies [2]_, clustering evaluation [3]_, image segmentation [4]_, feature selection [5]_ [6]_, forecasting [7]_, and network link prediction [8]_. The first derivation of this measure is in [9]_. Kappa can be derived by correcting either Accuracy (Simple Matching Coefficient, Rand Index) or F1-score (Dice Coefficient) for chance. Conversely, Dice coefficient can be derived from Kappa by obtaining its limit as :math:`d \\rightarrow \\infty`. Normalizing Kappa by its maximum value given fixed-margin table gives Loevinger's H. Synonyms: Adjusted Rand Index, Heidke Skill Score See Also -------- kappas, loevinger_coeff, matthews_corr, dice_coeff References ---------- .. [1] `Cohen, J. (1960). A coefficient of agreement for nominal scales. Educational and psychological measurement, 20(1), 37-46. <https://doi.org/10.1177/001316446002000104>`_ .. [2] `Arabie, P., Hubert, L. J., & De Soete, G. (1996). Clustering validation: results and implications for applied analyses (p. 341). World Scientific Pub Co Inc. <https://doi.org/10.1142/9789812832153_0010>`_ .. [3] `Warrens, M. J. (2008). On the equivalence of Cohen's kappa and the Hubert-Arabie adjusted Rand index. Journal of Classification, 25(2), 177-183. <https://doi.org/10.1007/s00357-008-9023-7>`_ .. [4] `Briggman, K., Denk, W., Seung, S., Helmstaedter, M. N., & Turaga, S. C. (2009). Maximin affinity learning of image segmentation. In Advances in Neural Information Processing Systems (pp. 1865-1873). <http://books.nips.cc/papers/files/nips22/NIPS2009_0084.pdf>`_ .. [5] `Santos, J. M., & Embrechts, M. (2009). On the use of the adjusted rand index as a metric for evaluating supervised classification. In Artificial neural networks - ICANN 2009 (pp. 175-184). Springer Berlin Heidelberg. <https://doi.org/10.1007/978-3-642-04277-5_18>`_ .. [6] `Santos, J. M., & Ramos, S. (2010, November). Using a clustering similarity measure for feature selection in high dimensional data sets. In Intelligent Systems Design and Applications (ISDA), 2010 10th International Conference on (pp. 900-905). IEEE. <http://dx.doi.org/10.1109/ISDA.2010.5687073>`_ .. [7] `Doswell III, C. A., Davies-Jones, R., & Keller, D. L. (1990). On summary measures of skill in rare event forecasting based on contingency tables. Weather and Forecasting, 5(4), 576-585. <http://journals.ametsoc.org/doi/abs/10.1175/1520-0434%281990%29005%3C0576%3AOSMOSI%3E2.0.CO%3B2>`_ .. [8] `Hoffman, M., Steinley, D., & Brusco, M. J. (2015). A note on using the adjusted Rand index for link prediction in networks. Social Networks, 42, 72-79. <http://dx.doi.org/10.1016/j.socnet.2015.03.002>`_ .. [9] `Heidke, Paul. "Berechnung des Erfolges und der Gute der Windstarkevorhersagen im Sturmwarnungsdienst." Geografiska Annaler (1926): 301-349. <http://www.jstor.org/stable/519729>`_ """ a, c, d, b = self.to_ccw() p1, q1 = a + b, c + d p2, q2 = a + c, b + d n = p1 + q1 if n == 0: return np.nan elif a == n or d == n: # only one (diagonal) cell is non-zero return 0.5 return _div(2 * self.covar(), p1 * q2 + p2 * q1)
[docs] def mp_corr(self): """Maxwell & Pilliner's association index Another covariance-based association index corrected for chance. Like MCC, based on a mean of informedness and markedness, except uses a harmonic mean instead of geometric. Like Kappa, turns into Dice coefficient (F-score) as 'd' approaches infinity. On typical problems, the resolving power of this coefficient is nearly identical to that of Cohen's Kappa and is only very slightly below that of Matthews' correlation coefficient. See Also -------- kappa, matthews_corr """ a, c, d, b = self.to_ccw() p1, q1 = a + b, c + d p2, q2 = a + c, b + d n = p1 + q1 if n == 0: return np.nan elif a == n or d == n: # only one (diagonal) cell is non-zero return 0.5 elif b == n or c == n: # only one (non-diagonal) cell is non-zero return -0.5 return _div(2 * self.covar(), p1 * q1 + p2 * q2)
[docs] def matthews_corr(self): """Matthews Correlation Coefficient (Phi coefficient) MCC is directly related to the Chi-square statistic. Its value is equal to the Chi-square value normalized by the maximum value the Chi-square can achieve with given margins (for a 2x2 table, the maximum Chi-square score is equal to the grand total N) transformed to correlation space by taking a square root. MCC is a also a geometric mean of informedness and markedness (the regression coefficients of the problem and its dual). As :math:`d \\rightarrow \\infty`, MCC turns into Ochiai coefficient. Unlike with Kappa, normalizing the corresponding similarity coefficient for chance by subtracting the fixed-margin null model does not produce MCC in return, but gives a different index with equivalent discriminating power to that of MCC. Normalizing MCC by its maximum value under fixed- margin model gives Loevinger's H. Empirically, the discriminating power of MCC is sligtly better than that of ``mp_corr`` and ``kappa``, and is only lower than that of ``loevinger_coeff`` under highly biased conditions. While MCC is a commonly used and recently preferred measure of prediction and reproducibility [1]_, it is somewhat strange that one can hardly find any literature that uses this index in clustering comparison context, with some rare exceptions [2]_ [3]_. Synonyms: Phi Coefficient, Product-Moment Correlation See Also -------- kappa, mp_corr, ochiai_coeff References ---------- .. [1] `MAQC Consortium. (2010). The MicroArray Quality Control (MAQC)-II study of common practices for the development and validation of microarray-based predictive models. Nature biotechnology, 28(8), 827-838. <http://doi.org/10.1038/nbt.1665>`_ .. [2] `Xiao, J., Wang, X. F., Yang, Z. F., & Xu, C. W. (2008). Comparison of Supervised Clustering Methods for the Analysis of DNA Microarray Expression Data. Agricultural Sciences in China, 7(2), 129-139. <http://dx.doi.org/10.1016/S1671-2927%2808%2960032-2>`_ .. [3] `Kao, D. (2012). Using Matthews correlation coefficient to cluster annotations. NextGenetics (personal blog). <http://blog.nextgenetics.net/?e=47>`_ """ a, c, d, b = self.to_ccw() p1, q1 = a + b, c + d p2, q2 = a + c, b + d n = p1 + q1 if n == 0: return np.nan elif a == n or d == n: # only one (diagonal) cell is non-zero return 0.5 elif b == n or c == n: # only one (non-diagonal) cell is non-zero return -0.5 elif p1 == n or p2 == n or q1 == n or q2 == n: # one row or column is zero, another non-zero return 0.0 return _div(self.covar(), sqrt(p1 * q1 * p2 * q2))
[docs] def mic_scores(self, mean='harmonic'): """Mutual information-based correlation The coefficient decomposes into regression coefficients defined according to fixed-margin tables. The ``mic1`` coefficient, for example, is obtained by dividing the G-score by the maximum achievable value on a table with fixed true class counts (which here correspond to row totals). The ``mic0`` is its dual, defined by dividing the G-score by its maximum achievable value with fixed predicted label counts (here represented as column totals). ``mic0`` roughly corresponds to precision (homogeneity) while ``mic1`` roughly corresponds to recall (completeness). """ h, c, rsquare = self.entropy_scores(mean=mean) covsign = copysign(1, self.covar()) mic0 = covsign * sqrt(c) mic1 = covsign * sqrt(h) mic2 = covsign * sqrt(rsquare) return mic0, mic1, mic2
[docs] def yule_q(self): """Yule's Q (association index) Yule's Q relates to the odds ratio (DOR) as follows: .. math:: Q = \\frac{DOR - 1}{DOR + 1}. """ a, c, d, b = self.to_ccw() p1, q1 = a + b, c + d p2, q2 = a + c, b + d n = a + b + c + d if n == 0: return np.nan elif p1 == n: # c and d are zero return _div(a - b, p1) elif p2 == n: # b and d are zero return _div(a - c, p2) elif q1 == n: # a and b are zero return _div(d - c, q1) elif q2 == n: # a and c are zero return _div(d - b, q2) return _div(self.covar(), a * d + b * c)
[docs] def yule_y(self): """Yule's Y (colligation coefficient) The Y coefficient was used as basis of a new association measure by accounting for entropy in [1]_. References ---------- .. [1] `Hasenclever, D., & Scholz, M. (2013). Comparing measures of association in 2x2 probability tables. arXiv preprint arXiv:1302.6161. <http://arxiv.org/pdf/1302.6161v1.pdf>`_ """ a, c, d, b = self.to_ccw() p1, q1 = a + b, c + d p2, q2 = a + c, b + d n = a + b + c + d if n == 0: return np.nan elif p1 == n: # c and d are zero return _div(sqrt(a) - sqrt(b), sqrt(a) + sqrt(b)) elif p2 == n: # b and d are zero return _div(sqrt(a) - sqrt(c), sqrt(a) + sqrt(c)) elif q1 == n: # a and b are zero return _div(sqrt(d) - sqrt(c), sqrt(d) + sqrt(c)) elif q2 == n: # a and c are zero return _div(sqrt(d) - sqrt(b), sqrt(d) + sqrt(b)) ad = a * d bc = b * c return _div(sqrt(ad) - sqrt(bc), sqrt(ad) + sqrt(bc))
[docs] def cole_coeff(self): """Cole coefficient This is exactly the same coefficient as *Lewontin's D'*. It is defined as: .. math:: D' = \\frac{cov}{cov_{max}}, where :math:`cov_{max}` is the maximum covariance attainable under the given marginal distribution. When :math:`ad \\geq bc`, this coefficient is equivalent to Loevinger's H. Synonyms: C7, Lewontin's D'. See Also -------- diseq_coeff, loevinger_coeff """ return self.diseq_coeff(standardize=True)
[docs] def diseq_coeff(self, standardize=False): """Linkage disequilibrium .. math:: D = \\frac{a}{n} - \\frac{p_1}{n}\\frac{p_2}{n} = \\frac{cov}{n^2} If ``standardize=True``, this measure is further normalized to maximum covariance attainable under given marginal distribution, and the resulting index is called *Lewontin's D'*. See Also -------- cole_coeff """ cov = self.covar() n = self.grand_total if standardize: a, c, d, b = self.to_ccw() p1, q1 = a + b, c + d p2, q2 = a + c, b + d if n == 0: return np.nan elif a == n or d == n: # only one (diagonal) cell is non-zero return 0.5 elif b == n or c == n: # only one (non-diagonal) cell is non-zero return -0.5 elif cov > 0.0: cov_max = min(p1 * q2, p2 * q1) return _div(cov, cov_max) elif cov < 0.0: cov_max = min(p1 * p2, q1 * q2) return _div(cov, cov_max) else: return 0.0 else: return _div(cov, n * n)
[docs] def covar(self): """Covariance (determinant of a 2x2 matrix) """ a, c, d, b = self.to_ccw() return a * d - b * c
# various silly terminologies follow # information retrieval precision = PPV recall = TPR accuracy = ACC # fallout = FPR # clinical diagnostics sensitivity = TPR specificity = TNR
# odds_ratio = DOR # youden_j = informedness # sales/marketing # hit_rate = TPR # miss_rate = FNR # ecology # sm_coeff = ACC # phi_coeff = matthews_corr # meteorology # heidke_skill = kappa # true_skill = informedness
[docs]def mutual_info_score(labels_true, labels_pred): """Memory-efficient replacement for equivalently named Sklean function """ ct = ContingencyTable.from_labels(labels_true, labels_pred) return ct.mutual_info_score()
[docs]def homogeneity_completeness_v_measure(labels_true, labels_pred): """Memory-efficient replacement for equivalently named Scikit-Learn function """ ct = ContingencyTable.from_labels(labels_true, labels_pred) return ct.entropy_scores()
[docs]def adjusted_rand_score(labels_true, labels_pred): """Rand score (accuracy) corrected for chance This is a memory-efficient replacement for the equivalently named Scikit-Learn function In a supplement to [1]_, the following example is given:: >>> classes = [1, 1, 2, 2, 2, 2, 3, 3, 3, 3] >>> clusters = [1, 2, 1, 2, 2, 3, 3, 3, 3, 3] >>> round(adjusted_rand_score(classes, clusters), 3) 0.313 References ---------- .. [1] `Yeung, K. Y., & Ruzzo, W. L. (2001). Details of the adjusted Rand index and clustering algorithms, supplement to the paper "An empirical study on principal component analysis for clustering gene expression data". Bioinformatics, 17(9), 763-774. <http://faculty.washington.edu/kayee/pca/>`_ """ ct = ClusteringMetrics.from_labels(labels_true, labels_pred) return ct.adjusted_rand_index()
[docs]def adjusted_mutual_info_score(labels_true, labels_pred): """Adjusted Mutual Information for two partitions This is a memory-efficient replacement for the equivalently named Scikit-Learn function. Perfect labelings are both homogeneous and complete, hence AMI has the perfect score of one:: >>> adjusted_mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1]) 1.0 >>> adjusted_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0]) 1.0 If classes members are completely split across different clusters, the assignment is utterly incomplete, hence AMI equals zero:: >>> adjusted_mutual_info_score([0, 0, 0, 0], [0, 1, 2, 3]) 0.0 """ ct = ContingencyTable.from_labels(labels_true, labels_pred) return ct.adjusted_mutual_info()
[docs]def product_moment(*args, **kwargs): """Return MCC score for a 2x2 contingency table """ return ConfusionMatrix2.from_ccw(*args, **kwargs).matthews_corr()
[docs]def cohen_kappa(*args, **kwargs): """Return Cohen's Kappa for a 2x2 contingency table """ return ConfusionMatrix2.from_ccw(*args, **kwargs).kappa()