2001
% Dom (2001): IBM Research Division report RJ 10219. Proposes an
% information-theoretic external measure for comparing a clustering
% against ground-truth class labels. The non-standard vgclass field is
% ignored by standard styles and is kept as a local classification tag.
@techreport{Dom2001,
  vgclass     = {report},
  author      = {Dom, Byron E.},
  title       = {An Information-Theoretic External Cluster-Validity Measure},
  number      = {RJ 10219},
  institution = {IBM Research Division},
  address     = {IBM Almaden Research Center, 650 Harry Rd., San Jose,
    California 95120-6099, USA},
  month       = oct,
  year        = {2001},
  url         = {http://www.almaden.ibm.com/cs/people/dom/rj10219.ps},
  abstract    = {In this paper we propose a measure of
    similarity/association between two partitions of a set of objects. Our
    motivation is the desire to use the measure to characterize the quality
    or accuracy of clustering algorithms by somehow comparing the clusters
    they produce with ``ground truth'' consisting of classes assigned to
    the patterns by manual means or some other means in whose veracity
    there is confidence. Such measures are referred to as ``external''.
    Our measure also allows clusterings with different numbers of clusters
    to be compared in a quantitative and principled way. Our evaluation
    scheme quantitatively measures how useful the cluster labels of the
    patterns are as predictors of their class labels. When all clusterings
    to be compared have the same number of clusters, the measure is
    equivalent to the mutual information between the cluster labels and the
    class labels. In cases where the numbers of clusters are different,
    however, it computes the reduction in the number of bits that would be
    required to encode (compress) the class labels if both the encoder and
    decoder have free access to the cluster labels. To achieve this
    encoding the estimated conditional probabilities of the class labels
    given the cluster labels must also be encoded. These estimated
    probabilities can be seen as a ``model'' for the class labels and their
    associated code length as a ``model cost''. In addition to defining the
    measure we compare it to other commonly used external measures and
    demonstrate its superiority as judged by certain criteria.},
}