2002
% Pantel and Lin, SIGIR 2002 conference paper introducing the CBC
% (Clustering By Committee) document clustering algorithm.
% Notes: "vgclass" is a non-standard field that standard styles ignore;
% "month" uses the predefined aug macro with the conference days appended;
% "doi" holds the bare identifier, "url" keeps the original resolver link.
@inproceedings{PaL2002,
  vgclass   = {refpap},
  author    = {Pantel, Patrick and Lin, Dekang},
  title     = {Document Clustering with Committees},
  booktitle = {Proceedings of the 25th Annual International {ACM} {SIGIR} Conference on Research and Development in Information Retrieval ({SIGIR} 2002)},
  address   = {Tampere, Finland},
  pages     = {199--206},
  month     = aug # "~11--15",
  year      = {2002},
  doi       = {10.1145/564376.564412},
  url       = {http://doi.acm.org/10.1145/564376.564412},
  abstract  = {Document clustering is useful in many information
retrieval tasks: document browsing, organization and viewing of
retrieval results, generation of Yahoo-like hierarchies of documents,
etc. The general goal of clustering is to group data elements such that
the intra-group similarities are high and the inter-group similarities
are low. We present a clustering algorithm called CBC (Clustering By
Committee) that is shown to produce higher quality clusters in document
clustering tasks as compared to several well known clustering
algorithms. It initially discovers a set of tight clusters (high
intra-group similarity), called committees, that are well scattered in
the similarity space (low inter-group similarity). The union of the
committees is but a subset of all elements. The algorithm proceeds by
assigning elements to their most similar committee. Evaluating cluster
quality has always been a difficult task. We present a new evaluation
methodology that is based on the editing distance between output
clusters and manually constructed classes (the answer key). This
evaluation measure is more intuitive and easier to interpret than
previous evaluation measures.},
}