1998
% Greiff, SIGIR 1998 conference paper on term weighting / idf.
% `address` is the publisher's city; the conference location is kept in
% `venue` (read by biblatex, ignored by classic BibTeX styles).
% `vgclass` is a non-standard field and is ignored by standard styles.
@inproceedings{Gre1998,
  vgclass   = {refpap},
  author    = {Greiff, Warren R.},
  title     = {A Theory of Term Weighting Based on Exploratory Data
               Analysis},
  editor    = {Croft, W. Bruce and Moffat, Alistair and
               van Rijsbergen, C. J. and Wilkinson, Ross and
               Zobel, Justin},
  booktitle = {Proceedings of the 21st Annual International {ACM SIGIR}
               Conference on Research and Development in Information
               Retrieval},
  venue     = {Melbourne, Australia},
  pages     = {11--19},
  publisher = {ACM Press},
  address   = {New York},
  month     = aug,
  year      = {1998},
  abstract  = {Techniques of exploratory data analysis are used to
    study the weight of evidence that the occurrence of a query term
    provides in support of the hypothesis that a document is relevant
    to an information need. In particular, the relationship between the
    document frequency and the weight of evidence is investigated. A
    correlation between document frequency normalized by collection
    size and the mutual information between relevance and term
    occurrence is uncovered. This correlation is found to be robust
    across a variety of query sets and document collections. Based on
    this relationship, a theoretical explanation of the efficacy of
    inverse document frequency for term weighting is developed which
    differs in both style and content from theories previously put
    forth. The theory predicts that a ``flattening'' of idf at both low
    and high frequency should result in improved retrieval performance.
    This altered idf formulation is tested on all TREC query sets.
    Retrieval results corroborate the prediction of improved retrieval
    performance. In conclusion, we argue that exploratory data analysis
    can be a valuable tool for research whose goal is the development
    of an explanatory theory of information retrieval.},
}