1994
@comment{Conference paper "Okapi at TREC-3" from the TREC-3 proceedings
  volume, NIST Special Publication 500-225. The bracketed numbers in the
  abstract appear to be citation markers carried over from the paper text.}
@inproceedings{RWJ1994,
  vgclass   = {refpap},
  author    = {Robertson, S. E. and Walker, S. and Jones, S. and
               Hancock-Beaulieu, M. M. and Gatford, M.},
  title     = {Okapi at {TREC}--3},
  booktitle = {{NIST} Special Publication 500-225: Overview of the Third
               {T}ext {RE}trieval {C}onference ({TREC}-3)},
  address   = {Gaithersburg, Maryland, USA},
  pages     = {109--126},
  month     = {November~2--4},
  year      = {1994},
  url       = {http://trec.nist.gov/pubs/trec3/papers/city.ps.gz},
  abstract  = {The sequence of TREC conferences has seen the City
    University Okapi IR system evolve in several ways. Before TREC--1 it
    was a very traditional probabilistic system comprising closely
    integrated search engine and interface, designed for casual use by
    searchers of bibliographic reference databases. During the course of
    TREC--1 the low-level search functions were split off into a separate
    Basic Search System (BSS) [2], but retrieval and ranking of documents
    was still done using the ``classical'' probabilistic model of Robertson
    and Sparck Jones [7] with no account taken of document length or term
    frequency within document or query. Four runs were submitted to NIST
    for evaluation: automatic ad hoc, automatic routing, manual ad hoc and
    manual ad hoc with feedback. The results were undistinguished, although
    not among the worst. Of the ad hoc runs, the manual was better than
    the automatic (in which only the CONCEPTS fields of the topics were
    used), and feedback appeared beneficial. We have only recently noticed
    that our TREC--1 (and probably also TREC--2) results would have been
    considerably worse had it not been that the system at that time could
    not handle documents longer than 64K, and so the longest few hundred
    documents in the database were truncated. The TREC--1 automatic ad hoc
    run redone on the full database (with cutoff at 200 documents) gives an
    11--pt average of 0.10 (0.12), precision at 5 documents 0.37 (0.50);
    and at 30 documents 0.36 (0.42) (TREC--1 results in parentheses). This
    appears to be because the simple weighting scheme tends to favour long
    documents, particularly FR, few of which are relevant. For TREC--2 the
    simple inverse collection frequency (ICF) term-weighting scheme was
    elaborated to embody within-document frequency and document length
    components, as well as within-query frequency, and a large number of
    weighting functions were investigated.},
}