1996
@techreport{WhJ1996,
vgclass = {report},
vgproject = {cbir},
author = {David A. White and Ramesh Jain},
title = {Algorithms and Strategies for Similarity Retrieval},
number = {VCL-96-101},
institution = {Visual Computing Laboratory, University of California,
San Diego},
address = {9500 Gilman Drive, Mail Code 0407, La Jolla, CA
92093-0407},
month = {July},
year = {1996},
abstract = {In the database and multimedia community there has been
increasing interest in retrieval of objects based on similarity. We
summarize the common strategies used for similarity retrieval and
survey related work on nearest neighbor searching, including relevant
complexity results. We focus our effort on the Vector Space Model where
each object is represented as a vector of fixed dimension. Further, we
develop structures that are efficient for medium and high dimensional
vectors (4-100 dimensions) because these higher dimensions are commonly
used in similarity retrieval applications. Since we believe that query
performance is of primary importance in most applications that require
similarity retrieval, we focus on methods that create optimized R-trees
and \emph{k-d} trees given a dataset, rather than on dynamic algorithms
that we argue will almost certainly provide decreased query
performance. We present algorithms for similarity retrieval that
include support for approximate queries, including an optimal algorithm
for similarity retrieval (similarity selection) in the case where an
inexpensive similarity measure is used as a filter for a more expensive
similarity measure in an indexing structure.
In addition, we present the results of extensive empirical tests on
synthetic and real datasets that show that our optimized structures,
the VAMSplit \emph{k-d} tree and VAMSplit R-tree, are superior to the
R*-tree and SS-tree in terms of query performance, time to create an
index, and space utilization. Since many of our empirical results on
query performance are independent of the size of the dataset, they
should help researchers and practitioners predict the speedups possible
in a given application. Included are empirical results that show the
performance improvement possible when different degrees of
approximation are used. We also discuss the advantages and
disadvantages of approximation, and suggest the idea of an approximate
incremental similarity query that should be useful in interactive
applications such as visual information systems.},
}