1996
@techreport{LGT1996,
vgclass = {report},
vgproject = {nn},
author = {Steve Lawrence and C. Lee Giles and Ah Chung Tsoi},
title = {What Size Neural Network Gives Optimal Generalization?
{C}onvergence Properties of Backpropagation},
number = {CS-TR-3617},
institution = {Department of Electrical and Computer Engineering,
University of Queensland},
address = {St. Lucia 4072, Australia},
year = {1996},
abstract = {One of the most important aspects of any machine learning
paradigm is how it scales according to problem size and complexity.
Using a task with known optimal training error, and a pre-specified
maximum number of training updates, we investigate the convergence of
the backpropagation algorithm with respect to a) the complexity of the
required function approximation, b) the size of the network in relation
to the size required for an optimal solution, and c) the degree of
noise in the training data. In general, for a) the solution found is
worse when the function to be approximated is more complex, for b)
oversize networks can result in lower training and generalization
error, and for c) the use of committee or ensemble techniques can be
more beneficial as the amount of noise in the training data is
increased. In the experiments we performed, we did not obtain the
optimal solution in any case. We further support the observation that
larger networks can produce lower training and generalization error
using a face recognition example where a network with many more
parameters than training points generalizes better than smaller
networks.},
}