1992
@article{Moo1992,
vgclass = {refpap},
vgproject = {nn},
author = {John E. Moody},
title = {The \emph{Effective} Number of Parameters: An Analysis of
Generalization and Regularization in Nonlinear Learning Systems},
journal = {Advances in Neural Information Processing Systems},
volume = {4},
pages = {847--854},
year = {1992},
abstract = {We present an analysis of how the generalization
performance (expected test set error) relates to the expected training
set error for nonlinear learning systems, such as multilayer
perceptrons and radial basis functions. The principal result is the
following relationship (computed to second order) between the expected
test set and training set errors:
\begin{equation}
{\left<\varepsilon_{test}\left(\lambda\right)\right>}_{\xi\xi^\prime} \approx {\left<\varepsilon_{train}\left(\lambda\right)\right>}_{\xi} + 2\sigma^2_{eff}\frac{p_{eff}(\lambda)}{n}. \label{eq:1}
\end{equation}
Here, $n$ is the size of the training sample $\xi$, $\sigma^2_{eff}$ is
the effective noise variance in the response variable(s), $\lambda$ is
a regularization or weight decay parameter, and $p_{eff}(\lambda)$ is
the \emph{effective number of parameters} in the nonlinear model. The
expectations $\langle\cdot\rangle$ of training set and test set errors are
taken over possible training sets $\xi$ and over training and test sets
$\xi$, $\xi^\prime$, respectively. The effective number of parameters
$p_{eff}(\lambda)$ usually differs from the true number of model
parameters $p$ for nonlinear or regularized models; this theoretical
conclusion is supported by Monte Carlo experiments. In addition to the
surprising result that $p_{eff}(\lambda) \neq p$, we propose an
estimate of (\ref{eq:1}) called the \emph{generalized prediction error
(GPE)}, which generalizes well-established estimates of prediction risk
such as Akaike's \emph{FPE} and \emph{AIC}, Mallows' $C_p$, and Barron's
\emph{PSE} to the nonlinear setting.},
}