\documentclass[twoside,11pt]{article}
\usepackage{amsmath}
\usepackage{jmlr2e}

\newcommand{\sign}{\mathrm{sign}}
\newcommand{\tr}{\mathrm{tr}}
\newcommand{\tv}{\mathrm{TV}}
\newcommand{\M}{{\cal M}}
\newcommand{\N}{{\cal N}}
\newcommand{\erm}{\mathrm{erm}}
\newcommand{\sgn}{\mathrm{sgn}}
\newcommand{\entro}{\mathrm{entro}}
\newcommand{\cov}{\mathrm{cov}}

\newcommand{\ie}{{i.e.} } 
\newcommand{\cf}{{cf.} } 
\newcommand{\cL}{{\cal L} } 

\newcommand{\bx}{\mbox{\bf x}}
\newcommand{\by}{\mbox{\bf y}}
\newcommand{\bz}{\mbox{\bf z}}
\newcommand{\bu}{\mbox{\bf u}}
\newcommand{\bv}{\mbox{\bf v}}
\newcommand{\bw}{\mbox{\bf w}}
\newcommand{\bh}{\mbox{\bf h}}
\newcommand{\bg}{\mbox{\bf g}}
\newcommand{\bff}{\mbox{\bf f}}
\newcommand{\R}{{\cal R}}

\jmlrheading{2}{2002}{527-550}{6/2001}{3/2002}{Tong Zhang}
\ShortHeadings{Covering Number Bounds}{Zhang}
\firstpageno{527}

\begin{document}


\title{Covering Number Bounds of Certain Regularized Linear Function Classes}

\author{\name Tong Zhang  \email tzhang@watson.ibm.com \\
  \addr   T.J. Watson Research Center  \\
  Route 134, Yorktown Heights, NY 10598, U.S.A. \\
}

\editor{Peter L. Bartlett}

\date{}

\maketitle

\begin{abstract}
  Recently, sample complexity bounds have been derived for problems involving 
  linear functions such as neural networks and support vector machines.
  In many of  these theoretical studies, the concept of covering numbers 
  played an important role. It is thus useful to study covering numbers
  for linear function classes.
  In this paper, we investigate two closely related methods to derive upper 
  bounds on these  covering numbers. 
  The first method, already employed
  in some earlier studies, relies on the so-called Maurey's lemma; 
  the second method uses
  techniques from the  mistake bound framework in online learning.
  We compare results from these two methods, as well as their consequences in
  some learning formulations.
\end{abstract}

\begin{keywords}
  Covering Numbers, Learning Sample Complexity, Sparse Approximation, Mistake Bounds
\end{keywords}


\section{Introduction}

Assume we have 
a set of input vectors $\bx_1, \ldots, \bx_n$, with corresponding desired output
variables $y_1, \ldots, y_n$. 
The task of supervised learning is to estimate the functional relationship 
$y \approx q(\bx)$
between the input variable $\bx$ and the output variable $y$ from the 
training examples $(\bx_1, y_1), \ldots, (\bx_n, y_n)$.

A simple and useful model of an input-output functional relationship 
is to assume that the output variable can be approximately expressed as a 
linear combination of its input vector components.
With appropriate (nonlinear) features, linear models can be used to
approximate an arbitrary nonlinear function. 
One useful technique for constructing nonlinear features is 
{\em kernel methods}, where each feature is a function of the current input 
and one of the example inputs (such as their distance). If a kernel function
is positive definite, then the sample space feature representation is also 
equivalent to an implicit representation in the kernel associated reproducing 
kernel Hilbert space, which can be infinite dimensional.
Kernel methods have been successfully employed in methods such as support vector
machines and Gaussian processes \citep{CriST00}.
Furthermore, one can linearize an arbitrary nonlinear model such as a
neural network by using weighted averaging over all possible neural networks 
in the model. This approach of model combination (also called committee) has 
been widely used in machine learning to improve the predictive performance 
of a nonlinear model. For example, the recently proposed boosting algorithm 
\citep[see][]{FS97} can be considered as an implementation of this idea.

It is therefore useful to study the generalization performance of 
linear prediction methods. 
From the computational learning theory point of view, such performance 
measurements, or
sample complexity bounds, can be described by a quantity called 
{\em covering number} \citep[see][]{Pol84,Vap98}, which measures the size of a 
parametric function family.
For a two-class classification problem, its covering number can be bounded by
using a combinatorial quantity called {\em VC-dimension} \citep{Sau72,VC71}. 
More recently, researchers have discovered other
combinatorial quantities (or {\em dimensions}) 
that are useful for bounding covering numbers.
Consequently, the concept of VC-dimension 
has been generalized to more general problems
\citep{DGL96,Pol84,Vap98}.  

In a linear prediction model, we assume that the input-output functional
relationship can be expressed as $y \approx \bw \cdot \bx$, where
$\bw \cdot \bx$ denotes the inner product of vectors $\bw$ and $\bx$.
The prediction quality of this model can be measured by a loss function $\cL$, 
and our goal is to find a linear weight $\bw$ from the training data so that it
minimizes the expected loss:
\begin{align}
  & \min E_{\bx,y} \cL(\bw \cdot \bx,y)   \label{eq:reg}  \\
& \text{s.t.} \qquad g(\bw) \leq A. \nonumber
\end{align}
$E_{\bx,y}$ denotes the expectation over an unknown distribution $D$ on $(\bx,y)$.
In supervised learning, we often assume that the training data are independent
samples from $D$. 
The constraint $g(\bw) \leq A$ limit the size of the underlying linear
hypothesis family. This condition balances the prediction power and
the learning complexity of the family, and is widely used in many
recent linear prediction methods such as Gaussian process, support vector machines
and boosting. By introducing an appropriately chosen 
Lagrangian multiplier $\lambda \geq 0$ for
the constraint $g(\bw) \leq A$, the minimization of (\ref{eq:reg}) is equivalent
to minimizing 
\[
E_{\bx,y} \cL(\bw \cdot \bx,y) + \lambda g(\bw).
\]
This equivalent formulation occurs more frequently in practical applications.
Usually $E_{\bx,y}$ is replaced by the empirical expectation over the training
data, and the regularization parameter $\lambda$ is determined by cross-validation.
Similarly, if $\lambda > 0$, then we can regard $1/\lambda$ as a Lagrangian 
multiplier for the constraint $E_{\bx,y} \cL(\bw \cdot \bx,y) \leq s$, which leads
to the third equivalent formulation as follows:
\begin{align*}
  & \min g(\bw) \\
  & \text{s.t.} \qquad E_{\bx,y} \cL(\bw \cdot \bx,y) \leq s. 
\end{align*}


For a regression problem, we often choose a squared loss function 
$\cL(q,y)=(y-q)^2$.
For a binary classification problem where $y = \pm 1$, the linear
decision rule with $\bw$ is:
\[
c(\bw,\bx)=
\begin{cases}
  1 &    \mathrm{if} \quad \bw \cdot \bx >0, \\
  -1 &   \mathrm{if} \quad \bw \cdot \bx \leq 0.
\end{cases}
\]
The loss function is the classification error 
$\cL(\bw \cdot \bx ,y)= |y-c(\bw,\bx)|/2$.

Note that in the literature, one often encounters a more general type of
linear functional: $\bw \cdot \bx + b$, where $b$ is called {\em bias}.
However, one can easily convert this formulation into one in which $b$ is zero. 
This is achieved by letting
$\tilde{\bx} = [\bx, 1]$, and $\tilde{\bw} = [\bw, b]$:
$\tilde{\bw} \cdot \tilde{\bx} = \bw \cdot \bx+ b$. 
Therefore we assume a linear form with $b=0$ throughout this paper.\footnote{
The transformation may lead to a slightly different optimization problem in that
$b$ is regularized after the transformation but may not be so beforehand. 
However, the practical difference is not significant.} 

Since the classification error function is non-convex which may cause 
computational problems, one often employs a convex upper bound of 
classification error as loss function in the training. For example,
we may consider $\cL(q,y) = \log_2(1 + \exp(-qy))$ which leads 
to logistic regression.

Since the complexity regularization condition $g(\bw) \leq A$ in (\ref{eq:reg}) 
determines the shape of the underlying linear function classes, it has a 
significant impact on the generalization ability of (\ref{eq:reg}) or its 
equivalent formulations. As an example,
a support vector machine in its primal formulation can be regarded as a 
special case of (\ref{eq:reg})
with square regularization condition $g(\bw) = \bw \cdot \bw$.
For data with bounded 2-norms,
Vapnik has shown that the VC dimension of a linear function class 
bounded in 2-norm (assume it separates the input data with a positive margin)
is independent of the input space dimension. He then argued that
the generalization performance of a support vector machine does not depend
on the input data dimension. This observation is significant since it means
that with an appropriate regularization on the parameter space, the input data 
dimension does not have an adverse impact on the ability to learn from
data. This prevents the so called {\em curse-of-dimension} in many learning 
formulations. 
One natural question to ask is whether this property is unique to
the 2-norm regularization. For example, what is the implication of using 
some other regularization conditions in (\ref{eq:reg})? 
The answer to this question can be of great interest since people have already 
used different kinds of non 2-norm regularization terms in engineering 
applications.

Related to this question, there have been a number of recent works on 
large margin linear classification using non 2-norm regularization.
For example, \citet{Bar98} studied the performance of neural 
networks under the 1-norm regularization of the associated weights.
The same idea has also been applied by \citet{SFBL98} to analyze the boosting 
algorithm.
It has later been realized that these theoretical results are directly related to
some newly obtained covering number bounds for linear function classes
under appropriate regularization conditions. Consequently, a number of studies
have appeared in the last few years on covering numbers for linear function 
classes \citep{AnBar99,GBSW99,Gurvits97,WSS99,WSS00}. 

In Section~\ref{sec:linear}, 
we derive new covering number bounds, which complement and improve
results from previous studies.
In our analysis, we emphasize the importance of covering number bounds that do 
not depend on the input data dimension.
Based on these new covering number results, generalization performance of 
formulation (\ref{eq:reg}) is obtained in the PAC learning framework.
Specifically, under certain non 2-norm regularization conditions, weight $\bw$ 
computed with (\ref{eq:reg}) can also lead to generalization performances 
that do not 
deteriorate when the input dimension increases.
Note that this property has been regarded as a major theoretical advantage
for support vector machines.

The paper is organized as follows.
In Section~\ref{sec:cover}, we briefly review the concept of covering numbers
as well as some main results for analyzing the performance of a learning 
algorithm.
In Section~\ref{sec:linear}, we introduce the regularization idea. 
Our main goal is to construct regularization conditions for linear function
classes so that the resulting covering numbers are independent of the input
data dimension. We also introduce a new technique of using online learning
to prove covering number bounds. We show that this method leads to results
that improve some previous bounds.
Section~\ref{sec:example} applies the covering number bounds to analyze
some specific examples of (\ref{eq:reg}).
Section~\ref{sec:discussion} summarizes results obtained in this paper.

\section{Covering Numbers}
\label{sec:cover}
We formulate the statistical learning problem as to find a parameter from random 
observations to minimize the expected loss ({\em risk}):
given a loss function $\cL(\alpha,\bx)$ and $n$ observations 
$X_1^n=\{\bx_1,\ldots,\bx_n\}$ independently drawn from a fixed but unknown
distribution $D$, we want to find $\alpha$ that minimizes 
the true risk defined as: 
\begin{equation}
  R(\alpha) = E_{\bx} \, \cL(\alpha,\bx) = \int \cL(\alpha,\bx) \; d P_D (\bx),
  \label{eq:estimation}
\end{equation}
where $E_{\bx}$ denotes the expectation over the unknown distribution $D$.
In order to make the discussion more general, we have adopted different
notations than those in (\ref{eq:reg}). In particular, $y$ in (\ref{eq:reg})
is absorbed into the input variable $\bx$ in (\ref{eq:estimation}); 
the linear weight parameter $\bw$ in (\ref{eq:reg}) corresponds to the general
parameter $\alpha$ in (\ref{eq:estimation}).

Without any assumption of the underlying distribution $D$ on $\bx$, 
a natural method for solving (\ref{eq:estimation}) with a 
limited number of observations is the {\em empirical risk minimization} (ERM)
method \citep{Vap98}.
We choose a parameter $\alpha$ that minimizes the observed risk:
\[
  R(\alpha,X_1^n) = E_{X_1^n} \cL(\alpha,\bx)=\frac{1}{n} 
\sum_{i=1}^{n} \cL(\alpha,\bx_i),
\]
where we use $E_{X_1^n}$ to denote the empirical expectation over the observed 
data.

The learning behavior of this method with a finite sample size can be studied 
under the VC theory, which relies on the uniform convergence of the empirical 
risk to the true risk (also called the uniform law of large numbers).
Such a uniform convergence bound can be obtained from quantities that measure 
the size of a {\em Glivenko-Cantelli} class. 
For a function class containing a finite number of indices,
its size is simply measured by its cardinality.
For a general function class, a well known quantity to measure its
size, which determines the degree of uniform convergence, is 
the {\em covering number}. The covering number concept can be dated back to 
\citet{PonSch32,Kol56,KT61}: one discretizes (the discretization
process can depend on the observation $X_1^n$) the parameter space into $N$ values 
$\alpha_1, \ldots, \alpha_N$ so that each $\cL(\alpha,\cdot)$ 
can be approximated by $\cL(\alpha_i,\cdot)$ for some $i$. 
We shall only describe a simplified version relevant to our purpose. 

\begin{definition}
  Let $B$ be a metric space with metric $\rho$. 
  Given observations $X_1^n= [\bx_1,\ldots, \bx_n]$,
  and vectors $f(\alpha,X_1^n)=[f(\alpha, \bx_1), \ldots, f(\alpha, \bx_n)] \in B^n$
  parameterized by $\alpha$, the covering number in $p$-norm, denoted as
  $\N_p(f,\epsilon,X_1^n)$, is the minimum number $m$ of a collection of vectors
  $\bv_1, \ldots, \bv_m \in B^n$, such that $\forall \alpha$, $\exists \bv_j$:
  \[
  \| \rho(f(\alpha,X_1^n), \bv_j ) \|_p 
  = \left[ \sum_{i=1}^n \rho(f(\alpha,\bx_i), \bv_{j}^{i})^p \right] ^{1/p}
  \leq n^{1/p} \epsilon,
  \]
  where $\bv_{j}^{i}$ is the $i$-th component of vector
  $\bv_{j}$.
  We also define
  $\N_p(f,\epsilon,n)= \sup_{X_1^n} \N_p(f,\epsilon, X_1^n)$.
\end{definition}

Note that from the definition and Jensen's inequality,
we have $\N_p \leq \N_q$ for $p \leq q$.
We implicitly assume that the metric on the real line $\R$  is 
$|x_1 - x_2|$ unless otherwise specified.

The following theorem, which bounds the rate of uniform convergence of a function
class in terms of its covering number, is due to \citet{Pol84}:
\begin{theorem}[\citealt{Pol84}] 
  \label{thm:pol84}
  $\forall \epsilon >0$, and distribution $D$,
  \[
  P \left[\sup_{\alpha} | R(\alpha,X_1^n) - R (\alpha)| > \epsilon\right] 
  \leq 8 E [\N_1(\cL,\epsilon/8,X_1^n)] 
  \exp\left(\frac{-n\epsilon^2}{128 M^2}\right),
  \]
  where $M = \sup_{\alpha,\bx} \cL(\alpha,\bx) - \inf_{\alpha,\bx} 
  \cL(\alpha,\bx)$, and
  $X_1^n=\{\bx_1,\ldots,\bx_n\}$ are independently drawn from $D$.
\end{theorem}

Constants in the above theorem can be improved for certain problems
\citep{Dud84,Hau89,Vap98}. However, they
yield similar bounds. A result that is more relevant to our purpose is a 
lemma by \citet{Bar98}, where the $1$-norm covering number of the function class
in Theorem~\ref{thm:pol84}
is replaced by an $\infty$-norm covering number. The latter quantity
can be bounded by a scale-sensitive combinatorial dimension  
\citep{ABCH97,Gurvits97}.
Under certain circumstances, these results can replace Theorem~\ref{thm:pol84} 
to give better estimates.

However, Bartlett's lemma  is only for binary-valued function classes.
We will thus extend the result into a form that becomes comparable to
Theorem~\ref{thm:pol84}. 
In the following theorem, we replace the ``margin'' concept for 
classification problems by a notion of separation for general problems.
We also avoid introducing the concept of ``fat-shattering'' dimension which 
leads to some complicated technical manipulations \citep{Bar98}. 
There are two major differences between the following theorem and 
Theorem~\ref{thm:pol84}:
1. with the existence of a $\gamma$-separating function, we are able to 
use different accuracies $\gamma$ and $\epsilon$ respectively in the 
covering number estimate and in the Chernoff bound; 2. the covering number
used in Theorem~\ref{thm:inf-norm} does not directly correspond to that of the 
overall loss function.
\begin{theorem}
  \label{thm:inf-norm}
  Let $f_1$ and $f_2$ be two functions: $\R^n \to [0,1]$ such that 
  $|y_1 - y_2| \leq  \gamma$ implies $f_1(y_1) \leq f_3(y_2) \leq f_2(y_1)$
  where $f_3: \R^n \to [0,1]$ is a reference separating function, then
  \[
  P \left[\sup_{\alpha} [ E_{\bx} f_1(\cL(\alpha,\bx)) - 
  E_{X_1^n} f_2 (\cL(\alpha,\bx))] >  \epsilon \right] 
  \leq 4E [\N_\infty(\cL,\gamma,X_1^n)] \exp\left(\frac{-n\epsilon^2}{32}\right).
  \]
\end{theorem}
\begin{proof}
  See Appendix~\ref{apx:inf-norm}. 
\end{proof}

We say that functions $f_1: \R \to \R$ and $f_2: \R \to \R$ have a 
$\gamma$ separator if
there exists a function $f_3: \R \to \R$, 
such that $|y_1 - y_2| \leq  \gamma$ implies 
$f_1(y_1) \leq f_3(y_2) \leq f_2(y_1)$. 

Given an arbitrary function $f_1$ and $\gamma>0$, one can easily construct 
$f_2$ and $f_3$ such that $f_1$ and $f_2$ have a $\gamma$-separator $f_3$.
To see this, observe that for a function
$f(y) : \R^n \to [0,1]$, if we
define $f^\gamma(y) = \sup_{|z-y|< 2\gamma} f(z)$, then 
$f_1(y)=f(y)$ and $f_2(y)=f^{\gamma}(y)$ have a $\gamma$ separator 
$f_3(y)=f^{\gamma/2}(y)$.
Therefore Theorem~\ref{thm:inf-norm} upper bounds the true expected error 
of a function $f: \R^n \to [0,1]$ in terms of the empirical expected error
of $f^{\gamma}$. For classification problems, one usually chooses a formulation
with a function $f$ that is non-increasing.
In this case we have $f^\gamma(y) = \sup_{|z-y|< 2\gamma} f(z)= f(y-2 \gamma)$.
If $f$ is the step function such that $f(z)=1$ when 
$z \leq 0$ and $f(z)=0$ otherwise
(corresponding to the classification error function), 
then $f^\gamma$ corresponds to the
classification error with a positive margin $2 \gamma$. In this special case, 
Theorem~\ref{thm:inf-norm} yields the lemma of \citet{Bar98}.

Theorem~\ref{thm:inf-norm} leads to the following PAC style generalization
error bound for $\gamma$-separable functions $f_1$ and $f_2$: 
$\forall \eta>0$, with probability of at least $1-\eta$ over the
observed data $X_1^n$, for all $\alpha$:
\begin{equation}
  E_{\bx} f_1(\cL(\alpha,\bx)) \leq E_{X_1^n} f_2 (\cL(\alpha,\bx))
  + \sqrt{\frac{32}{n} \left(\ln 4 \N_\infty(\cL,\gamma,n) + 
      \ln \frac{1}{\eta}\right)}.
  \label{eq:inf-norm-pac}
\end{equation}
If we consider a sequence of functions $f_2^{\gamma}$ parameterized by $\gamma$,  
so that $f_1$ and $f_2^{\gamma}$ have a $\gamma$ separator, then
the above PAC bound is valid with $f_2$ replaced by $f_2^{\gamma}$
under the assumption that $\gamma$ is fixed a priori
(data independent).
However, using 
an idea described by \citet{TBWA96}, it is not difficult to give a bound 
that is uniformly valid for all $\gamma$, 
even if $\gamma$ is chosen according to the observed data:
\begin{corollary}
  Let $f_1$ be a function $\R \to \R$.
  Consider a family of functions
  $f_2^{\gamma}: \R \to \R$, parameterized by $\gamma$, such that
  $0 \leq f_1 \leq f_2^\gamma \leq 1$.
  Assume that for all $\gamma$,
  $f_1$ and $f_2^\gamma$ has a $\gamma$ separator. 
  Assume also that $f_2^{\gamma}(y) \geq f_2^{\gamma'}(y)$ when
  $\gamma \geq \gamma'$. 
  Let $\gamma_1 > \gamma_2 > \cdots$ be a decreasing sequence of parameters, 
  and $p_i$ be a sequence of positive numbers such that 
  $\sum_{i=1}^{\infty} p_i=1$, then
  for all $\eta >0$, with probability of at least $1-\eta$ over data:
  \[
  E_{\bx} f_1(\cL(\alpha,\bx)) \leq E_{X_1^n} f_2^\gamma (\cL(\alpha,\bx))
  + \sqrt{\frac{32}{n} \left(\ln 4 \N_\infty(\cL,\gamma_i,X_1^n) + 
      \ln \frac{1}{p_i \eta} \right)}
  \]  
  for all $\alpha$ and $\gamma$, where for each fixed $\gamma$, we
  use $i$ to denote the 
  smallest index such that $\gamma_i \leq \gamma$.
  \label{cor:gamma_uniform}
\end{corollary}
\begin{proof}
The result follows from Theorem~\ref{thm:inf-norm} and basic probability arguments
presented by \citet{TBWA96}.
$\forall i>0$, with probability at most $p_i \eta$
over $X_1^n$, we have
\[
 E_{\bx} f_1(\cL(\alpha,\bx)) > E_{X_1^n} f_2^{\gamma_{i}} (\cL(\alpha,\bx))
 + \sqrt{\frac{32}{n} \left(\ln 4 \N_\infty(\cL,\gamma_i,X_1^n) + 
   \ln \frac{1}{p_i \eta}\right)}.
\]
Summing up over $i$, with probability at most $\eta$ over $X_1^n$, 
\[
 E_{\bx} f_1(\cL(\alpha,\bx)) > E_{X_1^n} f_2^{\gamma_{i}} (\cL(\alpha,\bx))
 + \sqrt{\frac{32}{n} \left(\ln 4 \N_\infty(\cL,\gamma_i,X_1^n) + 
   \ln \frac{1}{p_i \eta}\right)}.
\]
for at least one $i$, which implies the corollary.
\end{proof}

We can further extend the above corollary which is useful for analyzing 
the generalization performance of (\ref{eq:reg}).
\begin{corollary}
  Under the assumptions of Corollary~\ref{cor:gamma_uniform}.
  We further assume that there is a sequence of functions 
  $\cL_1(\alpha,\bx), \cL_2(\alpha,\bx), \ldots$ with $\alpha$ 
  respectively defined on the parametric spaces 
  $\Gamma_1, \Gamma_2, \ldots$. 
  Let $q_i$  be a sequence of positive numbers such that 
  $\sum_{i=1}^{\infty} q_i=1$, then
  for all $\eta >0$, with probability of at least $1-\eta$ over data:
  \[
  E_{\bx} f_1(\cL_j(\alpha,\bx)) \leq E_{X_1^n} f_2^\gamma (\cL_j(\alpha,\bx))
  + \sqrt{\frac{32}{n} \left(\ln 4 \N_\infty(\cL_j,\gamma_i,X_1^n) + 
    \ln \frac{1}{p_i q_j\eta}\right)}
  \]    
  for all $j$, $\alpha \in \Gamma_j$, and $\gamma \in [0,1]$, 
  where for each fixed $\gamma$, we  use $i$ to denote the 
  smallest index such that $\gamma_i \leq \gamma$.
  \label{cor:para_uniform}
\end{corollary}
\begin{proof}
Similar to the proof of Corollary~\ref{cor:gamma_uniform}.
$\forall j>0$, with probability at most $q_j \eta$ over $X_1^n$, we can find
$\alpha$ and $\gamma$ such that
\[
 E_{\bx} f_1(\cL_j(\alpha,\bx)) > E_{X_1^n} f_2^{\gamma_{i}} (\cL_j(\alpha,\bx))
 + \sqrt{\frac{32}{n} \left(\ln 4 \N_\infty(\cL_j,\gamma_i,X_1^n) + 
   \ln \frac{1}{p_i q_j \eta}\right)}.
\]
Summing the probability over $j$, we obtain the corollary.
\end{proof}

If close to perfect generalization can be achieved, i.e.
$E_{X_1^n} f_2^\gamma (\cL(\alpha,\bx)) \approx 0$, we can obtain better bounds
by using a refined version of the Chernoff bound 
where the quantity $-2n\epsilon^2$ on the exponent can be replaced by 
$-n\epsilon^2/2 (E f + \epsilon)$ if the empirical mean is larger than the true
mean; and by $-n\epsilon^2/E f$ if the empirical mean is smaller than the 
true mean.
In the extreme case that there is always a choice of $\alpha$ that achieves 
the perfect generalization:
$E_{\bx} f_2^\gamma (\cL(\alpha,\bx)) = 0$, we can assume that our choice of 
$\alpha(X_1^n)$ satisfies
$E_{X_1^n} f_2^\gamma (\cL(\alpha,\bx)) = 0$. Under this assumption,
bounds in this section can be improved substantially if we replace the
standard Chernoff bound by the refined Chernoff bound. 
Specifically, a PAC bound in the order of
$O\left(\frac{1}{n} \log \N_\infty\right)$ can be obtained, rather than 
a bound in the order of
$O\left(\sqrt{\frac{1}{n} \log \N_\infty}\right)$ as in the standard case
(\ref{eq:inf-norm-pac}).


\section{Covering Number Bounds for Linear Function Classes}
\label{sec:linear}

Theorems in Section~\ref{sec:cover} indicate that
covering numbers of a function class
play crucial roles in the uniform convergence behavior of its
members' empirical risks to their corresponding true risks.
A bound on the rate of uniform convergence directly implies a bound on 
the generalization ability of an empirical risk minimization algorithm. 

In this section, we derive  some new covering number results for real valued linear 
function classes of the following form:
\begin{equation}
  L(\bw,\bx)= \bw \cdot \bx = \sum_{j=1}^d \bx^j \bw^j.
\label{eq:lin_loss}
\end{equation}
We use $\bx^j$ to denote the $j$-th component of the observation vector $\bx$.
We also use $\bw$ to denote the linear
weight, and $\bw^j$ to denote its $j$-th component.
$d$ is the dimension of the system.

Results in this section complement related results in many previous
studies such as those by \citet{Bar98,GBSW99,Mend01-geom,TBWA96,WSS99,WSS00}. 
From theorems in Section~\ref{sec:cover}, we see that 
covering number bounds for (\ref{eq:lin_loss})
are relevant to the learning behavior of (\ref{eq:reg}). 
However, keep in mind that in this Section, notation $L$ in
(\ref{eq:lin_loss}) is used to denote a linear function class.  This should not
be confused with $\cL$ in (\ref{eq:reg}), which is used to denote a 
specific loss function in a specific learning formulation. 

Covering number results of a linear function class such as those
we derive in this section can also be used to derive covering numbers for
certain nonlinear function classes.
For example, if we can write a loss function as
$\cL(w,x,y)=\rho(f(w,x),y)$  where $\rho$ is a Lipschitz function, then
covering number bounds of $\cL$ can be obtained using covering numbers
of $f(\cdot)$  \citep[see Lemma 17.6 of][]{AnBar99}. Using this result, it is 
clear that bounds derived in this section can also be used to
obtain covering numbers for more complicated functions 
such as neural networks and support vector machines.

We start with covering number results for Theorem~\ref{thm:pol84}.
Because $\N_1 \leq \N_2$, therefore in order to apply Theorem~\ref{thm:pol84}, 
it is sufficient to estimate $\N_2(L,\epsilon,n)$ for $\epsilon>0$.
It is clear that $\N_2(L,\epsilon,n)$ is not finite if no
restrictions on $\bx$ or $\bw$ are imposed.
Therefore in the following, we assume the condition that $\|\bx_i\|_p$ is 
bounded for observed data.
We then focus on the form of regularization conditions on $\|\bw\|_q$, so that 
$\log\N(f,\epsilon,n)$ is independent (or weakly dependent) of $d$.

We start our analysis with a lemma that is attributed to Maurey 
\citep[also see][]{Bar93,Jon92}.
\begin{lemma} [Maurey]
  In a Hilbert space, let $f=\sum_{j=1}^d w_j \bg^j$, where each 
  $\|\bg^j\| \leq b$,
  $w_j \geq 0$ and  $\alpha = \sum_{j=1}^d w_j \leq 1$,
  then for every $n \geq 1$, there exist non-negative integers 
  $k_1, \ldots, k_d \geq 0$, such that $\sum_{j=1}^d k_j \leq n$ and
  \[
  \left\| f - \frac{1}{n} \sum_{j=1}^d k_j \bg^j \right\|^2 \leq 
  \frac{\alpha b^2-\|f\|^2}{n}.
  \]
  \label{lem:maurey}
\end{lemma}

Our first result generalizes a theorem of \citet{Bar98}.
The original result was with $p=\infty$ and $q=1$;
some related techniques have also been used by \citet{LBW96} and \citet{SFBL98}.  
We would like to mention that it is possible to prove a better bound
(when $p<\infty$) using machineries from the geometric theory of
Banach spaces. However, the technique presented here is more elementary and
self-contained. It is directly comparable to the idea of using online 
mistake bounds to obtain $\infty$-norm covering number bounds which we shall 
investigate later. This comparison provides useful insights.
\begin{theorem}
  \label{thm:p-q-norm}
  If $\|\bx\|_p \leq b$, and $\|\bw\|_q \leq a$, where 
  $1/p + 1/q =1$ and $2 \leq p \leq \infty$, then
  \[
  \log_2 \N_2(L,\epsilon,n) \leq \left\lceil \frac{a^2b^2}{\epsilon^2} \right\rceil 
  \log_2(2d +1).
  \]
\end{theorem}
\begin{proof}
Consider matrix $X= [\bx_1, \ldots, \bx_n]^T$, where each $\bx_i$ is
a column vector and $T$ is the matrix transpose operator.
Denote the columns of $X$ by $\by^1, \ldots, \by^d$.
Let 
\[
\bg^j= \frac{n^{1/p}ab}{\|\by^j \|_p} \by^j, \qquad 
w_j' = \frac{\|\by^j\|_p}{n^{1/p}ab} \bw^j,
\]
where $\bw^j$ is the $j$-th component of vector $\bw$.
By H\"{o}lder's inequality, it is easy to check that 
\begin{align*}
\sum_{j=1}^d |w_j'| & = \sum_{j=1}^d \frac{\|\by^j\|_p}{n^{1/p}ab} |\bw^j|   \\
& \leq \frac{1}{n^{1/p}ab} \left(\sum_{j=1}^d \|\by^j\|_p^p \right)^{1/p} 
  \left(\sum_{j=1}^d |\bw^j|^q \right)^{1/q} \\
& \leq \frac{1}{n^{1/p}ab} (n b^p)^{1/p} a =1. 
\end{align*}
Since the function $x^{p/2}$ is convex, thus by Jensen's inequality, we obtain
$n^{-1/2} \|\by^j\|_2 \leq n^{-1/p} \|\by^j\|_p$ for all $j$.
This implies that $\|\bg^j\|_2 \leq n^{1/2}ab$.
Therefore by Lemma~\ref{lem:maurey}, if we let $k \geq (ab/\epsilon)^2$, then
$\forall \bz = \sum_{j=1}^d \bw^j \by^j
=\sum_{j=1}^d |w_j'| (\sgn(w_j') \bg^j)$, we can find
integers $k_1,\ldots,k_d$ such that $\sum_{j=1}^d |k_j| \leq k$ and
\[
\left\| \bz - \frac{1}{k} \sum_{j=1}^d k_j \bg^j \right\|_2^2 
\leq \frac{na^2b^2}{k} \leq n \epsilon^2.
\]
This means that the covering number $\N_2(L,\epsilon,n)$ is no larger than
the number of integer solutions of $\sum_{j=1}^d |k_j| \leq k$, which is
less than or equal to $(2d+1)^k$.
\end{proof}

In the above proof, a more careful analysis of
the number of possible solutions of $\sum_{j=1}^d |k_j| \leq k$
can lead to a bound of $(\frac{2e (d+k)}{k})^k$. 
Although when $k$ is large this bound is tighter than 
the bound $(2d+1)^k$ used in Theorem~\ref{thm:p-q-norm}, the difference is 
relatively minor since the dominant contribution is the exponent $k$. We have
thus chosen the more compact expression $(2d+1)^k$.

The above bound on the (logarithmic) covering number depends logarithmically on 
$d$, which is already quite weak (compared to the linear $d$-dependency in 
the standard situation without regularization). However, it is 
also possible to remove the dimensional dependency. 
We demonstrate for the 
case of $p=2$ in the following corollary.
\begin{corollary}
  If $\|\bx\|_2 \leq b$ and $\|\bw\|_2 \leq a$, then
  \[
  \log_2 \N_2(L,\epsilon,n) \leq \left\lceil \frac{a^2b^2}{\epsilon^2} \right\rceil 
  \log_2(2n +1).
  \]
  \label{cor:2-2-norm}
\end{corollary}
\begin{proof}
For a sequence of data $\bx_1, \ldots, \bx_n$,
denote by $S$ the subspace spanned by $\bx_1,\ldots, \bx_n$.
Denote by $P_S(\bw)$ the orthonormal projection operator that projects $\bw$ onto
the subspace $S$. Clearly, $P_S(\bw) \cdot \bx_i= \bw \cdot \bx_i$ for all 
$i=1,\ldots,n$. This means that a data-dependent cover of 
$L(P_S(\bw),\bx)$ also gives a data-dependent cover of 
$L(\bw,\bx)$ in (\ref{eq:lin_loss}).
To bound the 2-norm covering number of $L(P_S(\bw),\bx)$, we simply observe that
 $\|P_S(\bw)\|_2 \leq \|\bw\|_2 \leq a$ and that
$S$ is of dimension at most $n$. We thus obtain the corollary from
Theorem~\ref{thm:p-q-norm}.
\end{proof}


Intuitively, the reason  that we can remove the dimensional dependency of the 
2-norm covering number in Corollary~\ref{cor:2-2-norm} is based on the observation
that (in the case of $p=2$) the effective dimension
of $\bw$, acted on $n$ data $\bx_1,\ldots,\bx_n$, is at most $n$. 
Using the same underlying idea, it is also possible to obtain dimension
independent covering number results for the case $2 \leq p < \infty$.
In general, for regularization condition $g(w) \leq a$, one needs to cover 
$\bw$ with weights in the form of $\nabla g^{-1}(\sum_{i=1}^n \alpha_i \bx_i)$,
where $\nabla g^{-1}$ is the inverse function of the gradient $\nabla g$.
This changes the dimension $d$ of $\bw$ to the dimension $n$ of $\alpha$.

Naturally the above discussion  leads to the idea of using the mistake
bound framework in online learning to obtain covering number bounds. 
In online learning, 
one represents the weight vector as $\nabla g^{-1}(\sum_{i=1}^n \alpha_i \bx_i)$, 
where the function $\nabla g^{-1}$ is called a transfer function 
\citep{GLS01,GW98,KW97}. An online mistake bound can be regarded as 
an approximation bound for this representation. At each step, we look at
a data point $\bx_i$, and check the prediction (such as classification)
associated with the current 
weight representation. We add $\bx_i$ into the representation only when a
mistake is made. The mistake bound analysis by \citet{GLS01} shows that 
for certain classification problems, there exists a quantity $M$ so that 
after $M$ components are added in the representation, no more mistakes will be
made (thus no more components will be added). In this regard, 
the sparse representation in Maurey's lemma
(Lemma~\ref{lem:maurey}) corresponds to the sparse representation in the mistake
bound framework \citep{GLS01}. We can thus use the latter to upper bound the 
sparsity $k$ in the representation $\nabla g^{-1}(\sum_{j=1}^k \eta \bx_{i_j})$
such that 
$|\nabla g^{-1}(\sum_{j=1}^k \eta \bx_{i_j}) - \bw \cdot \bx_i| \leq \epsilon$ 
for all $i$ with any specified $\epsilon>0$. This idea is rigorously carried
out in the proof of Theorem~\ref{thm:pq_online} below. 

Also note that using online learning, we are able to directly
obtain bounds for $\infty$-norm covering numbers, which are not only useful for
Theorem~\ref{thm:pol84} (since $\N_1 \leq \N_\infty$), but also useful for
Theorem~\ref{thm:inf-norm}. Therefore we shall not further consider the idea
of using Maurey's lemma and restrict the effective dimension to remove 
the $d$-dependency as in Corollary~\ref{cor:2-2-norm}. We will only focus on
using online learning techniques to directly obtain $\infty$-norm covering 
numbers.

We shall mention that traditionally, $\infty$-norm covering numbers are obtained
through the so-called ``fat-shattering'' dimension \citep{ABCH97}. The latter can
be bounded using various methods \citep{Bar98,Gurvits97,TBWA96}. 
However, due to the extra stage of estimating the ``fat-shattering'' dimension, 
this approach leads to bounds that are worse than our bounds 
that are directly obtained. For example, the ``fat-shattering''
approach would have led to a bound of the order
$\log_2 \N_\infty(L,\epsilon,n)=
O(\frac{a^2 b^2}{\epsilon^2}\log_2 (n/\epsilon+1)^2)$ for $p=2$ in 
Theorem~\ref{thm:pq_online}.
Since  covering numbers (rather than fat-shattering dimensions) have more 
direct learning consequences,
our results can lead to better learning bounds than what can be obtained from
these earlier studies (see Section~\ref{sec:example}).

In the proof of Theorem~\ref{thm:pq_online}, we need the following 
mistake bound result that was proved by \citet{GLS01}. The bound indicates
that we can approximate a target vector $\bw$ (that satisfies a margin property) 
using no more than $M$ data points, so that the inner product of 
the approximation vector and any data point $x_i$ has the same sign as that of 
$\bw \cdot \bx_i$. 

\begin{proposition}
  Consider $2 \leq p < \infty$.
  Let $\bw$ be a target vector, and $\{\bx_i\}$ be a countable set of
  example vectors. Assume that $\delta = \inf_i \bw \cdot \bx_i >0$, and let
  \[
  M = \frac{(p-1) \|\bw\|_q^2 \sup_i \|\bx_i\|_p^2}{\delta^2}.
  \]
  Then there exists an integer sequence $i_1,\ldots, i_k$ where 
  $k \leq M$, and a vector $\hat{\bw}$ 
  defined as $\hat{\bw}=f_p(\sum_{\ell=1}^k \bx_{i_\ell})$
  so that $\hat{\bw} \cdot \bx_i >0$  for all $i$. 
  $f_p(\bx)$ is a component-wise function that maps each component 
  $\bx^j$ of $\bx$ to $p \cdot \sign(\bx^j) |\bx^j|^{p-1}$.
  \label{prop:mistake-bound-pq}
\end{proposition}

\begin{theorem}
  If $\|\bx\|_p \leq b$ and $\|\bw\|_q \leq a$, where
  $2 \leq p <\infty$ and $1/p + 1/q =1$, then $\forall \epsilon>0$,
  \[
  \log_2 \N_\infty(L,\epsilon,n) \leq 
  36 (p-1) \frac{a^2 b^2}{\epsilon^2} 
  \log_2 [2 \lceil 4ab/\epsilon+2 \rceil n+1].
  \]
  \label{thm:pq_online}
\end{theorem}
\begin{proof}
If $\epsilon>ab$, then since $| \bw \cdot \bx_i| \leq ab$ for all $i$, we can
choose $0$ as a cover and the theorem follows trivially. In the following
we assume that $\epsilon \leq ab$.
 
We divide the interval $[-ab-\epsilon/2,ab+\epsilon/2]$ 
into $m=\lceil 4ab/\epsilon+2 \rceil$ sub-intervals, each of size no larger
than $\epsilon/2$. 
Let $-ab-\epsilon/2=\theta_0 < \theta_1 < \cdots < \theta_m=ab+\epsilon/2$
be the boundaries of the intervals so that 
$\theta_{j}-\theta_{j-1} \leq \epsilon/2$ for all $j$.
For a sample $X_1^n=\{\bx_1,\ldots,\bx_n\}$, consider the sets 
$S_1=\{(\bx_i, -\theta_j/a): i=1,\ldots,n; j=0,\ldots,m-1\}$ and
$S_2=\{(-\bx_i,\theta_j/a): i=1,\ldots,n; j=1,\ldots,m\}$.

For all $\bw$ such that $\|\bw\|_q \leq a$, consider the set of values
$\bw \cdot \bx_i - \theta_{j_1(i,\bw)}$ and 
$- \bw \cdot \bx_i + \theta_{j_2(i,\bw)}$ for all $i$.
We use $j_1(i,\bw)$ to denote the maximum index of $\theta_j$ such that
$\bw \cdot \bx_i - \theta_{j_1(i,\bw)} \geq \epsilon/2$; and
use $j_2(i,\bw)$ to denote the minimum index of $\theta_j$ such that
$\bw \cdot \bx_i - \theta_{j_2(i,\bw)} \leq -\epsilon/2$.

Now, we consider $(\by,z)$ such that $\forall i$:
$\by \cdot \bx_i - z \theta_{j_1(i,\bw)} >0$ and $- \by \cdot \bx_i + 
z \theta_{j_2(i,\bw)} >0$. Since $\theta_{j_1(i,\bw)} < \theta_{j_2(i,\bw)}$,
it follows that $z > 0$ and 
$\forall i: \by \cdot \bx_i /z \in (\theta_{j_1(i,\bw)},\theta_{j_2(i,\bw)})$.
This implies that $|\by \cdot \bx_i/z - \bw \cdot \bx_i| < \epsilon$ for all $i$.

Next we show how to construct such a pair of $(\by,z)$. 
Let $f_p(z) = p \cdot \sign(z) |z|^{p-1}$, and
\[
M=36 (p-1) \frac{a^2 b^2}{\epsilon^2} 
\geq 
\frac{(p-1)}{(\epsilon/2)^2} 
(\|\bw\|_q^q + a^q)^{2/q} \sup_i(\|\bx_i\|_p^p+(b+\epsilon/2a)^p)^{2/p}.
\]
Using Proposition~\ref{prop:mistake-bound-pq}, we know
that $\forall \|\bw\|_q \leq a$, 
there exist non-negative integer sequences $\alpha_i$ and $\beta_i$
where $\sum_{i=1}^n (\alpha_i + \beta_i) \leq M$, with the following
property: if we let
\[
(\by, a z) = f_p\left(\sum_i \alpha_i (\bx_i, -\theta_{j_1(i,\bw)}/a)
+ \sum_i \beta_i (-\bx_i,\theta_{j_2(i,\bw)}/a)\right),
\]
then
$\by \cdot \bx_i - z \theta_{j_1(i,\bw)} >0$ and 
$-\by \cdot \bx_i + z \theta_{j_2(i,\bw)} >0$
for all $i$.

It follows from the above discussion that 
$|\by \cdot \bx_i/z - \bw \cdot \bx_i| < \epsilon$ for all $i$.
This implies that the infinity-norm covering number 
$\N_\infty(L,\epsilon,n)$ is no more than the number of possible
$(\by,z)$ constructed above. It is clear that this number is no more than 
the number of non-negative integer solutions of 
\[
\sum_{i,j} n_{i,j} + \sum_{i,j} m_{i,j} \leq M,
\]
where $(i,j)$ goes through the index of $S_1$ for $n_{i,j}$ and 
the index of $S_2$ for $m_{i,j}$. Since the
number of solutions is no more than $(|S_1|+|S_2|+1)^M$, we obtain
\[
\log_2 \N_\infty(L,\epsilon,n) \leq 
36 (p-1) \frac{a^2 b^2}{\epsilon^2} \log_2 [2 \lceil 4ab/\epsilon+2 \rceil n+1].
\]
\end{proof}

The bound given in Theorem~\ref{prop:mistake-bound-pq} is comparable to
related results by \citet{WSS00}, which were obtained by using 
a different technique relying on the operator theory of Banach spaces. 
However, in certain cases, our method
may yield results that are difficult to obtain from the Banach space approach
considered by \citet{WSS00}. For example, it is difficult to obtain the entropy 
regularization result in Theorem~\ref{thm:infty_online} using their method.
One reason is that a topological structure is necessary for
the argument of \citet{WSS00} to go through. As we shall see later, our analysis
only involves some analytical structures of an appropriately defined pair of 
dual convex functions on the linear weight space and the sample space. 
On one hand, the norm of a 
Banach space naturally leads to a convex function on the underlying space; 
on the other hand, the concept of convex function can be defined on any linear
vector space that does not necessarily have a norm or topological structure.

Note that we have made no attempt to optimize the constants
in the proof of Theorem~\ref{thm:pq_online}. For example,
since $\theta_0=-ab-\epsilon/2$ and $\theta_m=ab+\epsilon/2$ 
are quite artificially introduced for the mere purpose of consistent 
indexing, we can easily obtain an improved version of 
Theorem~\ref{thm:pq_online} by simply ignoring them.
Also note that $\N_2 \leq \N_\infty$, therefore Theorem~\ref{thm:pq_online} 
implies dimension independent
2-norm covering number bounds for $2 \leq p <\infty$, which gives better results
than Theorem~\ref{thm:p-q-norm} in the sense of dimensional dependency. 
The bound in Theorem~\ref{thm:pq_online} diverges as $p \to \infty$.
In the case of $p = \infty$, we show that an entropy condition can be used to obtain
dimension independent covering number bounds. 
This entropy condition is related to multiplicative update methods
widely studied in online learning. To our knowledge, there are no previous covering
number results for entropy regularization.
We first introduce the following definition:
\begin{definition}
  Let $\mu=[\mu_j]$ be a vector with positive entries such that 
  $\|\mu\|_1=1$ (in this case, we call $\mu$ a distribution vector). 
  Let $\bx=[\bx^j] \neq 0$ be a vector of the same dimension, then
  we define the weighted relative entropy of $\bx$ with respect to $\mu$ as:
  \[
  \entro_\mu(\bx) = \sum_j |\bx^j| \ln \frac{|\bx^j|}{\mu_j \|\bx\|_1}.
  \]
\end{definition}

It is a well-known fact that the relative entropy defined above is always
non-negative, and $\entro_\mu(\bx)=0$ only when $|\bx|=\|\bx\|_1 \cdot \mu$.
Before the main theorem, we need a lemma that refines and generalizes the 
analysis of \citet[Section 6]{GLS01}. Note that for our purpose, their result is 
not directly applicable. Related techniques have also been used by 
\citet{GW98,KW97}. In the following lemma, $\bx^j_{i}$ indicates
the $j$-th component of vector $\bx_i$.
\begin{lemma}
  Let $\mu$ be a distribution vector and $\bw$ be a vector with
  non-negative entries such that $\|\bw\|_1 \leq W$.
  Assume that $\{\bx_i\}$ is a countable set of
  example vectors such that  $\inf_i \bw \cdot \bx_i >0$.
  $\forall \delta \in (0,\min_i \bw \cdot \bx_i]$, let
  \[
  m(\delta)= \frac{2 \sup_i \|\bx_i\|_\infty^2 W \cdot \entro_\mu(\bw)}{\delta^2}.
  \]
  Then there exists an integer sequence $i_1, \ldots, i_k$ where
  $k \leq m(\delta)$, and a vector $\hat{\bw}$ with its $j$-th component
  defined as
  $\hat{\bw}^j = \mu_j \exp( \eta \sum_{\ell=1}^k \bx^j_{i_\ell})$, 
  where $\eta = \delta/ (W \sup_i \|\bx_i\|_\infty^2)$,
  so that $\hat{\bw} \cdot \bx_i >0$ for all $i$.
  \label{lem:infty_mistake}
\end{lemma}
\begin{proof}
Without loss of generality, we assume that $\|\bw\|_1=1$.
Let $\bz$ be a vector, and consider the convex dual of
$\sum_{j=1}^d \bw^j \ln \frac{\bw^j}{\mu_j}$:
\[
\ln \sum_{j=1}^d \mu_j e^{\bz^j}
= \sup_{\|\bw\|_1=1} \left[\bw \cdot \bz - 
\sum_{j=1}^d \bw^j \ln \frac{\bw^j}{\mu_j} \right].
\]
The definition of this duality implies that the quantity
\[
M(\bz) = \ln \sum_{j=1}^d \mu_j e^{\bz^j} - \bw \cdot \bz
+ \sum_{j=1}^d \bw^j \ln \frac{\bw^j}{\mu_j}
\]
is always non-negative.

Assume now that the theorem is not true, then there exists 
a sequence of integers $i_1,\ldots, i_k$ where $k >m(\delta)$ such that
if we define a sequence of vectors $\bz_\ell$ as
$\bz_\ell = \bz_{\ell-1} + \eta \bx_{i_\ell}$ with $\bz_0=0$, then
$\sum_{j=1}^d \mu_j \exp(\bz_{\ell-1}^j) \bx_{i_\ell}^j \leq 0$
for $\ell=1,2,\ldots,k$.

Note that for all pairs of vectors $(\bv, \Delta \bv)$:
\[
  \frac{d}{dt} \ln \sum_{j=1}^d \mu_j e^{\bv^j+\Delta \bv^j t} =  
  \frac{ \sum_{j=1}^d \mu_j e^{\bv^j +\Delta \bv^j t} \Delta \bv^j}
  {\sum_{j=1}^d \mu_j e^{\bv^j + \Delta \bv^j t}}
\]
and
\begin{align*}
  \frac{d^2}{d t^2} \ln \sum_{j=1}^d \mu_j e^{\bv^j + \Delta \bv^j t}  
  \leq
  \frac{ \sum_{j=1}^d \mu_j e^{\bv^j + \Delta \bv^j t} \Delta \bv^{j \,2}}
  {\sum_j \mu_j e^{\bv^j +\Delta \bv^j t}}.
\end{align*}
Therefore by Taylor expansion, we know that there exists $t \in [0,1]$ 
such that
\begin{align*}
\ln \sum_{j=1}^d \mu_j e^{\bz_{\ell}^j} \leq & 
\ln \sum_{j=1}^d \mu_j e^{\bz_{\ell-1}^j} +   
\frac{ \sum_{j=1}^d \mu_j e^{\bz_{\ell-1}^j} \eta \bx_{i_\ell}^j}
{\sum_{j=1}^d \mu_j e^{\bz_{\ell-1}^j}} +
\frac{\eta^2}{2} 
\frac{\sum_{j=1}^d \mu_j 
e^{\bz_{\ell-1}^j+\eta \bx_{i_\ell}^j t} \bx_{i_\ell}^{j \, 2}}
{ \sum_{j=1}^d \mu_j e^{\bz_{\ell-1}^j + \eta \bx_{i_\ell}^j t }} \\
\leq &
\ln \sum_{j=1}^d \mu_j e^{\bz_{\ell-1}^j} +   
\frac{\eta^2}{2} \frac{\sum_{j=1}^d \mu_j
e^{\bz_{\ell-1}^j +\eta \bx_{i_\ell}^j t} \bx_{i_\ell}^{j \, 2}}
{ \sum_{j=1}^d \mu_j e^{\bz_{\ell-1}^j +\eta \bx_{i_\ell}^j t}} \\
\leq & \ln \sum_{j=1}^d 
\mu_j e^{\bz_{\ell-1}^j} + \frac{\eta^2}{2} \|\bx_{i_\ell}\|_\infty^2. 
\end{align*}
Note that we have used the assumption
that $\sum_{j=1}^d \mu_j \exp(\bz_{\ell-1}^j) \bx_{i_\ell}^j \leq 0$
in the above derivation.
We now obtain
\begin{align*}
M(\bz_\ell) -M(\bz_{\ell-1}) = &
\ln \frac{\sum_{j=1}^d \mu_j e^{\bz_{\ell}^j}}
{\sum_{j=1}^d \mu_j e^{\bz_{\ell-1}^j}}
- \bw \cdot \eta \bx_{i_\ell} \\
\leq & 
\frac{\eta^2}{2} \|\bx_{i_\ell}\|_\infty^2 - \eta \delta. 
\end{align*}
Summing the above inequality over $\ell$, and note that
$W \geq \|\bw\|_1=1$:
\begin{align*}
M(\bz_k) <& 
M(\bz_0) + m(\delta) (\frac{\eta^2}{2} \sup_i \|\bx_{i}\|_\infty^2 - \eta \delta) \\
=& \entro_\mu(\bw) + 
m(\delta)(\frac{\eta^2}{2} \sup_i \|\bx_{i}\|_\infty^2 - \eta \delta)
\leq 0,
\end{align*}
which is a contradiction since $M(\bz_k)$ is always non-negative.
\end{proof}

\begin{theorem}
  Given a distribution vector $\mu$,
  if $\|\bx\|_\infty \leq b$ and $\|\bw\|_1 \leq a$
  and $\entro_\mu(\bw) \leq c$, 
  where we assume that $\bw$ has non-negative entries, then
  $\forall \epsilon>0$,
  \[
  \log_2 \N_\infty(L,\epsilon,n) \leq  \frac{36 b^2 (a^2+ac)}{\epsilon^2}
  \log_2 [2 \lceil 4ab/\epsilon+2 \rceil n+1].
  \]
  \label{thm:infty_online}
\end{theorem}
\begin{proof}
The proof follows the same steps of Theorem~\ref{thm:pq_online}.
We let $\mu'=[\mu,1]/2$ and $\bw'=[\bw, a]$. We have $\|\bw'\|_1 \leq 2a$,
and $\entro_{\mu'}(\bw') \leq \entro_{\mu}(\bw) + a \ln 2 < a + c $.
Similarly, the expansion $\bx_i'$ of $\bx_i$ (by appending a component $\theta/a$)
satisfies $\|\bx_i'\|_\infty \leq 1.5 b$
(again, we assume that $\epsilon/a \leq b$).

We now apply the mistake bound in Lemma~\ref{lem:infty_mistake},
where we set $\delta = \epsilon/2$ and $W=2a$. We can define $M$ as
\[
M =  \frac{36(a+c)ab^2}{\epsilon^2}
\geq \frac{2}{\delta^2} \sup_i \|\bx_i'\|_\infty^2 W \cdot \entro_{\mu'}(\bw').
\]
The remaining part of the proof is the same as that of Theorem~\ref{thm:pq_online}.
\end{proof}

\begin{corollary}
  Given a distribution vector $\mu$,
  if $\|\bx\|_\infty \leq b$ and $\|\bw\|_1 \leq a$
  and $\entro_\mu(\bw) \leq c$, then    $\forall \epsilon>0$,
  \[
  \log_2 \N_\infty(L,\epsilon,n) \leq  \frac{288 b^2 (2a^2+ac)}{\epsilon^2}
  \log_2 [2 \lceil 8ab/\epsilon+2 \rceil n+1].
  \]
  \label{cor:infty_online}
\end{corollary}
\begin{proof}
Define vector $\bu$ component-wise as $\max(\bw,0)$, and 
similarly define $\bv=\max(-\bw,0)$. By definition,
$\bw=\bu-\bv$ and $\|\bu\|_1, \|\bv\|_1 \leq \|\bw\|_1$.
For all $L=L_1- L_2$, we have
$\N_\infty(L,\epsilon,n)
\leq \N_\infty(L_1,\epsilon/2,n) \cdot \N_\infty(L_2,\epsilon/2,n)$.
Therefore we only need to show that
$\entro_\mu(\bu) \leq \entro_\mu(\bw)+ \|\bw\|_1$.
To prove this, we shall assume that $\|\bw\|_1=1$ without loss of generality,
and $\bu,\bv \neq 0$. Since $\|\bu\|_1 + \|\bv\|_1=1$,
\[
\|\bu\|_1 \ln \frac{1}{\|\bu\|_1} +  \|\bv\|_1 \ln \frac{1}{\|\bv\|_1} 
\leq \ln 2 \leq \ln 2 + \sum_{j=1}^d \bv^j \ln \frac{\bv^j}{\|\bv\|_1 \mu_j}.
\]
The above inequality can be rewritten as
\[
\sum_{j=1}^d \bu^j \ln \frac{\bu^j}{\mu_j \|\bu\|_1}
\leq \ln2 + \sum_{j=1}^d \bu^j \ln \frac{\bu^j}{\mu_j}
+ \sum_{j=1}^d \bv^j \ln \frac{\bv^j}{\mu_j}.
\]
That is, $\entro_\mu(\bu) \leq \entro_\mu(\bw)+ \ln 2$. 
\end{proof}

Note that we don't require the dimension to be finite.
However, assume that the dimension $d$ is finite, and we let $\mu_j=1/d$. Then
it is easy to check that $\forall \bw, \entro_{\mu}(\bw) \leq \|\bw\|_1 \ln d$.
Therefore by Corollary~\ref{cor:infty_online}, we obtain the following
result which gives a better bound than a similar result of \citet{Bar98} by a
logarithmic factor of $n$.
\begin{corollary}
  If $\|\bx\|_\infty \leq b$ and $\|\bw\|_1 \leq a$, then $\forall \epsilon>0$,
  \[
  \log_2 \N_\infty(L,\epsilon,n) \leq  \frac{288 a^2 b^2 (2+\ln d)}{\epsilon^2}
  \log_2 [2 \lceil 8ab/\epsilon+2 \rceil n+1].
  \]
\end{corollary}


We now discuss the relationship among different covering number bounds obtained
in this section.
Theorem~\ref{thm:p-q-norm}  uses a reduction technique to generalize a result
of \citet{Bar98}.
The derivation employs Maurey's Lemma.
By observing that the effective dimension is no larger than $n$, it is
possible to remove the inherent logarithmic dependency on dimension $d$ 
for certain regularization conditions, as demonstrated in 
Corollary~\ref{cor:2-2-norm}.

Using online learning, this idea can be more systematically developed.
For example, Theorem~\ref{thm:pq_online} (note that $\N_2 \leq \N_\infty$)
employs the online mistake bound framework, which leads to a bound
with the $\log d$ dependency replaced by a $\log n$ dependency. 
This trade-off of $\log d$ and $\log n$ is very natural from the computational
point of view since Maurey's Lemma achieves an approximation by selecting 
columns (relevant features) of the data while an online algorithm achieves
an approximation by selecting rows (related to support vectors) of the data.

It follows that if $d \ll n$, then Theorem~\ref{thm:pq_online} gives a better
bound; and if $n \ll d$, then Theorem~\ref{thm:p-q-norm} gives a better bound.
However, if we use these covering number results in a PAC style generalization 
analysis, then a $\log n$ dependency on the sample size usually does not cause 
any substantial problem.


In the proof of Corollary~\ref{cor:2-2-norm} 
the effective dimension of the problem is reduced by a compactification of
part of the dimensions.
To a certain extent, all dimension independent covering number results
obtained in this section implicitly rely on the (weak) compactness of the 
effective parameter family in one way or another.  
Theorem~\ref{thm:infty_online} achieves this through the entropy 
regularization condition. 
If we regard $\mu_j$ as a prior
measure and $\bw$ as a posterior measure, then the entropy condition in
Theorem~\ref{thm:infty_online} corresponds to the maximum entropy
principle in density estimation, which can be regarded as entropy regularization.
Therefore the dimension independent 
covering number result justifies the maximum entropy method from the PAC 
learning point of view. 

If we let $p \to \infty$, then the covering number bound given in 
Theorem~\ref{thm:pq_online} diverges. 
It has been pointed out by \citet{GLS01} that this divergence is a consequence of
regularizing the weight parameter $\bw$ around the origin. 
As a comparison, Theorem~\ref{thm:infty_online} gives a finite covering number
for $p=\infty$ with entropy regularization. 
It is also possible to construct a regularization condition around a non-zero 
vector so that when $p \to \infty$, the bound in Theorem~\ref{thm:pq_online} 
approaches the limiting case of Theorem~\ref{thm:infty_online}.
Because of Theorem~\ref{thm:infty_online} and its relation to the 
well-established maximum entropy principle, it is reasonable to 
use entropy (instead of 1-norm) as the regularization condition 
for infinity-norm bounded data. For example, such a condition has recently
been employed by \citet{JaaMeiJeb00}.
Entropy regularization has also been implicitly employed in the Winnow family of 
multiplicative update algorithms \citep{Lit88}, and its continuous version of 
EG (and EGU) algorithms \citep{KW97}. In addition,
\citet{Zhang00-dualth} used explicit entropy regularization conditions 
to convert EG online algorithms into batch learning algorithms.

In addition to the Maurey's lemma approach used in this paper, 
2-norm covering number bounds can also be obtained by using an inequality 
from the theory of Gaussian processes, often referred to as Sudakov's 
minoration \citep[see][chapter 12]{LeTa91}.
This inequality bounds the 2-norm covering number of a function class
by the expectation of a Gaussian process indexed by the function class. The latter
can be estimated, which some time leads to quite tight bounds. 
However we shall not include results obtained by this approach in this paper. 
There are also other methods to obtain $p$-norm covering number bounds, for example,
by using the fat-shattering dimension of a function class \citep{Mend01-geom}.

We have shown in this section that infinity-norm covering number bounds can 
be derived from online mistake bounds. 
From the construction of $M(\bz)$ in the proof of
Lemma~\ref{lem:infty_mistake}, we see that weight $\bw$ and data $\bz$ 
are Legendre dual variables with respect to the regularization condition $g$
(as well as its convex dual function).
The representation of $\bz$ as a linear
combination of data $\bx_i$ leads to a dual representation of $\bw$.
This Legendre duality transforms the learning problem from the original 
$d$-dimensional space (where $\bw$ is represented by its components)
into the $n$-dimensional dual space (where $\bz$ is represented by a linear
combination of the data). This is why the logarithmic dimension factor 
$\log d$ in Maurey's Lemma (in the original space) can be replaced by 
$\log n$ in the dual (online learning) approach.
The basic idea of reducing the effective dimension of $\bw$ by using a linear 
combination in the sample space has also appeared in the derivation of 
``fat-shattering'' dimensions by \citet{Gurvits97}, although an entirely different
approach was employed there. However, the online learning approach used here
that utilizes convex duality is more general. For example, we are able to
derive covering number bounds for entropy regularization, which cannot be handled 
by previous techniques.
Furthermore, this convex duality can be easily generalized so that given any 
convex potential on $\bx$, we can obtain covering number bounds with
the corresponding dual regularization condition on $\bw$.
We would also like to mention that if formulation (\ref{eq:reg}) is convex, then 
we can study the dual representation of this formulation directly 
\citep{Zhang00-dualth}. Such a dual representation is closely related to
the online learning duality employed in this paper.


Finally, it is interesting to observe that the technique used in the proof
of Lemma~\ref{lem:infty_mistake} is closely related to the potential-reduction 
method for linear programming \citep[and references therein]{Todd97},
where a variant of $M(\bz)$ with a flipped sign for the second term can be 
used to show the polynomial convergence of certain interior point algorithms. 
Similar to the proof of Lemma~\ref{lem:infty_mistake}, the technique of bounding 
the number of steps is also based on a constant reduction of the potential
function at each step, which is achieved by choosing an appropriate $\eta$ from
the first and the second order terms in a Taylor expansion.
However, since Newton steps are often taken, the techniques required for 
bounding such terms are more complicated.

\section{Some Consequences of Covering Number Bounds}
\label{sec:example}

In this section, we illustrate some simple consequences of our covering number 
bounds on some specific learning formulations. 
There are a number of books that describe how to use
covering numbers to analyze learning algorithms
\citep{AnBar99,CriST00,Vap98}.
Together with our covering number results for regularized linear function classes,
machineries developed there can be used to study the generalization behavior
of linear learning formulations such as (\ref{eq:reg}).

For simplicity, we only include some direct consequences in this section.
As an example, we can easily obtain the following bound from
Theorem~\ref{thm:pq_online}, which improves a related result of
\citet{BarSha99}, and \citet{CriST00}.
\begin{theorem}
  If the data is 2-norm bounded as $\|\bx\|_2 \leq b$, then 
  consider the family $\Gamma$ of hyperplanes $\bw$ such that $\|\bw\|_2 \leq a$.
  Denote by $err(\bw)$ the misclassification error of $\bw$ with the true 
  distribution. Then there is a constant $C$ such that
  with probability $1-\eta$ over $n>1$ random samples, for all $\gamma>0$ and 
  $w \in \Gamma$ we have
  \[
  err(\bw) \leq \frac{k_\gamma}{n} + \sqrt{
    \frac{C}{n}
    \left(\frac{a^2 b^2}{\gamma^2} \ln(n) + 
    \ln \frac{1}{\eta}\right)},
  \]
  where $k_\gamma=|\{i:w^T x^i y^i <\gamma\}|$ is the number of 
  samples with margin less than $\gamma$.
  \label{thm:2norm_margin}
\end{theorem}
\begin{proof}
Using the covering number result in Theorem~\ref{thm:pq_online}. Let
$\gamma_i = a b /2^i$ and $p_i=1/2^i= \gamma_i /(a b)$ in 
Corollary~\ref{cor:gamma_uniform}. We have the bound:
\[
  err(\bw) \leq \frac{k_\gamma}{n} + O\left(\sqrt{
    \frac{1}{n}
    \left(\frac{a^2 b^2}{\gamma^2} \ln((\frac{a b}{\gamma}+1)n) + 
    \ln \frac{ab}{\gamma}+
    \ln \frac{1}{\eta}\right)}\right),
\]

Now using the fact that with an appropriate constant in the $O$ notation,
the above bound is
trivial when $\gamma > ab$ or $\gamma n < ab$, it is easy to see
that the above bound is equivalent to the claim of the theorem. 
\end{proof}

Note that the corresponding bound given by
\citet{CriST00} was their Theorem 4.19. 
A similar theorem was also stated by \citet{BarSha99}.
However in both cases, the $\ln n$ factor in our bound were replaced by 
$\ln^2 n$.
The underlying technique leading to such a result came originally from 
\citet{Bar98}, and fat-shattering dimension results 
by \citet{AnBar99,TBWA96}.
The reason why we can obtain a better bound in this paper
is that the $L_\infty$
covering number bound in Theorem~\ref{thm:pq_online} improves the
corresponding bounds used previously,  which were obtained through
``fat-shattering'' dimension estimates. 

As another example, we consider the following bound:
\begin{theorem}
  If the data is infinity-norm bounded as $\|\bx\|_\infty \leq b$, then 
  consider the family $\Gamma$ of hyperplanes $\bw$ 
  such that $\|\bw\|_1 \leq a$. Let $\mu$ be a fixed non-negative prior vector.
  Denote by $err(\bw)$ the misclassification error of $\bw$ with the true 
  distribution. Then there is a constant $C$ such that
  with probability $1-\eta$ over $n>1$ random samples, for all $\gamma$ and
  $\bw \in \Gamma$, we have
  \[
  err(\bw) \leq \frac{k_\gamma}{n} + \sqrt{
    \frac{C}{n}
    \left(\frac{b^2(a^2+a \entro_\mu(\bw))}{\gamma^2} \ln(n) + 
      \ln \frac{1}{\eta}\right)},
  \] 
  where $k_\gamma=|\{i:w^T x^i y^i <\gamma\}|$ is the number of 
  samples with margin less than $\gamma$.
  \label{thm:entropy_margin}
\end{theorem}
\begin{proof}
Consider the restriction of parameter $\bw$ in $\Gamma_i \subseteq \Gamma$,
where $\Gamma_1=\{\bw \in \Gamma: \entro_\mu(\bw) \leq a\}$ and
$\Gamma_j=\{\bw \in \Gamma: \entro_\mu(\bw) \in (2^{j-1} a, 2^j a] \}$ for $j>1$.
For each $j$, using Corollary~\ref{cor:infty_online} and
essentially the same proof as that of Theorem~\ref{thm:2norm_margin}, we obtain
the following bound for the parameter family $\{\bw \in \Gamma_j\}$:
\[
err(\bw) \leq \frac{k_\gamma}{n} + O\left(\sqrt{
  \frac{1}{n}
  \left(\frac{b^2(a^2+a \entro_\mu(\bw))}{\gamma^2} \ln(n) + 
    \ln \frac{1}{\eta}\right)}\right),
\] 
Now let $q_j=1/2^j$, which implies that $q_j= O(a/(a+\entro_\mu(\bw)))$.
Using Corollary~\ref{cor:para_uniform}, we obtain the following bound 
for the parameter family $\{\bw \in \Gamma\}$:
\[
err(\bw) \leq \frac{k_\gamma}{n} + O\left(\sqrt{
  \frac{1}{n}
  \left(\frac{a^2 b^2 c }{\gamma^2} \ln(n) + 
  \ln c+
  \ln \frac{1}{\eta}\right)}\right),
\]
where $c=(a+\entro_\mu(\bw))/a$. It is now easy to see that the above bound
leads to the theorem.
\end{proof}

The above theorem is similar to a recent result of \citet{LaSe01} which
was obtained by specialized techniques for PAC-Bayes analysis originally
developed by \citet{Mca99}.
If we let $\mu$ be the uniform prior, then the above bound easily leads to the 
following result of \citet{SFBL98}, which was used to explain the 
effectiveness of boosting.
\begin{theorem}[\citealt{SFBL98}]
   If the data has dimension $d>1$ and is infinity-norm bounded as 
   $\|\bx\|_\infty \leq 1$, then 
  consider the family $\Gamma$ of hyperplanes $\bw$ with non-negative weights
  such that $\|\bw\|_1 \leq 1$. 
  Denote by $err(\bw)$ the misclassification error of $\bw$ with the true 
  distribution. Then there is a constant $C$ 
  with probability $1-\eta$ over $n>1$ random samples, for all $\gamma$ and
  $\bw \in \Gamma$, we have
  \[
  err(\bw) \leq \frac{k_\gamma}{n} + \sqrt{
    \frac{C}{n}
    \left(\frac{\ln(d)\ln(n)}{\gamma^2} + \ln \frac{1}{\eta}\right)},
  \] 
  where $k_\gamma=|\{i:w^T x^i y^i <\gamma\}|$ is the number of 
  samples with margin less than $\gamma$.
  \label{thm:boosting_margin} 
\end{theorem}

It can be seen that our bound in Theorem~\ref{thm:entropy_margin} can be 
significantly
better if one can guess a good prior $\mu$ so that 
$\entro_\mu(\hat{\bw})$ is small for a parameter $\hat{\bw}$ which has a good
classification performance. If $\entro_\mu(\hat{\bw})$ is small, then 
unlike Theorem~\ref{thm:boosting_margin}, our bound
can be dimension independent.
This implies that it may be possible to vote an infinite number of
classifiers so that the generalization performance is still good. 


\section{Discussion}
\label{sec:discussion}
In this paper, we have studied some theoretical aspects of using the
regularization in linear learning formulations such as (\ref{eq:reg}).
We show that with appropriate regularization conditions, we can achieve
the same dimension independent generalization performance enjoyed by
support vector machines.

The separation concept introduced in Theorem~\ref{thm:inf-norm} 
implies that the ``margin'' idea developed for
linear classification can be naturally extended to general learning problems. 
Compared with Theorem~\ref{thm:pol84},
Theorem~\ref{thm:inf-norm} is more suitable for problems
with non-smooth loss functions since it does not directly employ 
the covering number of the overall loss function itself.
Note that in general, the covering number of a function class depends 
on certain smoothness conditions of the family.
Such smoothness requirement can lead to difficulties when we try to directly apply
Theorem~\ref{thm:pol84} to 
problems with non-smooth loss functions.

In Section~\ref{sec:linear}, we have obtained some
new covering number bounds for linear function
classes under certain regularization conditions.
These bounds have complemented and improved various previous results.
We compared two different approaches for deriving covering number bounds.
The first approach employs Maurey's lemma for sparsification. This approach has
also been used in many previous studies of covering numbers \citep{AnBar99}, 
and leads to bounds that have logarithmic dependencies on 
dimension. However, by observing that the effective dimension can be bounded
by the sample size (as in Corollary~\ref{cor:2-2-norm}), it is possible to
remove this dimensional dependency for certain regularization conditions. 
This observation naturally leads to a new approach of using online 
learning to derive covering number bounds, as outlined in Section~\ref{sec:linear}.
Compared with earlier methods that have relied on the concept of 
``fat-shattering'' dimension, our approach directly yields $\infty$-norm
covering number bounds that improve previous results by a $\log n$ factor.
Some specific consequences are discussed in Section~\ref{sec:example}.

Furthermore, the convex duality technique used in deriving online mistake bound
\citep[see][]{GLS01} is very general. It can be used to study general convex
regularization conditions. As an example, we are able to derive entropy
covering number bounds that are difficult to obtained using previous techniques.

Also we shall mention that related to these covering number results,
there have been some recent studies on randomized
algorithms that select posterior distributions 
under certain regularization conditions \citep{Mca99,Zhang99-colt}.
The generalization performance of these methods can be independent of the
underlying dimension.
Since these algorithms can be considered as special linear models,
the dimension independent covering number bounds in 
Section~\ref{sec:linear} give intuitive explanations for the
generalization ability of those algorithms within the traditional PAC analysis 
framework. 

\section*{Acknowledgement}
The author would like to thank Peter Bartlett and anonymous referees for 
pointing out related works,
and for constructive suggestions that helped to improve the paper.

\appendix

\section{Proof of Theorem~\ref{thm:inf-norm}}
\label{apx:inf-norm}

For simplicity, we assume all quantities appearing in the proof are measurable.
We follow the standard techniques \citep{Pol84,VC71}. 

Step 1 (symmetrization by a replicate sample). 
For all $n \epsilon^2 \geq 2$, and consider i.i.d. random sample $Y_1^n$,
independent of $X_1^n$, 
\begin{align*}
& P \left[\sup_{\alpha} [ E_{\bx} f_1(\cL(\alpha,\bx)) - 
E_{Y_1^n} f_2 (\cL(\alpha,\by))] > 
  \epsilon \right] \\
\leq & 2 P \left[ \sup_\alpha  [E_{X_1^n} f_1(\cL(\alpha,\bx))- 
E_{Y_1^n} f_2 (\cL(\alpha,\by))] >\epsilon/2\right].
\end{align*}
To see this, consider a function $\alpha^*$ such that
$\alpha^*(Y_1^n)$ is a parameter that satisfies
$E_{\bx} f_1(\cL(\alpha^*,\bx)) - E_{Y_1^n} f_2 (\cL(\alpha^*,\by))>\epsilon$
if such a parameter exists;
and let $\alpha^*(Y_1^n)$ be an arbitrary parameter if no such parameter
exists.
Note that for any $Y_1^n$, by the Chebyshev's inequality,
the conditional probability
\begin{align}
& P\left[ E_{\bx} f_1(\cL(\alpha^*,\bx)) -
E_{X_1^n} f_1(\cL(\alpha^*,\bx)) \leq \epsilon/2
|Y_1^n\right] \label{eq:cheby} \\
\geq & 1 - \frac{1}{n\epsilon^2/4}
E_{\bx} f_1(\cL(\alpha^*,\bx)) (1-E_{\bx} f_1(\cL(\alpha^*,\bx)))
\geq 1/2. \nonumber
\end{align}
We thus have
\begin{align*}
&\frac{1}{2}
P\left[\sup_{\alpha} [ E_{\bx} f_1(\cL(\alpha,\bx)) - 
E_{Y_1^n} f_2 (\cL(\alpha,\by))]>\epsilon\right] \\
=& 
\frac{1}{2} P\left[E_{\bx} f_1(\cL(\alpha^*,\bx)) - 
  E_{Y_1^n} f_2 (\cL(\alpha^*,\by)) >\epsilon\right] \\
\leq &
P\left[E_{\bx} f_1(\cL(\alpha^*,\bx)) - E_{Y_1^n} f_2 (\cL(\alpha^*,\by)) >\epsilon,
 E_{\bx} f_1(\cL(\alpha^*,\bx)) -E_{X_1^n} f_1(\cL(\alpha^*,\by)) 
 \leq \epsilon/2\right] \\
\leq & 
P\left[ E_{X_1^n} f_1(\cL(\alpha^*,\bx))- E_{Y_1^n} f_2 (\cL(\alpha^*,\by)) 
>\epsilon/2\right] \\
\leq & P\left[ \sup_\alpha  E_{X_1^n} f_1(\cL(\alpha,\bx))- 
E_{Y_1^n} f_2 (\cL(\alpha,\by)) >\epsilon/2\right].
\end{align*}
In the above derivation, the first inequality is a direct consequence of
(\ref{eq:cheby}). The second and the third inequalities follow from simple
algebra.

Step 2 (symmetrization by random signs). 
Consider i.i.d. sign variables $\sigma_1, \ldots, \sigma_n$, independent
of $X_1^n$ and $Y_1^n$, with $P(\sigma_i=-1)=P(\sigma_i=1)=1/2$.
Define 
\[
g_\sigma(\alpha,\bx)= (f_1(\cL(\alpha,\bx))-f_2 (\cL(\alpha,\bx)))/2 +
\sigma (f_1(\cL(\alpha,\bx))+f_2 (\cL(\alpha,\bx)))/2,
\]
and
\[
h_\sigma(\alpha,\by)= -(f_1(\cL(\alpha,\by))-f_2 (\cL(\alpha,\by)))/2
+\sigma (f_1(\cL(\alpha,\by))+f_2 (\cL(\alpha,\by)))/2.
\]
It is easy to check that 
\begin{align*}
& \sum_{i=1}^n \left[g_{\sigma_i}(\alpha,\bx_i) -h_{\sigma_i}(\alpha,\by_i)
\right] \\
=&
\sum_{\sigma_i=1} \left[f_1(\cL(\alpha,\bx_i)) - f_2 (\cL(\alpha,\by_i))\right] +
\sum_{\sigma_i=-1} \left[f_1(\cL(\alpha,\by_i)) - f_2 (\cL(\alpha,\bx_i))\right].
\end{align*}
This implies that the distribution of
\[
\sup_\alpha \sum_{i=1}^n \left[f_1(\cL(\alpha,\bx_i)) - 
  f_2 (\cL(\alpha,\by_i))\right]
\]
is the same as that of
\[
\sup_\alpha \sum_{i=1}^n \left[g_{\sigma_i}(\alpha,\bx_i) -
  h_{\sigma_i}(\alpha,\by_i)\right].
\]
Therefore
\begin{align*}
& P\left[ \sup_\alpha  E_{X_1^n} f_1(\cL(\alpha,\bx))- 
E_{Y_1^n} f_2 (\cL(\alpha,\by)) >\epsilon/2\right] \\
= & P\left[\sup_\alpha\frac{1}{n}
\sum_{i=1}^n (g_{\sigma_i}(\alpha,\bx_i) -
h_{\sigma_i}(\alpha,\by_i))>\epsilon/2\right] \\
\leq & 2 P\left[\sup_\alpha\frac{1}{n}\sum_{i=1}^n g_{\sigma_i}(\alpha,\bx_i)
>\epsilon/4\right].
\end{align*}

Step 3 (derandomizing data).
To estimate 
$P[\sup_\alpha \frac{1}{n}\sum_{i=1}^n g_{\sigma_i}(\alpha,\bx_i)>\epsilon/4]$,
we fix $X_1^n$ and estimate the conditional probability
\[
P\left[\sup_\alpha\frac{1}{n}
\sum_{i=1}^n g_{\sigma_i}(\alpha,\bx_i)>\epsilon/4 | X_1^n\right].
\]
Let $\{(\bz^j_{1}, \ldots \bz^j_{n}): j=1,\ldots,m\}$ be an infinity-norm
$\gamma$-covering of $\cL(\alpha,X_1^n)$, where $m=\N_\infty(\cL,\gamma,X_1^n)$.
By definition, for all $\alpha$, there exists $j$ such that
$|\bz^j_{i} - \cL(\alpha,\bx_i)|<\gamma$ for all $i$. 
Therefore 
$g_{1}(\alpha,\bx_i)=f_1(\cL(\alpha,\bx_i)) \leq f_3(\bz^j_{i})$ and
$g_{-1}(\alpha,\bx_i)=-f_2(\cL(\alpha,\bx_i)) \leq -f_3(\bz^j_{i})$;
that is, $g_{\sigma_i}(\alpha,\bx_i) \leq \sigma_i f_3(\bz^j_{i})$.
We thus obtain
\begin{align*}
& P\left[\sup_\alpha\frac{1}{n}
\sum_{i=1}^n g_{\sigma_i}(\alpha,\bx_i)>\epsilon/4 | X_1^n\right] \\
\leq &
P\left[\sup_j\frac{1}{n}\sum_{i=1}^n \sigma_i f_3(\bz^j_{i})>\epsilon/4 
| X_1^n\right] \\
\leq & \N_\infty(\cL,\gamma,X_1^n)
\sup_j P\left[\frac{1}{n}\sum_{i=1}^n \sigma_i f_3(\bz^j_{i})>\epsilon/4 
| X_1^n\right] \\
\leq &
\N_\infty(\cL,\gamma,X_1^n) e^{-n\epsilon^2/32}.
\end{align*}
The last inequality follows from the Hoeffding's inequality~\citep{Hoe63}.
This proves the theorem.


\bibliography{learning,pubc,pubj}


\end{document}

% LocalWords:  alg env tr erm sgn al Zhang VC Vapnik Kolmogorov Glivenko Maurey
% LocalWords:  Cantelli lder's SVD Frobenius pp Rao mis cf regularized watson
% LocalWords:  ibm com regularization hyperplane discretize Jensen's Pollard ab
% LocalWords:  Lipschitz tzhang LocalWords PAC discretization symmetrization Tx
% LocalWords:  derandomizing Hoeffding's emp Txy jy iid Chebyshev's Chernoff's
% LocalWords:  Maurey's compacification regularize EG sparsity entro Legendre
% LocalWords:  Bartlett's xy Courant eigenproblems Bregman Chernoff McAllester
% LocalWords:  Jacobi pt cov perceptron EGU boundness SVM discretizes RC qy th
% LocalWords:  hyperplanes sparsification linearize parametric combinatorial na
% LocalWords:  minimization indices cardinality parameterized priori dt ac USV
% LocalWords:  logarithmically orthonormal misclassification classifiers pubc
% LocalWords:  pubj compactification Sudakov's minoration Acknowledgement NY FS
% LocalWords:  CriST Vap Sau DGL SFBL AnBar GBSW Gurvits WSS Kol KT PonSch Hau
% LocalWords:  ABCH TBWA geom LBW GLS GW KW JaaMeiJeb dualth LeTa BarSha LaSe
% LocalWords:  Mca