\documentclass[twoside,11pt]{article}
\usepackage{jmlr2e,amsmath,amssymb,amscd,graphics,eucal}
%\usepackage{jmlr2e,amsmath,amssymb,amscd,graphics,eucal,rotfloat}

\jmlrheading{2}{2001}{$1$-$18$}{2/01}{10/01}{Shahar Mendelson}
\ShortHeadings{On the Size of Convex Hulls of Small Sets}
{Mendelson} \firstpageno{1}

\newcommand{\Ef}{\E_\mu \Ell_f}
\newcommand{\Pn}[1]{{\mathbb{P}}_{\mu_n}(\Ell_{#1})}
\newcommand{\En}[1]{\E_\mu_n{#1}}
\newcommand{\Emul}[1]{\E_\mu \Ell_{#1}}
\newcommand{\iger}[1]{\lceil {#1} \rceil}
\newcommand{\F}{\cal F \rm}
\newcommand{\vol}{\mathop{\rm vol}\!}
\newcommand{\adj}{\mathop{\rm adj}\,}
\newcommand{\rank}{\mathop{\rm rank}}
\newcommand{\TT}{\tilde T}
\newcommand{\rond}{\mathbin{\scriptstyle\circ}}
%\newcommand{\R}{{\rm I}\!{\rm R}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\norm}[1]{\left\Vert #1\right\Vert}
\newcommand{\abs}[1]{\left\vert #1\right\vert}
\newcommand{\inr}[1]{\bigl< #1 \bigr>}
\newcommand{\C}{{\bf C}}
%\newcommand{\N}{{\rm I}\!{\rm N}}
\newcommand{\N}{\mathbb{N}}
%\newcommand{\E}{{\rm I}\!{\rm E}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\T}{{\bf T}}
\newcommand{\eps}{\varepsilon}
\newcommand{\spec}{\mathop{\rm}}
\newcommand{\ord}{\mathop{\rm ord}}
\newcommand{\MM}{{\cal M}}
\newcommand{\conv}{\mathop{\rm conv}}
%\newcommand{\PP}{{\rm I}\!{\rm P}}
\newcommand{\PP}{\mathbb{P}}
\newcommand{\LL}{{\rm I}\!{\rm L}}
\newcommand{\BB}{{\rm I}\!{\rm B}}
\newcommand{\Eff}{F}
\newcommand{\om}{\omega}
\newcommand{\Ell}{\cal L}

% SPECIFIC COMMANDS
\newcommand{\normF}[1]{\left\Vert #1
\right\Vert_{\ell_\infty(F)}}
\newcommand{\normpol}[1]{\norm{#1}_{(F/\mu_n)^o}}
\newcommand{\sff}[1]{{\rm fat}_{#1}^{\frac{1}{2}}(F)}
\newcommand{\ff}[1]{{\rm fat}_{#1}(F)}
\newcommand{\base}{(n^{1/2}e_i)_{i \in I}}
\newcommand{\packt}[1]{\log{D\bigl(#1,F,L_2(\mu_n)\bigr)}}
\newcommand{\covert}[1]{\log{N\bigl(#1,F,L_2(\mu_n)\bigr)}}
\newcommand{\scovert}[1]{\log^{\frac{1}{2}}{N\bigl(#1,F,L_2(\mu_n)\bigr)}}
\newcommand{\pack}[1]{\log{D\bigl(#1,F,L_p(\mu_n)\bigr)}}
\newcommand{\cover}[1]{\log{N\bigl(#1,F,L_p(\mu_n)\bigr)}}
\newcommand{\scover}[1]{\log^{\frac{1}{2}}{N\bigl(#1,F,L_p(\mu_n)\bigr)}}
\newcommand{\bi}{B\bigl(L_\infty(\Omega)\bigr)}
\newcommand{\aconv}{{\rm absconv}}


\newcommand{\Gauss}{\norm{\sum_{i=1}^n
g_i\delta_{X_i}}_{\ell_\infty(F)}}
\newcommand{\Rad}{\norm{\sum_{i=1}^n
r_i\delta_{X_i}}_{\ell_\infty(F)}}

\newtheorem{Theorem}{Theorem}[section]
\newtheorem{Lemma}[Theorem]{Lemma}
\newtheorem{Sublemma}{Sublemma}
\newtheorem{Definition}[Theorem]{Definition}
\newtheorem{Problem}[Theorem]{Problem}
\newtheorem{Proposition}[Theorem]{Proposition}
\newtheorem{Corollary}[Theorem]{Corollary}
\newtheorem{Remark}[Theorem]{Remark}
\newtheorem{Example}[Theorem]{Example}
\newtheorem{Assumption}{Assumption}[section]
\newtheorem{Question}[Theorem]{Question}
\numberwithin{equation}{section} \font\titre=cmbx12

\begin{document}
\title{On the Size of Convex Hulls of Small Sets}

\author{\name Shahar Mendelson \email shahar@csl.anu.edu.au \\
\addr Computer Sciences Laboratory\\
 Research School of Information Sciences and Engineering\\
  The Australian National University\\
  Canberra 0200, Australia
  }

\editor{Dana Ron}

\maketitle

\begin{abstract}%
We investigate two different notions of ``size" which appear
naturally in Statistical Learning Theory. We present quantitative
estimates on the {\it fat-shattering dimension} and on the
covering numbers of convex hulls of sets of functions, given the
necessary data on the original sets. The proofs we present are
relatively simple since they do not require extensive background
in convex geometry.
\end{abstract}
\begin{keywords}
Convex hulls, fat-shattering dimension, covering numbers
\end{keywords}

\section{Introduction}
Convexity plays an important role in Machine Learning. Its
significance can be seen in both sides of Learning Theory.
Firstly, from the practitioner's point of view, minimizing
empirical risks is much easier when the class is convex.
Secondly, from the theoretical standpoint, the sample complexity
needed for agnostic learning is considerably smaller for convex
classes \citep{LBW,COLT01a}. On the other hand, if the original
class one is interested in happens to be non-convex, taking the
convex hull increases the size of the class and it is not clear
whether this is worth the effort, since the benefits of learning
from a convex class might be negligible compared to the price one
has to pay for using the much larger class. Thus, it is natural
to ask ``how large" can a convex hull of a given class be? We
shall present an answer with respect to two important parameters
which measure the size of a class: the covering numbers and the
fat-shattering dimension. The estimates on the covering numbers
we present are not new and have recently appeared in \cite{CKP}.
The main reason we chose to present an alternative proof is
because the one in \cite{CKP} uses very deep results in the local
theory of Banach spaces, hence it is less accessible to the non
expert reader. The proof we present here is self contained and
(almost) does not assume any prior knowledge.
\par
The upper bound on the fat-shattering dimension of the convex
hull of a class uses a notion originating from Banach spaces
theory called {\it type}. This is a property of some Banach
spaces which appears naturally when trying to compute the
fat-shattering dimension of linear functionals
\citep{Gurvits,Men98}. The path we take is as follows: we begin by
improving the known upper bounds on the fat-shattering dimension
of the functionals $\{x^*| \norm{x^*} \leq 1\}$ (defined below)
when considered as functions on the unit ball of the space. It
turns out that this linear fat-shattering dimension is determined
by the {\it type} of the Banach space. We use this fact to prove
the results concerning the fat-shattering dimension of a convex
hull of a general class. To that end, we embed both the class and
the shattered set into two finite dimensional Banach spaces, where
the dimension of the spaces is the size of the shattered set. The
unit ball of the first space is the symmetric convex hull of the
$n$-tuples $\bigl(f(\om_i)\bigr)_{i=1}^n$ where $f \in F$ and
$(\om_i)_{i=1}^n$ is the shattered set. The second Banach space
is the dual space to the first. It is possible to show that the
fat-shattering dimension of a given class may be controlled by
the fat-shattering dimension of the embedded class, only now, its
members are considered as linear functionals on the images of
$(\om_i)_{i=1}^n$. Thus, it is possible to bound the
fat-shattering dimension using ``linear" methods.
\par
The article is divided into two main sections. In section
\ref{entropy} we prove the covering numbers estimate. Section
\ref{typesec} is devoted to the investigation of the linear
fat-shattering dimension and the fat-shattering of convex hulls.
%\par
\subsection{Basic results, definitions and notation}
\par
\noindent Given a Banach space $X$, the {\it dual} of $X$, denoted
by $X^*$, consists of all the bounded linear functionals on $X$,
endowed with the norm
\begin{equation*}
\norm{x^*}_{X^*}=\sup_{\norm{x}_X=1}\abs{x^*(x)}.
\end{equation*}
We denote the unit ball of $X$ by $B(X)$ and the dual unit ball
by $B(X^*)$. If $1 \leq p< \infty$, let $\ell_p^n$ be $\R^n$
equipped with the norm $\norm{x}_p =\bigl(\sum_{i=1}^n
\abs{x_i}^p \bigr)^{1/p}$ and set $\ell_\infty^n$ to be $\R^n$
with respect to the sup norm.
\par
Given a set $A$, let $A^c$ be its complement, set $\abs{A}$ to be
its cardinality and denote its characteristic function by
$\chi_A$. Thus, $\chi_A(x)=1$ if $x \in A$ and $0$ otherwise. If
$A$ and $B$ are sets, let $A+B=\{a+b|a\in A, b \in B\}$.
\par
For any probability measure $\mu$ on a measurable space
$(\Omega,\Sigma)$, let $\E_\mu$ denote the expectation with
respect to $\mu$. $L_p(\mu)$ is the set of functions which satisfy
$\E_\mu \abs{f}^p < \infty$ and set $\norm{f}_{L_p(\mu)}=(\E_\mu
\abs{f}^p)^{1/p}$. Given $I \subset \Omega$, $L_\infty(I)$ is the
space of bounded functions on $I$, with respect to the norm
$\norm{f}_\infty =\sup_{\omega \in I} \abs{f(\omega)}$. For every
$\omega \in \Omega$ let $\delta_\omega$ be the point evaluation
functional, that is, for every function $f$ on $\Omega$,
$\delta_\omega(f)=f(\omega)$. We denote by $\mu_n$ an empirical
measure supported on a set of $n$ points, hence,
$\mu_n=\frac{1}{n}\sum_{i=1}^n \delta_{\omega_i}$. If $\abs{I}=n$
and $\mu_n$ is the empirical measure supported on $I$, we denote
$L_\infty(I)$ by $L_\infty(\mu_n)$.
\par
Throughout this paper, all absolute constants are denoted by $C$
or $c$. Their values may change from line to line or even within
the same line.
\par
The following are the definitions of well known combinatorial
parameters which are often used in Learning Theory.
\begin{Definition} \label{VCdef}
Let $\Eff$ be a class of $\{0,1\}$-valued functions on a space
$\Omega$. We say that $\Eff$ shatters $\{\omega_1,...,\omega_n\}
\subset \Omega$, if for every $I \subset \{1,...,n \}$ there is a
function $f_I \in \Eff$ for which $f_I(\omega_i) =1$ if $i \in I$
and $f_I(\omega_i)=0$ if $ i \not \in I$. Let
\begin{equation*}
VC(\Eff,\Omega)=\sup \Bigl\{\abs{A} \Big| A \subset \Omega, \ A \
{\rm is \ shattered \ by} \ \Eff \Bigr\}.
\end{equation*}
$VC(\Eff,\Omega)$ is called the VC dimension of $\Eff$.
\end{Definition}
There is a parametric version of the VC dimension, called the
fat-shattering dimension:
%
\begin{Definition}
For every $\eps>0$, a set $A=\{ \omega_1,...,\omega_n \} \subset
\Omega$ is said to be $\eps$--shattered by $F$ if there is some
function $s:A \to \R$, such that for every $I \subset
\{1,...,n\}$ there is some $f_I \in F$ for which $f_I(\omega_i)
\geq s(\omega_i)+\eps$ if $i \in I$, and $f_I(\omega_i) \leq
s(\omega_i)-\eps$ if $i \not \in I$. Let
\begin{equation*}
{\rm fat}_\eps(\Eff,\Omega)=\sup \Bigl\{\abs{A} \Big| A \subset
\Omega, \ A \ {\rm is} \ \eps{\rm -shattered \  by} \ \Eff
\Bigr\}.
\end{equation*}
The set $\bigl(s_i \bigr)=\bigl(s(\om_i)\bigr)$ is called a
witness to the shattering and for every $I \subset \{1,...,n\}$
we call $f_I$ the shattering function of the set $I$. In cases
where the set $\Omega$ is clear, we will denote the fat-shattering
dimension by ${\rm fat}_\eps(F)$.
\end{Definition}
\par
If $(X,d)$ is a metric space and if $\Eff \subset X$, let
$N(\eps,\Eff, d)$ be the minimal number of open balls with radius
$\eps>0$ (with respect to the metric $d$) needed to cover $\Eff$.
The numbers $N(\eps,\Eff,d)$ are called the covering numbers of
$\Eff$. A set $A \subset X$ is said to be an $\eps$-cover of
$\Eff$ if the union of open balls $\bigcup_{a \in A} B(a,\eps)$
contains $\Eff$. In cases where the subset ${\Eff}$ is obvious,
we denote the covering numbers by $N(\eps,d)$. In cases where the
metric is clear we denote the covering numbers by
$N(\eps,{\Eff})$. The logarithm of the covering numbers of a set
is sometimes called the {\it entropy} of the set.
\par
A set is called $\eps$-separated if the distance between any two
elements of the set is larger than $\eps$. Set $D(\eps,\Eff)$ to
be the maximal cardinality of an $\eps$-separated set in $\Eff$.
It is easy to see that $N(\eps,\Eff) \leq D(\eps,\Eff) \leq
N(\eps/2,\Eff)$.
\par
%
%
The following result, which is due to \citet*{ABCH}, enables one
to estimate the $L_\infty(\mu_n)$ covering numbers of classes in
terms of the fat-shattering dimension.
\begin{Theorem} \label{ABCH}
Let $\Eff$ be a class of functions from $\Omega$ into $[0,1]$ and
set $d=\ff{\eps/4}$. Then, for every empirical measure $\mu_n$ on
$\Omega$,
\begin{equation*}
D{\bigl(\eps, F, L_\infty(\mu_n) \bigr)} \leq
2\Bigl(\frac{4n}{\eps^2}\Bigr)^{d\log{\bigl(en/(d\eps)\bigr)}}.
\end{equation*}
\end{Theorem}
%\par
\subsection{The main results}
We end this introduction with a summary of the main results
presented in the sequel which are relevant in the context of
Machine Learning.
\par
In the next section, we present a general estimate on the $L_2$
covering numbers of convex hulls of a class, given the covering
numbers of the class. As a corollary of that result we obtain the
following:
\begin{Theorem}
Let $F$ be a class of functions on $\Omega$.
\begin{description}
\item{1.}
There is an absolute constant $C$ such that for every VC class of
functions $F$, every probability measure $\mu$ on $\Omega$ and
every $\eps>0$,
\begin{equation*}
\log N \bigl(\eps,{\rm conv}(F),L_2(\mu)\bigr) \leq
Cd\Bigl(\frac{1}{\eps}\Bigr)^{\frac{2d}{d+1}},
\end{equation*}
where ${\rm VC}(F)=d$.
\item{2.} If $F$ maps $\Omega$ into $[0,1]$ and ${\rm fat}_\eps(F)
\leq \gamma\eps^{-p}$ for some $p<2$ and $\gamma>0$, then for
every $p<p'<2$ there is a constant $C=C(p,p',\gamma)$, such that
for every probability measure on $\Omega$ and every $\eps>0$,
\begin{equation*}
\log N \bigl(\eps,{\rm conv}(F),L_2(\mu)\bigr) \leq
\frac{C}{\eps^2}\log^{1-\frac{2}{p'}} \frac{1}{\eps}.
\end{equation*}
\end{description}
\end{Theorem}
\par
In the final section we investigate the fat-shattering dimension
of convex hulls. In particular, we prove the following:
\begin{Theorem}
There is an absolute constant $C$, such that for every class $F$
which consists of functions which map $\Omega$ into $[0,1]$ and
every $\eps>0$,
\begin{equation*}
{\rm fat}_\eps \bigl({\rm conv}(F)\bigr) \leq C\frac{{\rm
fat}_{\frac{\eps}{4}}(F)}{\eps^2}\log^2 \Bigl(\frac{2{\rm
fat}_{\frac{\eps}{4}}(F)}{\eps}\Bigr).
\end{equation*}
\end{Theorem}


\section{Entropy of convex hulls of classes} \label{entropy}
In this section we provide estimates on the entropy of convex
hulls of classes when considered as subsets of $L_2$ spaces. We
divide our discussion into two parts. First, we deal with classes
of $\{0,1\}$-valued functions which have a finite VC dimension.
Then, we investigate classes of functions with a uniformly bounded
range for which the fat-shattering dimension is polynomial in
${\eps}^{-1}$.
\par
The path we take is rather general. We estimate the covering
numbers of a convex hull of a set, given that the covering
numbers of the set itself are polynomial in ${\eps}^{-1}$. We
combine this general result with well known bounds on the
covering numbers of classes using the fat-shattering dimension,
thus obtaining the desired entropy estimate.
\par

\subsection{General Estimates}
We shall investigate two generic cases. The first is when
$N\bigl(\eps,\Eff,L_2(\mu)\bigr)=O(\eps^{-p})$ for some $p>0$,
and the second, (which is more difficult), when $\log
N\bigl(\eps,\Eff,L_2(\mu)\bigr)=O(\eps^{-p})$ for $0 < p <2$. The
first case was investigated by \citet{Dud87}. He showed that for
every $\delta>0$, the log-covering numbers of the convex hull of
$\Eff$ are polynomial in ${\eps}^{-1}$ with exponent $\delta+
2p(p+2)^{-1}$ . This result was improved independently by
\citet{VW} and by \citet{Carl97} who removed the superfluous
$\delta$. Those results indicate that if
$N\bigl(\eps,\Eff,L_2(\mu)\bigr) \leq \gamma\eps^{-p}$ for some
$\gamma>0$ and $p>0$, and if $K$ is the symmetric convex hull of
$\Eff$, then the entropy integral $\int_0^1 \log^{1/2}
N\bigl(\eps,K,L_2(\mu)\bigr)d\eps$ converges. This fact is very
significant, since this integral measures in some sense how
``large" the class is. For example, classes with bounded entropy
integrals satisfy the {\it uniform central limit theorem}
\citep[see][]{Dudley99}.
\par
On the other hand, from the quantitative point of view, there were
no estimates on the constant $C=C(\gamma,p)$ for which
\begin{equation*}
\log N\bigl(\eps,K,L_2(\mu)\bigr) \leq
C\Bigl(\frac{1}{\eps}\Bigr)^{ \frac{2p}{p+2}}.
\end{equation*}
Note that from the Machine Learning point of view, the constant
is significant, since for VC classes the exponent $p$ is half the
VC dimension of the class. Hence, it is only natural to try and
find the way in which the constant depends on $p$. Another natural
question which Dudley's assertion raises is whether similar
results may be obtained even when the covering numbers are
considerably larger. For example, does the entropy integral of
the symmetric convex hull of $\Eff$ converge if $\log
N\bigl(\eps,\Eff,L_2(\mu)\bigr)=O(\eps^{-p})$ for some $0<p<2$.
We present partial answers to both questions.
\begin{Theorem} \label{coverthm}
Let $\mu$ be a probability measure on $\Omega$. Assume that $\Eff$
is a subset of the unit ball of $L_2(\mu)$ and set $K$ to be the
symmetric convex hull of $\Eff$.
\begin{description}
\item{{\rm 1.}}
If there are $\gamma,p>0$ such that
$N\bigl(\eps,\Eff,L_2(\mu)\bigr) \leq \gamma \eps^{-p}$ for every
$\eps>0$, then there is an absolute constant $C$ such that for
every $\eps>0$,
\begin{equation*}
\log N\bigl(\eps,K,L_2(\mu)\bigr) \leq C\gamma^{\frac{2}{p}}p
\Bigl(\frac{1}{\eps}\Bigr)^{\frac{2p}{2+p}}.
\end{equation*}
\item{{\rm 2.}}
If there are $\gamma>0$ and $0<p<2$ such that $\log
N\bigl(\eps,\Eff,L_2(\mu)\bigr) \leq \gamma \eps^{-p}$ for every
$\eps>0$, then there is some constant $C(p,\gamma)$ (which depends
only on $p$ and $\gamma$), such that
\begin{equation*}
\log N\bigl(\eps,K,L_2(\mu)\bigr) \leq C(p,\gamma)
\frac{1}{\eps^2}\log^{1-\frac{2}{p}}\frac{1}{\eps}.
\end{equation*}
In particular, the entropy integral of $K$ converges for $p<2/3$.
\end{description}
\end{Theorem}
An estimate on the growth rate of the entropy numbers recently
appeared in \cite{CKP}, using very deep results in the local
theory of Banach spaces. The significance of the proof we present
here is the fact that it does not use the powerful machinery of
convex geometry, hence, it is more accessible.
\par
We present a complete proof of the second claim. The proof of the
first assertion follows the same path and some of the details are
omitted. Both proofs are based on an idea which was used in
\citet[pg.~142]{VW}, and in fact, the first claim follows from a
careful analysis of the proof in \cite{VW}.
\par
%
We shall require three preliminary results. The first result is
due to B. Maurey, appeared in \citep{Maurey}, and is interesting
by itself.
\begin{Lemma} \label{maurey}
Let $\Eff \subset L_2(\mu)$ be a set of $n$ functions and denote
its diameter by ${\rm diam}(\Eff)$. Then, for every $\eps>0$,
\begin{equation*}
N\bigl(\eps {\rm diam}(\Eff), {\rm conv}(\Eff), L_2(\mu)\bigr)
\leq (e+en\eps^2)^{\frac{2}{\eps^2}}.
\end{equation*}
\end{Lemma}
\par
\noindent Let $(\delta_i)_{i=1}^\infty$ be a positive sequence
decreasing to $0$ and set $\Eff_i$ to be an increasing family of
sets, such that for every $i$, $\Eff_i$ is a $\delta_i$-separated
set in $\Eff$ which is maximal with respect to inclusion (that is,
each $\Eff_i$ is $\delta_i$-separated and if $\Eff_i \subset A
\subset \Eff$ then $A$ is not $\delta_i$-separated). Note that in
our case, each one of the sets $\Eff_i$ is finite.
\par
For every $i>j$ and every $x \in \Eff_i$, let $P_jx$ be a member
of $\Eff_j$ which is nearest to $x$ and put $G_{ij}=\{ x-P_jx \ |
\ x \in \Eff_i \}$.
\par
\begin{Lemma} \label{conv}
For every $i>j \geq 1$ and every $\eps,\eps'>0$,
\begin{equation*}
\conv (\Eff_i) \subset \conv (\Eff_j) + \conv (G_{ij} \cup \{0\}),
\end{equation*}
and
\begin{equation*}
N(\eps,\conv(\Eff_i)) \leq \Bigl(1+\frac{4\eps'}{\eps} \Bigr)
^{\abs{\Eff_j}}N\bigl(\eps',\conv(\Eff_j)\bigr)
\Bigl(e+e\abs{\Eff_i} \frac{\eps^2}{16\delta_j^2}
\Bigr)^{32\delta_j^2/\eps^2}.
\end{equation*}
\end{Lemma}
\par
\noindent {\bf Proof:} Clearly, the cardinality $\abs{G_{ij} \cup
\{0\}} \leq \abs{\Eff_i}-\abs{\Eff_j}+1$ and, since $\Eff_j$ is a
maximal $\delta_j$-separated set in $\Eff$, then for every $x \in
\Eff$, $\norm{x-P_jx} \leq \delta_j$.
\par
Set $\Eff_i=\{x_1,...,x_{\abs{\Eff_i}}\}$ and $\Eff_j
=\{x_1,...,x_{\abs{\Eff_j}} \} \subset \Eff_i$. If $z \in
\conv(\Eff_i)$, then there are $\lambda_i \geq 0$ such that
$\sum_{i=1}^{\abs{\Eff_i}} \lambda_k=1$ and
\begin{equation*}
z=\sum_{k=1}^{\abs{\Eff_i}} \lambda_k x_k =
\sum_{k=1}^{\abs{\Eff_j}} \lambda_kx_k
+\sum_{k=\abs{\Eff_j}+1}^{\abs{\Eff_i}} \lambda_k P_jx_k+
\sum_{k=\abs{\Eff_j}+1}^{\abs{\Eff_i}}
\lambda_k(x_j-P_jx_k)=z_1+z_2
\end{equation*}
where $z_1 \in \conv(\Eff_j)$ and $z_2 \in \conv(G_{ij} \cup
\{0\})$. Hence,
\begin{equation*}
\conv(\Eff_i) \subset \conv(\Eff_j)+\conv(G_{ij} \cup \{0\})
\end{equation*}
and the first assertion is verified.
\par
It is routine to see that if $A,B,C \subset L_2(\mu)$ are such
that $A \subset B+C$, then for every $\eps_1,\eps_2>0$,
\begin{equation*}
N(\eps_1+\eps_2,A) \leq N(\eps_1,B) \cdot N(\eps_2,C).
\end{equation*}
Thus, by the first claim it follows that
\begin{equation} \label{mul}
N(\eps,\conv(\Eff_i)) \leq
N\bigl(\frac{\eps}{2},\conv(\Eff_j)\bigr) \cdot
N\bigl(\frac{\eps}{2},\conv(G_{ij} \cup \{0\})\bigr).
\end{equation}
To estimate the first term, note that ${\rm span}(\Eff_j)$ can be
isometrically embedded in $\ell_2^{\abs{\Eff_j}}$. Therefore, the
covering numbers of $\Eff_j$ in $L_2(\mu)$ and in
$\ell_2^{\abs{\Eff_j}}$ are the same. Let ${\cal B}$ be the unit
ball in $\ell_2^{\abs{\Eff_j}}$. Using a standard volume estimate
\citep[see][]{Pisier89}, one can show that for every $\eps,\eps'$,
\begin{equation*}
N(\frac{\eps}{2},\eps'{\cal B}) \leq
\Bigl(1+\frac{4\eps'}{\eps}\Bigr)^{\abs{\Eff_j}}.
\end{equation*}
Hence,
\begin{align*}
N\bigl(\frac{\eps}{2},\conv(\Eff_j)\bigr) & \leq
N\bigl(\frac{\eps}{2},\eps'{\cal B}\bigl) \cdot
N\bigl(\eps',\conv(\Eff_j)\bigr)
\\
& \leq \Bigl(1+\frac{4\eps'}{\eps} \Bigr)^{\abs{\Eff_j}} \cdot
N\bigl(\eps',\conv(\Eff_j)\bigr).
\end{align*}
As for the second term in (\ref{mul}), since ${\rm diam}(G_{ij}
\cup \{0\}) \leq 2\delta_j$ and $\abs{G_{ij} \cup \{0\}} \leq
\abs{\Eff_i}$, then by Lemma \ref{maurey}
\begin{align*}
N\Bigl(\frac{\eps}{2},\conv \bigl(G_{ij} \cup \{0\} \bigr) \Bigr)
& = N\Bigl(\frac{\eps}{4\delta_j}2\delta_j,\conv \bigl( G_{ij}
\cup \{0\} \bigr) \Bigr)
\\
& \leq \Bigl(e+e\abs{\Eff_i} \frac{\eps^2}{16\delta_j^2}\Bigr)
^{32\delta_j^2/\eps^2}.
\end{align*}
\par
\rightline{$\blacksquare$}
\par
Lemma \ref{conv} reveals our strategy which is similar in nature
to chaining. Here, we create an increasing family of sets which
form an increasingly finer approximation of $F$. It is possible to
estimate the covering numbers of the convex hulls of the ``finer"
sets using an estimate on the ``coarser" sets. The rates by which
the mesh of the classes $F_i$ decreases will be dictated by the
growth rates of the covering numbers in the class $F$. In the next
two technical results we select an appropriate rate of decay for
the ``mesh" sequence $(\delta_n)$.
\par
\begin{Lemma} \label{sequences1}
Let $\Eff$ be a subset of the unit ball in $L_2(\mu)$ and assume
that there are constants $\gamma,p>0$, such that for every
$\eps>0$,
\begin{equation*}
N\bigl(\eps,\Eff,L_2(\mu)\bigr) \leq \gamma \Bigl(\frac{1}{\eps}
\Bigr)^{p}.
\end{equation*}
Let $\delta_n=\gamma^{1/p}n^{-1/p}$ and set $\Eff_n$ to be as in
Lemma \ref{conv}. Then, there are bounded sequences $(A_k)$ and
$(B_k)$ such that for every $n$ and $k$,
\begin{equation}
\log N \bigl(A_k n^{\frac{1}{2}}\delta_n,{\rm conv}
(\Eff_{nk^{3p}})\bigr) \leq B_kn.
\end{equation}
Moreover, there are absolute constants $A$ and $B$ such that for
every $k$, $A_k \leq \gamma^{1/p}A$ and $B_k \leq Bp$.
\end{Lemma}
\begin{Lemma} \label{sequences2}
Let $\Eff$ be a subset of the unit ball of $L_2(\mu)$ and assume
that there are $\gamma>0$ and $0<p<2$ such that for every
$\eps>0$,
\begin{equation*}
\log N\bigl(\eps,\Eff,L_2(\mu) \bigr) \leq \gamma\eps^{-p}.
\end{equation*}
Set $\delta_n=2\gamma^{1/p}\log^{-1/p}n$,
$\eps_n=n^{-1}\log^{1/p}n$ and let $\Eff_n$ be as in Lemma
\ref{conv}. Then, there are sequences $(A_k)$ and $(B_k)$ which
depend on $p$, such that for all integers $n \geq 2$ and $k \geq
1$
\begin{equation*}
\log N \bigl(A_k\eps_n, \conv (\Eff_{[n^{k^\alpha}]}) \bigr) \leq
B_k \frac{n^2}{\log^{\frac{4}{p}-1}n},
\end{equation*}
where $\alpha=4p/(2-p)$ and $[x]$ denotes the integer value of
$x$. Moreover, $\sup A_k \leq A_p'$ and $\sup B_k \leq B_p'$ for
some constants $A_p'$ and $B_p'$ which depend only on $p$.
\end{Lemma}
\par
We present a complete proof of Lemma \ref{sequences2}. The proof
of Lemma \ref{sequences1} follows from a similar argument. The
idea behind the proof of Lemma \ref{sequences1} is due to
\citet{VW}. The quantitative estimate on the constants does not
appear in that text, but may be derived by a close analysis of
the proof the authors present.
\par
\vskip 0.4cm
\noindent {\bf Proof of Lemma \ref{sequences2}:} We
use a nested induction argument. First, we prove our claim for
$k=1$ using induction on $n$. We then prove the claim for a
general $k$ for every fixed $n$.
\par
Recall that for every integer $n$, $\Eff_n$ is maximal $\delta_n$
separated in $\Eff$. Thus
\begin{equation*}
\abs{\Eff_n} \leq N(\frac{\delta_n}{2},\Eff) \leq
e^{\gamma(2/\delta_n)^p}=n.
\end{equation*}
\par
Let $n_0 \geq 4$ be an integer such that for every $n \geq n_0$,
\begin{equation*}
\frac{ \iger{ \frac{n}{2} }^2}{\log^{\frac{4}{p}-1}\iger{
\frac{n}{2} }} \leq \frac{3}{4}\frac{n^2}{\log^{\frac{4}{p}-1}n}.
\end{equation*}
Let $k=1$ and $2 \leq n \leq 2n_0$ and set
$A_1=2n_0\log^{-1/p}2n_0$. Since for such a value of $n$,
$A_1\eps_n \geq 1$, only a single ball is required to cover
$\Eff$ and our claim follows.
\par
Next, let $n > 2n_0$ and assume that for every $2 \leq m <n$,
\begin{equation*}
\log N(A_1\eps_m,\Eff_{m}) \leq B_1 m^2 \log^{1-4/p}m,
\end{equation*}
where $B_1$ is to be specified later. We can apply Lemma
\ref{maurey} with $i=n$, $j=\iger{n/2}$, $\eps=A_1\eps_n$ and
$\eps'=A_1\eps_{\iger{n/2}}$. It follows that
\begin{align} \label{eq1}
& N  \bigl( A_1\eps, \conv \Eff_{n} \bigr)
\\
\nonumber & \leq \Bigl(1+\frac{4\eps_{\iger{\frac{n}{2}}}}
{\eps_n}\Bigr)^{\iger{\frac{n}{2}}} \cdot N \bigl(
A_1\eps_{\iger{\frac{n}{2}}}, \conv \Eff_{\iger{\frac{n}{2}}}
\bigr) \cdot \bigl(e+e\abs{\Eff_n}
\frac{\eps^2}{16\delta_j^2}\bigr) ^{32\delta_j^2/\eps^2}.
\end{align}
A straightforward calculation shows that there is an absolute
constant $C$ such that for every integer $n$,
\begin{equation*}
\frac{\eps_{\iger{n/2}}}{\eps_n} \leq C, \qquad
\frac{\delta_j}{\eps_n} \leq C\gamma^{\frac{1}{p}}
\frac{n}{\log^{\frac{2}{p}}n}.
\end{equation*}
Applying the induction hypothesis and (\ref{eq1}) there is a
constant $C=C(p)$ such that
\begin{align*}
\log{N  \bigl( A_1\eps_n, \conv \Eff_{n} \bigr)} \leq Cn+ B_1
\frac{ \iger{ \frac{n}{2} }^2} { \log^{\frac{4}{p}-1} \iger{
\frac{n}{2} } }+ C\frac{ \gamma^{\frac{2}{p}}
n^2}{\log^{\frac{4}{p}}n}.
\end{align*}
By the selection of $n_0$,
\begin{equation*}
B_1 \iger{ \frac{n}{2} }^2 \log^{1-\frac{4}{p}} \iger{
\frac{n}{2} } \leq \frac{3}{4} B_1 n^2\log^{1-\frac{4}{p}}n,
\end{equation*}
and thus, there is a constant $C(p,\gamma)$ such that if
$B_1=C(p,\gamma)$ then
\begin{equation*}
N(A_1\eps_n,\conv \Eff_{n}) \leq B_1n^2 \log^{1-\frac{4}{p}}n
\end{equation*}
as claimed.
\par
Now, we fix some $n \geq 2$ and use induction with respect to $k$.
Let $i=[n^{k^\alpha}]$ and $j=[n^{(k-1)^\alpha}]$. Note that
$\abs{G_{ij} \cup \{0\}} \leq [n^{k^{\alpha}}]$ and that
\begin{equation*}
{\rm diam}(G_{ij} \cup \{0\}) \leq 2\delta_j=
4\gamma^{1/p}\log^{-1/p}[n^{(k-1)^\alpha}].
\end{equation*}
By Lemma \ref{maurey},
\begin{align*}
N\Bigl(\frac{\eps_n}{k^2},\conv(G_{ij}\cup \{0\}) \Bigr) & \leq
N\Bigl(\frac{\eps_n}{2\delta_j k^2}{\rm diam}(G_{ij}\cup \{0\}),
\conv(G_{ij}\cup \{0\}) \Bigr) \leq \\
& \leq \Bigl(e+e\abs{G_{ij}\cup
\{0\}}\frac{\eps_n^2}{4k^2\delta_j^2}
\Bigr)^{8k^2\delta_j^2/\eps_n^2}.
\end{align*}
It is straightforward to see that there is some constant $C=C(p)$
such that
\begin{align*}
\frac{k^2\delta_j^2}{\eps_n^2} & =
\frac{\gamma^{\frac{2}{p}}k^2}{\log^{\frac{2}{p}}[n^{(k-1)^\alpha}]}
\cdot \frac{n^2}{\log^{\frac{2}{p}}n} \leq
C\frac{\gamma^{\frac{2}{p}}k^2}{(k-1)^{\frac{2\alpha}{p}}} \cdot
\frac{n^2}{\log^{\frac{4}{p}}n}.
\end{align*}
Also,
\begin{align*}
\frac{\abs{G_{ij} \cup \{0\}}\eps_n^2}{k^2\delta_j^2} &  \leq
C\frac{n^{k^\alpha}}{k^{2-\frac{2\alpha}{p}}} \cdot
\frac{\log^{\frac{4}{p}}n}{n^2} \leq
C\frac{n^{k^\alpha}}{k^{2-\frac{2\alpha}{p}}}.
\end{align*}
Using the above estimates and the definition of $\alpha$ it
follows that
\begin{align*}
& \log{ N  \Bigl(\frac{\eps_n}{k^2},\conv(G_{ij}\cup \{0\})
\Bigr)}
\\
\leq & C(p,\gamma) \frac{n^2}{\log^{\frac{4}{p}-1}{n}} \cdot
\frac{k^{2+\alpha}\log{k}}{(k-1)^{\frac{2\alpha}{p}}}
=C(p,\gamma)\frac{n^2\log{k}}{k^2\log^{\frac{4}{p}-1} n}.
\end{align*}
On the other hand, by the induction hypothesis,
\begin{equation*}
\log{\Bigl(A_{k-1}\eps_n, \conv \Eff_{[n^{(k-1)^\alpha}]} \Bigr)}
\leq B_{k-1}\frac{n^2}{\log^{\frac{4}{p}-1}n}.
\end{equation*}
Applying Lemma \ref{conv} and combining the two covers, we obtain
an $A_k\eps_n=(A_{k-1}+1/k^2)\eps_n$ cover of $\conv
\Eff_{[n^{k^\alpha}]}$. Hence,
\begin{equation*}
\log{\bigl(A_{k}\eps_n, \conv \Eff_{[n^{k^\alpha}]} \bigr)} \leq
\Bigl(B_{k-1}+C(p,\gamma)\frac{\log{k}}{k^2}\Bigr)
\frac{n^2}{\log^{\frac{4}{p}-1} n}.
\end{equation*}
And our claim follows. It is important to note that the sequences
$(A_k)$ and $(B_k)$ are bounded by some constant $C=C(p,\gamma)$.
\par
\rightline{$\blacksquare$}
\par
\noindent {\bf Proof of Theorem \ref{coverthm}:} We begin with
the proof of the first part of our theorem. Fix some integer $n
\geq 2$ and let $\eps_n=\gamma^{1/p}n^{-1/2-1/p}$. By Lemma
\ref{sequences1} it follows that for every integer $k$,
\begin{equation*}
\log{N\bigl(A_k\eps_n, \conv (\Eff_{nk^{3p}})\bigr)} \leq B_kn
\leq Bpn = Bp\Bigl(\frac{1}{\eps_n} \Bigr)^{\frac{2p}{2+p}}.
\end{equation*}
Since for every $k$, $A \geq A_k$, then
\begin{equation*}
\log{N\bigl(A\eps_n,\conv (\Eff_{nk^{3p}})\bigr)} \leq
\log{N\bigl(A_k\eps_n, \conv (\Eff_{nk^{3p}})\bigr)} \leq
Bp\Bigl(\frac{1}{\eps_n} \Bigr)^{\frac{2p}{2+p}}.
\end{equation*}
Taking $k$ to infinity,
\begin{equation*}
\log{N\bigl(A\eps_n,\conv (\Eff)\bigr)} \leq
Bp\Bigl(\frac{1}{\eps_n} \Bigr)^{\frac{2p}{2+p}}.
\end{equation*}
The claim for a general $\eps$ follows since $\eps_n/\eps_{n+1}$
is a bounded sequence.
\par
Turning to the second assertion, according to Lemma
\ref{sequences2} and by the same argument as above for every $n
\geq 2$,
\begin{equation*}
\log{N\bigl(A_p^{'}\eps_n,\conv (\Eff)\bigr)} \leq
B_p^{'}\frac{n^2}{\log^{\frac{4}{p}-1}n}.
\end{equation*}
Since $\eps_n=n^{-1}\log^{1/p}n$ then
$n=\eps_n^{-1}\log^{\frac{1}{p}}n$ and $n \geq
\frac{C}{\eps_n}\log^{\frac{1}{p}} \frac{1}{\eps_n}$. Hence,
\begin{equation*}
\log{N\bigl(A_p^{'}\eps_n,\conv (\Eff)\bigr)} \leq
B_p^{'}\frac{1}{\eps_n^2}\log^{1-\frac{2}{p}}\frac{1}{\eps_n}.
\end{equation*}
Again, the claim follows since $\eps_n/\eps_{n+1}$ is a bounded
sequence.
\par
\rightline{$\blacksquare$}
\subsection{Applications}
We shall present several examples which are interesting from the
Machine Learning perspective.
\par
One of the basic results regarding VC classes is that the
covering numbers of such classes in any $L_2(\mu)$ are polynomial
in $1/\eps$ \citep{Haussler95,VW}:
\begin{Theorem} \label{Haussler}
Let $\Eff$ be a class of $\{0,1\}$-valued functions such that
$VC(\Eff)=d$. Then, there is an absolute constant $C$ such that
for every probability measure $\mu$ on $\Omega$,
$N(\eps,\Eff,L_2(\mu)) \leq Cd(4e)^d \eps^{-2d}$.
\end{Theorem}
\begin{Corollary} \label{e_kVC}
Assume that $\Eff$ is a $\{0,1\}$-class such that $VC(\Eff)=d$.
By Theorem \ref{coverthm} and Theorem \ref{Haussler} for $p=2d$
and $\gamma=Cd(4e)^d$, and since $\gamma^{2/p}$ is bounded by some
absolute constant, it follows that
\begin{equation*}
\log{N \bigl(\eps,{\rm conv}(F),L_2(\mu)\bigr)} \leq Cd
\Bigl(\frac{1}{\eps}\Bigr)^{\frac{2d}{1+d}},
\end{equation*}
where $C$ is an absolute constant.
\end{Corollary}
\par
Next, we establish a similar estimate in the case where the
fat-shattering dimension satisfies $\ff{\eps}=O(\eps^{-p})$ for
some $0<p<2$. It is possible to connect the fat-shattering
dimension and the $L_2(\mu)$ covering numbers \citep{COLT01b}.
\begin{Theorem} \label{thm:generalcover}
Let $F$ be a class of functions into $[0,1]$. Then, there is an
absolute constant $C$ such that for every probability measure
$\mu$,
\begin{equation*}
\log{N\bigl(\eps,F,L_2(\mu)\bigr)} \leq C\ff{
\frac{\eps}{32}}\log^2{\Bigl(\frac{2\ff{\frac
{\eps}{32}}}{\eps}\Bigr)}.
\end{equation*}
\end{Theorem}
%
%
\begin{Corollary} \label{N(F,mu)}
Let $\Eff$ be a class of functions from $\Omega$ into $[0,1]$ and
assume that there are $\gamma>0$ and $0<p<2$ such that $\ff{\eps}
\leq \gamma\eps^{-p}$. Then, for every $p'>p$ there is some
constant $C=C(p,p',\gamma)$ such that for every probability
measure $\mu$,
\begin{equation*}
\log N\bigl(\eps,{\rm conv}(F),L_2(\mu)\bigr) \leq
C\frac{1}{\eps^2}\log^{1-\frac{2}{p'}} \frac{1}{\eps}.
\end{equation*}
\end{Corollary}
%
%
%
%
%
\section{Type and the fat-shattering dimension} \label{typesec}
In this section, we tackle the problem of estimating the
fat-shattering dimension of convex hulls of classes. To that end,
we first deal with ``linear" fat-shattering dimension. By this we
mean the fat-shattering dimension of a set of linear functionals,
for example, the unit ball of the dual space of some Banach space,
when considered to be a class of functions on the unit ball of
the space. As it turns out, given a Banach space $X$, the
fat-shattering dimension of the set $B(X^*)$ when considered as
functions on $B(X)$ is completely determined by a property of $X$
called {\it type} (defined below).
\par
The results we present aid our goal since one can apply an
embedding argument to show that, in some sense, the
fat-shattering dimension of a given class may be controlled by the
fat-shattering dimension of a class of linear functionals.
\par
In order to bound the fat-shattering dimension of convex hulls we
take the following course of action: first, we apply the
embedding argument, which reduces the problem to a ``linear" one.
Then, we show that the symmetric convex hull of the embedded class
can be approximated by a class which has ``almost" the same
fat-shattering dimension, but also a well behaved type structure.
This is done by using an estimate on the covering numbers of the
class with respect to the $L_\infty$ norm. The fat-shattering of
the approximating class may be bounded via the bound on ${\rm
fat}_\eps \bigl(B(X^*),B(X)\bigr)$ mentioned above.
\par
Before we continue, we require additional definitions, originating
in the theory of Banach spaces. For the basic definitions we
refer the reader to \citep{Pisier89} or \citep{TJ}.
\par
Let $K$ be a bounded, convex symmetric subset of $\R^n$ which has
a nonempty interior. One can define a norm on $\R^n$ whose unit
ball is $K$. This is done using the Minkowski functional on $K$,
which is denoted by $\norm{ \ }_K$, and given by
\begin{equation*}
\norm{x}_K=\inf \{t>0 | t^{-1}x \in K \}.
\end{equation*}
It is possible to show that if $K \subset \ell_2^n$ is convex and
symmetric with a nonempty interior then $\norm{ \ }_K$ is indeed
a norm and $K$ is its unit ball. Set $\norm{ \ }_{K^*}$ to be the
dual norm to $\norm{ \ }_K$.
\begin{Definition} \label{polar}
If $F$ is a bounded subset of $\ell_2^n$, let
\begin{equation*}
F^{\rm o}=\{x \in \ell_2^n | \sup_{f \in F} \abs{\inr{f,x}} \leq
1 \},
\end{equation*}
where $\inr{-,-}$ is the inner product in $\ell_2^n$. The set
$F^{\rm o}$ is called the polar of $F$.
\end{Definition}
For any set $F$, let ${\rm absconv}(F)$ be its symmetric convex
hull. Formally,
\begin{equation*}
\aconv(F)=\Bigl\{\sum_{i=1}^n a_if_i | n \in \N, f_i \in F,
\sum_{i=1}^n \abs{a_i}=1 \Bigr\}.
\end{equation*}
\par
It is easy to see that $F^o=\bigl({\rm absconv}(F)\bigr)^o$ and
that if $G \subset F$ then $F^o \subset G^o$. Note that $F^o$ is
the unit ball of the norm $\norm{ \ }_{K^*}$, where $K={\rm
absconv}(F)$. Hence, for every $x \in \ell_2^n$,
\begin{equation*}
\norm{x}_{F^o}=\sup_{y \in {\rm absconv(F)}} \inr{y,x}=\sup_{y
\in F} \abs{\inr{y,x}}.
\end{equation*}
In particular, if $F=\{f_1,...,f_m\}$ then
$\norm{x}_{F^o}=\max_{1 \leq i \leq m} \abs{\inr{f_i,x}}$.
\par
Given a class $F$ and an empirical measure $\mu_n$ supported on
$\{\om_1,...,\om_n\}$, we endow $\R^n$ with the Euclidean
structure of $L_2(\mu_n)$, which is isometric to $\ell_2^n$.
Therefore, for every $f \in L_2(\mu_n)$,
\begin{equation*}
\norm{f}_{L_2(\mu_n)}=\Bigl(\frac{1}{n}\sum_{i=1}^n
f^2(\om_i)\Bigr)^{\frac{1}{2}}.
\end{equation*}
Let $F/\mu_n$ be the image of $F$ in $L_2(\mu_n)$ under the
inclusion operator, that is,
\begin{equation*}
F/\mu_n= \Bigl\{\sum_{i=1}^n f(\om_i)\chi_{\{\om_i\}} | f \in F
\Bigr\}.
\end{equation*}
Since $(n^{1/2}\chi_{\{\om_i\}})_{i=1}^n$ is an orthonormal basis
of $L_2(\mu_n)$, then
\begin{equation*}
F/\mu_n=\Bigl\{n^{-\frac{1}{2}}\sum_{i=1}^n f(\om_i)e_i | f \in
F\Bigr\}.
\end{equation*}
\par
Throughout this section, given an empirical measure $\mu_n$, we
denote by $(e_i)_{i=1}^n$ the orthonormal basis of $L_2(\mu_n)$
given by $(n^{1/2}\chi_{\{\om_i\}})_{i=1}^n$.
\par
The next lemma is straightforward and its proof is omitted.
\begin{Lemma} \label{shattering}
Let $S=\{\om_1,...,\om_n\}$ be a sample and let $\mu_n$ be the
empirical measure supported on $S$.
\begin{description}
\item{1.} If $S$ is shattered by $\Eff$ then the
set $\{\sqrt{n}e_1,...,\sqrt{n}e_n\} \subset (\Eff/\mu_n)^o$ is
shattered by $\Eff/\mu_n$.
\item{2.} If $S$ is $\eps$-shattered by $\Eff$ then
$\{\sqrt{n}e_1,...,\sqrt{n}e_n\} \subset (\Eff/\mu_n)^o$ is
$\eps$-shattered by $\Eff/\mu_n$.
\end{description}
\end{Lemma}
%
%
%
\par
\subsection{The fat-shattering dimension of linear functionals}
Recall the definition of the Rademacher type of a Banach space:
\begin{Definition}
A Banach space $X$ has Rademacher type $p$ if there is some $C$
such that for every integer $n$ and every $x_1,...,x_n \in X$,
\begin{equation} \label{radtype}
\E \norm{ \sum_{i=1}^n \eps_i x_i} \leq C \Bigl( \sum_{i=1}^n
\norm{x_i}^p \Bigr)^{1/p},
\end{equation}
where $(\eps_i)$ are independent Rademacher random variables
(i.e., symmetric $\{-1,1\}$-valued). The best constant for which
(\ref{radtype}) holds is called the Rademacher $p$-type constant
of $X$ and is denoted by $T_p(X)$.
\end{Definition}
\begin{Theorem} \label{lvc}
Let $X$ be a Banach space which has type $p$ for some $1 < p \leq
2$ with a type constant $T_p(X)$. Then
\begin{equation*}
{\rm fat}_\eps \bigl(B(X^*),B(X)\bigr) \leq
\Bigl(\frac{T_p(X)}{\eps}\Bigr)^{\frac{p}{p-1}}.
\end{equation*}
\end{Theorem}
Note that a similar result to this was demonstrated by
\cite{Gurvits}, though in his result the bound is on the level
fat-shattering dimension (that is, the witness $(s_i)_{i=1}^n$ to
the shattering is a constant set: there is some $a$ such that
$s_i=a$ for every $1 \leq i \leq n$).
\par
\vskip 0.4cm
\noindent {\bf Proof:} Assume that the set
$\{x_1,...,x_n\} \subset B(X)$ is $\eps$-shattered  by $B(X^*)$
and set $\{s_1,...,s_n\}$ to be a witness to the shattering. Let
$I \subset \{1,...,n\}$ and put $x^*_I$ to be the functional
shattering the set $I$. Note that if $i \in I$ then
\begin{equation*}
x^*_I(x_i)-x^*_{I^c}(x_i) \geq s_i+\eps-(s_i-\eps)=2\eps,
\end{equation*}
and if $i \in I^c$,
\begin{equation*}
x^*_{I^c}(x_i)-x^*_{I}(x_i) \geq s_i+\eps-(s_i-\eps)=2\eps.
\end{equation*}
Thus,
\begin{align*}
& \norm{ \Bigl(\sum_{i \in I} x_i-\sum_{i \in I^c} x_i \Bigr)}
=\sup_{x^* \in B(X^*)} \abs{x^*\Bigl(\sum_{i \in I} x_i-\sum_{i
\in I^c} x_i \Bigr)} \\
& \geq \frac{1}{2} \sup_{x^*,\tilde{x}^* \in B(X)}
\abs{x^*\Bigl(\sum_{i \in I} x_i-\sum_{i \in I^c} x_i
\Bigr)-\tilde{x}^*\Bigl(\sum_{i \in I} x_i-\sum_{i \in I^c} x_i
\Bigr)}=(*)
\end{align*}
Selecting $x^*=x^*_I$ and $\tilde{x}^*=x^*_{I^c}$,
\begin{align*}
(*) & \geq \frac{1}{2} \abs{x^*_I\Bigl(\sum_{i \in I} x_i-\sum_{i
\in I^c} x_i \Bigr)-x^*_{I^c}\Bigl(\sum_{i \in I} x_i-\sum_{i \in
I^c} x_i \Bigr)} \\
& =\frac{1}{2} \abs{\sum_{i \in I} \bigl(x^*_I(x_i)-x^*_{I^c}(x_i)
\bigr) + \sum_{i \in I^c}
\bigl(x^*_{I^c}(x_i)-x^*_{I}(x_i) \bigr)} \\
& \geq \frac{1}{2} \bigl(2\eps\abs{I}+2\eps \abs{I^c} \bigr) =
\eps n.
\end{align*}
%
%
%
Now, we can use the type property to establish an upper bound for
an appropriate subset $I$ which will be selected randomly. Indeed,
since $X$ has type $p$ and since $\norm{x_i} \leq 1$ then
\begin{equation*}
\E\norm{\sum_{i=1}^n \eps_ix_i} \leq T_p(X)\bigl(\sum_{i=1}^n
\norm{x_i}^p)^{\frac1p} \leq T_p(X)n^{\frac1p}.
\end{equation*}
Thus, there is a realization of the random variables $(\eps_i)$
for which this inequality holds. Let $I=\{i | \eps_i=1\}$. Then,
\begin{equation*}
\norm{\sum_{i \in I} x_i - \sum_{i \in I^c} x_i} \leq
T_p(X)n^{\frac1p}.
\end{equation*}
Thus, $n\eps \leq T_p(X)n^{1/p}$ and our claim follows.
\par
\rightline{$\blacksquare$}
\par
\begin{Remark}
It is possible to show that this bound is tight
\citep[see][]{Men98}. Indeed, one can show that if $p^*=\sup\{p|
{\rm X \ has \ type} \ p \}$ then for every $\eps>0$,
\begin{equation*}
\Bigl(\frac{1}{\eps}\Bigr)^{\frac{p^*}{p^*-1}} -1 \leq {\rm
fat}_\eps\bigl(B(X^*),B(X)\bigr).
\end{equation*}
This implies that if $X$ is a Hilbert space then for every
$\eps>0$,
\begin{equation*}
\Bigl(\frac{1}{\eps^2}\Bigr)-1 \leq{\rm
fat}_\eps\bigl(B(X^*),B(X)\bigr)\leq \Bigl(\frac{1}{\eps^2}\Bigr).
\end{equation*}
\end{Remark}
%
\subsection{Covering numbers and type constants}
Here, we show that if the class $F$ is well behaved, (either a VC
class or a class with a finite fat-shattering dimension for every
$\eps$), then the Rademacher type-2 constant of $(F/\mu_n)^o$
does not grow too rapidly as a function of $n$.
\par
In order to bound the type-2 constant of $(F/\mu_n)^o$, we use a
combination of two facts. The first, which is the only non
elementary fact we require, is an estimate on
$T_2(\ell_\infty^n)$. It is possible to show \citep{TJ} that there
are absolute constants $c$ and $C$ such that for every integer
$n$,
\begin{equation} \label{ell_inftyT2}
c(1+\log{n})^{1/2} \leq T_2(\ell_\infty^n) \leq
C(1+\log{n})^{1/2}.
\end{equation}
The second fact we use is that if the cardinality of $F$ is small
then $F^o$ can be isometrically embedded into $\ell_\infty^n$ for
a relatively small $n$.
\begin{Lemma} \label{iso}
Let $F \subset \ell_2^n$ be a finite set. Then $F^o$ can be
isometrically embedded into $\ell_\infty^{\abs{F}}$.
\end{Lemma}
\noindent {\bf Proof:} Let $F=\{f_1,...,f_m\}$ and define
$T:(\R^n,\norm{ \ }_{F^o}) \to \ell_\infty^{\abs{F}}$ by
$Tx^*=\bigl(x^*(f_i)\bigr)_{i=1}^m$. Then, for every $x^* \in
\R^n$
\begin{equation*}
\norm{Tx^*}_{\ell^{\abs{F}}_\infty}=\sup_{1 \leq i \leq m}
\abs{x^*(f_i)}=\norm{x^*}_{F^o},
\end{equation*}
implying that $T$ is an isometry.
\par
\rightline{$\blacksquare$}
\par
This fact is very useful from our point of view, since $F/\mu_n$
are relatively small sets. In the real valued case, the
$L_\infty(\mu_n)$ covering numbers of $F$ may be bounded using
$\ff{\eps}$ (Theorem \ref{ABCH}), whereas for VC classes, one may
use the following version of Sauer's Lemma \citep{VW}:
\begin{Lemma} \label{Sauer}
There is an absolute constant $C$ such that if $F$ is a class of
$\{0,1\}$-valued functions on $\Omega$ with $VC(F)=d$, then for
every empirical measure $\mu_n$, $\abs{F/\mu_n} \leq Cn^d$.
\end{Lemma}
Now, we can bound $T_2\bigl((F/\mu_n)^o\bigr)$ for VC classes. In
fact, we prove the following:
\begin{Theorem} \label{VCandT2}
Let $\Eff$ be a class of $\{0,1\}$-valued functions. Then, $\Eff$
is a VC class if and only if there is some constant $C>0$ such
that for every integer $n$ and every empirical measure $\mu_n$,
$T_2\bigl((F/\mu_n)^o\bigr) \leq C(1+\log{n})^{1/2}$.
\end{Theorem}
\noindent{\bf Proof:} Note that if $VC(\Eff)=d$ then by Sauer's
Lemma there is an absolute constant $C$ such that
$\abs{\Eff/\mu_n}\leq Cn^d$. Hence, $(F/\mu_n)^o$ can be
isometrically embedded in $\ell_\infty^{Cn^d}$. It follows that
$T_2(F^o) \leq C(1+d\log{n})^{1/2} \leq Cd^{1/2}(1+\log n)^{1/2}$
as claimed.
\par
Conversely, assume that there is a constant $C$ such that for
every integer $n$ and every empirical measure $\mu_n$,
$T_2((F/\mu_n)^o) \leq C(1+\log{n})^{1/2}$. Let
$S=\{\om_1,...,\om_n\}$ be shattered by $\Eff$ and set $\mu_n$ to
be the empirical measure supported on $S$. By the first part of
Lemma \ref{shattering} and Theorem \ref{lvc},
\begin{equation*}
n \leq {\rm fat}_{\frac{1}{2}}((F/\mu_n)^o,F/\mu_n) \leq
4T_2^2\bigl((F/\mu_n)^o \bigr) \leq C(1+\log{n}),
\end{equation*}
implying that $n$ can not be arbitrarily large, and thus
$VC(\Eff,\Omega)<\infty$.
\par
\rightline{$\blacksquare$}
\par
\begin{Remark}
The proof of theorem \ref{VCandT2} implies that if $\Eff$ is a
$\{0,1\}$-valued class such that $\sup_{\mu_n} T_2\bigl(
(F/\mu_n)^o \bigr)=o(n)$ then $\Eff$ is a VC class.
\end{Remark}
Note that the upper bound on $T_2\bigl((F/\mu_n)^o \bigr)$ can
not be improved. Indeed, let $\Eff$ be the set of characteristic
functions of intervals $[-1,a]$ for $a \in (-1,1]$. Thus,
$VC(\Eff)=1$ and for every non degenerate empirical measure
$\mu_n$ of $[-1,1]$,
\begin{equation*}
\Eff/\mu_n=\Bigl\{(1,0,...,0),(1,1,0,...,0),...,(1,1,...,1)\Bigr\}.
\end{equation*}
Therefore, $(F/\mu_n)^o$ is isometric to $\ell_\infty^n$ and
$T_2\bigl((F/\mu_n)^o\bigr) \geq C(1+\log{n})^{1/2}$.
\par
Also, Theorem \ref{VCandT2} does not apply to classes which are
not $\{0,1\}$-valued classes. For example, let $\Omega=B(\ell_p)$
for some $1<p<2$ and $\Eff=B(\ell_q)$ where $p^{-1}+q^{-1}=1$.
For every $i$ denote by $e_i$ the $i$-th unit vector in $\ell_p$,
and given an integer $n$, let $\mu_n$ be the empirical measure
supported on $\{e_1,...,e_n\} \subset B(\ell_p)$. Since
$\Eff/\mu_n=n^{-1/2}B(\ell_q^n)$ then $(F/\mu_n)^o$ is isometric
to $\ell_p^n$ and $T_2 \bigl((F/\mu_n)^o \bigr) \geq
n^{\frac{1}{p}-\frac{1}{2}}$ \citep[see][]{TJ}. Hence, it is
impossible to obtain an analogous result to Theorem \ref{VCandT2}
in the general case. We bypass this obstacle by replacing the set
$F/\mu_n$ (which may be infinite) by an $L_\infty(\mu_n)$ cover of
$F$.
\par
The following lemma indicates that an $\eps$-cover of a set in
$L_\infty(\mu_n)$ has essentially the same fat-shattering
dimension as the original set.
\begin{Lemma} \label{shattering2}
Let $\Eff$ be a class of functions into $[0,1]$ which
$\delta$-shatters $A=\{\om_1,...,\om_n\}$, and let $\mu_n$ be the
empirical measure on $A$. Assume that $\delta>\eps$ and that $H$
is an $\eps$-cover of $\Eff/\mu_n$ in $L_\infty(\mu_n)$. Then,
$A'=\{\sqrt{n}e_1,...,\sqrt{n}e_n\} \subset (F/\mu_n)^o$ is
$(\delta-\eps)$-shattered by $H$, where the elements of $H$ are
viewed as linear functionals on $(F/\mu_n)^o$.
\end{Lemma}
\noindent{\bf Proof:} By Lemma \ref{shattering}, $A'$ is
$\delta$-shattered by $\Eff/\mu_n$ and set $(s_i)$ to be a
witness to the shattering. Fix $I \subset \{1,...,n\}$ and let
$f_I \in \Eff/\mu_n$ be the $\delta$-shattering functional on
$I$. Put $h_I \in H$ such that $\norm{f_I-h_I}_{L_\infty(\mu_n)}
< \eps$. Since $\norm{\sqrt{n}e_i}_{L_1(\mu_n)}=1$ then for every
$i \in I$
\begin{align*}
\inr{h_I,\sqrt{n}e_i} & =\inr{f_I,\sqrt{n}e_i}+\inr{h_I-f_I,\sqrt{n}e_i} \\
& \geq s_i+\delta-
\norm{h_I-f_I}_{L_\infty(\mu_n)}\norm{\sqrt{n}e_i}_{L_1(\mu_n)} \\
& \geq s_i+\delta-\eps,
\end{align*}
and in a similar fashion, if $j \in I^c$ then
%\begin{equation*}
$\inr{h_I,\sqrt{n}e_i} \leq s_i-\delta+\eps$,
%\end{equation*}
and our claim follows.
\par
\rightline{$\blacksquare$}
\par
\subsection{The fat-shattering dimension of convex hulls}
We begin by formulating the main result, which enables one to
estimate the fat-shattering dimension of the convex hull of a
class $\Eff$ using the $L_\infty(\mu_n)$ covering numbers of
$\Eff$.
\par
We introduce the following notation. Given a class $\Eff$, an
integer $n$ and $\eps>0$, let
\begin{equation*}
N_\infty(n,\eps)= \sup_{\mu_n}
N\bigl(\eps,\Eff,L_\infty(\mu_n)\bigr).
\end{equation*}
Thus, $N_\infty(n,\eps)$ is the supremum of the $\eps$-covering
numbers of $\Eff$ in empirical $L_\infty$ spaces, where the
empirical measure is supported on at most $n$ elements of
$\Omega$.
%
%
%
\begin{Theorem} \label{fatandcover}
Let $\Eff$ be a class of functions whose range is a subset of
$[0,1]$ and set $K$ to be its symmetric convex hull. Then, for
every $\eps,\tau>0$,
\begin{equation*}
{\rm fat}_{\eps+\tau}\bigl(K,\Omega\bigr) \leq \max \Bigl\{ n| n
\leq  C\bigl(1+\log N_\infty(n,2\eps)\bigr) \frac{1}{\tau^2}
\Bigr\},
\end{equation*}
where $C$ is an absolute constant.
\end{Theorem}
\noindent{\bf Proof:} Fix $\eps, \tau>0$, assume that
$A=\{\om_1,...,\om_n\}$ is $(\eps+\tau)$-shattered by $K={\rm
absconv}(F)$ and let $\mu_{n}$ be an empirical measure supported
on $A$. Let $H \subset \Eff/\mu_{n}$ be an $\eps$-cover to
$\Eff/\mu_{n}$ in $L_\infty(\mu_{n})$ and set $G$ to be the
symmetric convex hull of $H$. Clearly, $\abs{H} \leq
N\bigl(2\eps,F/\mu_n,L_\infty(\mu_n)\bigr)$.
\par
Since $H$ is an $\eps$-cover of $\Eff/\mu_{n}$ in $L_\infty
(\mu_{n})$ then $G$ is an $\eps$-cover to $K$ in
$L_\infty(\mu_{n})$. By Lemma \ref{shattering2}, if the set $A$ is
$\eps+\tau$ shattered by $K$ then the set
$A'=\{\sqrt{n}e_1,...,\sqrt{n}e_{n}\}$ is $\tau$-shattered by
$G$, when members of $G$ are viewed as linear functionals on
$(F/\mu_n)^o$. Also, note that $A' \subset (F/\mu_n)^o$. Thus,
\begin{equation*}
n \leq {\rm fat}_\tau\bigl(G,(\sqrt{n}e_i)_{i=1}^n\bigr) \leq {\rm
fat}_\tau \bigl(G,(F/\mu_n)^o \bigr).
\end{equation*}
\par
Since $H \subset \Eff/\mu_{n}$, then by taking convex hulls $G
\subset K$. Using the properties of the polar, $(F/\mu_n)^o=K^o
\subset G^o$, thus ${\rm fat}_\tau \bigl(G,(F/\mu_n)^o\bigr)\leq
{\rm fat}_\tau (G,G^o)$. By Lemma \ref{iso} there is an absolute
constant $C$ such that
\begin{equation*}
T_2(G^o) \leq C(1+\log \abs{H})^{\frac{1}{2}} \leq C\Bigl(1+\log N
\bigl(2\eps,F/\mu_n,L_\infty(\mu_n)\bigr)\Bigr)^{\frac{1}{2}}.
\end{equation*}
Hence, by Theorem \ref{lvc},
\begin{equation*}
{\rm fat}_\tau (G,G^o) \leq C\Bigl(1+\log N
\bigl(\eps,F/\mu_n,L_\infty(\mu_n)\bigr)\Bigr) \frac{1}{\tau^2}.
\end{equation*}
Therefore,
\begin{equation*}
n \leq C\Bigl(1+\log N
\bigl(2\eps,F/\mu_n,L_\infty(\mu_n)\bigr)\Bigr) \frac{1}{\tau^2}.
\end{equation*}
\par
\rightline{$\blacksquare$}
\par
%
%
%
%
%
Applying the known bounds on the $L_\infty(\mu_n)$ covering
numbers in terms of the VC or the fat-shattering dimension we can
establish the following estimate on the fat-shattering dimension
of convex hulls.
\begin{Corollary} \label{6.10}
There is an absolute constant $C$ such that for every
$\{0,1\}$-class of functions and every $\eps,\tau>0$,
\begin{equation*}
{\rm fat}_{\eps+\tau}(K,\Omega) \leq C
\frac{d}{\tau^2}\log{\frac{2d}{\tau}},
\end{equation*}
where $K$ is the symmetric convex hull of $\Eff$ and $VC(\Eff)=d$.
\par
\noindent In particular, there is an absolute constant $C$ such
that for every $\tau>0$,
\begin{equation*}
{\rm fat}_\tau(K,\Omega) \leq C\frac{d}{\tau^2}\log
\frac{2d}{\tau}.
\end{equation*}
\end{Corollary}
\par
\noindent {\bf Proof:} By Sauer's Lemma, there is an absolute
constant $C$ such that for every integer $n$,
$\log{N_\infty(n,\eps)} \leq Cd\log{n}$. Fix $\eps,\tau>0$ and
let ${\rm fat}_{\eps+\tau}(K,\Omega)=n$. By Theorem
\ref{fatandcover}, $n \leq Cd\tau^{-2}(1+\log{n})$ and our claim
follows.
\par
\rightline{$\blacksquare$}
\par
The best known estimate on the fat-shattering dimension of the
convex hull of a VC class is ${\rm fat}_\eps\bigl({\rm
conv}(F)\bigr)=O(d/\eps^2)$. This bound is due to \cite{Gurvits}
and was obtained using a similar geometric approach to the one
presented here. The one key difference between the two approaches
is that Gurvits uses a different notion of type. His result is
based on an estimate on the type-2 constant of a certain
operator, which is due to \citet[Theorem 14.15]{LeTa}. It turns
out that their bound depends on the fact that the class $F$ has a
converging entropy integral, which is the case for VC classes.
\par
This approach is not suitable for arbitrary classes of functions,
when one does not have a-priori ``global" data (for example, the
fat-shattering dimension or covering numbers at every scale), but
rather, the fat-shattering dimension at a given scale. It is
possible to extend Corollary \ref{6.10} and estimate the
fat-shattering dimension of the convex hull of a class using the
fat-shattering dimension of the class itself, without resorting
to ``global" data.
\begin{Theorem} \label{7}
There is an absolute constant $C$ such that for any class $\Eff$
of functions into $[0,1]$ and every $\eps,\tau \in (0,1)$,
\begin{equation*}
{\rm fat}_{\eps+\tau}(K,\Omega) \leq C \frac{ {\rm fat}
_{\frac{\eps}{2}}(\Eff,\Omega)} {\tau^2} \log^2{\frac{ 2{\rm fat}
_{\frac{\eps}{2}}(\Eff,\Omega)}{\tau^2\eps}}
\end{equation*}
where $K$ is the symmetric convex hull of $\Eff$.
\par
\noindent In particular, there is an absolute constant $C$ such
that for every $\tau>0$,
\begin{equation*}
{\rm fat}_{\tau}(K,\Omega) \leq C \frac{ {\rm fat}
_{\frac{\tau}{4}}(\Eff,\Omega)} {\tau^2} \log^2{\frac{ 2{\rm fat}
_{\frac{\tau}{4}}(\Eff,\Omega)}{\tau}}.
\end{equation*}

\end{Theorem}
\noindent {\bf Proof:} Set $\eps, \delta \in (0,1)$ and let
$n={\rm fat}_{2\eps+\tau}(K,\Omega)$. By Theorem \ref{ABCH},
\begin{equation*}
\log{N_\infty(n,2\eps)} \leq C {\rm
fat}_{\frac{\eps}{2}}(\Eff,\Omega) \log^2{\frac{n}{\eps}}.
\end{equation*}
Applying Theorem \ref{fatandcover} and since $n>1$ and $\eps \in
(0,1)$, there is an absolute constant $C$ such that
\begin{equation*}
n \leq C\frac{{\rm fat}_{\frac{\eps}{2}}(\Eff,\Omega)}{\tau^2}
\log^2{\frac{n}{\eps}}.
\end{equation*}
Hence,
\begin{equation*}
n \leq C \frac{{\rm fat}_{\frac{\eps}{2}}(\Eff,\Omega)}{\tau^2}
\log^2{ \biggl(\frac{2{\rm fat}
_{\frac{\eps}{2}}(\Eff,\Omega)}{\tau^2 \eps}\biggr) },
\end{equation*}
and the claim follows.
\par
\rightline{$\blacksquare$}
\par
\begin{Remark}
Using a different approach it is possible to improve Theorem
\ref{7} when one has  ``global" data on the fat-shattering
dimension of the class. For example, it is possible to recover
Gurvits' estimate for the convex hull of VC classes, with a much
simpler proof. This approach may be extended to a more general
setup. Indeed, if $\ff{\eps}=O(\eps^{-p})$ for $p \not =2$ then
${\rm fat}_\eps \bigl({\rm
conv}(F)\bigr)=O\bigl(\eps^{-\max\{2,p\}}\bigr)$. If $p=2$ then
${\rm fat}_\eps \bigl({\rm conv}(F)\bigr)=O(\eps^{-2}\log^4
\frac{1}{\eps})$. This result is demonstrated by analyzing the
growth rate of the Rademacher averages associated with the class
\citep[see][]{COLT01b} for further details).
\end{Remark}


\bibliography{crap}
\end{document}
