\NeedsTeXFormat{LaTeX2e}
\documentclass[twoside,11pt]{article}
\usepackage{jmlr2e}
\usepackage{natbib}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{theorem}
\usepackage{url}
\sloppy
\DeclareSymbolFontAlphabet{\Bbb}{AMSb}
\parskip0em



\newtheorem{example}{Example} 
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}[theorem]{Lemma} 
\newtheorem{proposition}[theorem]{Proposition} 
\newtheorem{remark}[theorem]{Remark}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{conjecture}[theorem]{Conjecture}
\newtheorem{axiom}[theorem]{Axiom}







\newcommand{\eins}{\boldsymbol{1}}

\newcommand{\T}{\Bbb{T}}    % Sonderzeichen f〉

\newcommand{\C}{\Bbb{C}}    % Sonderzeichen f〉

\newcommand{\N}{\Bbb{N}}    % Sonderzeichen f〉
\newcommand{\R}{\Bbb{R}}    % Sonderzeichen f〉
\newcommand{\Rp}{\Bbb{R}^+}    % Sonderzeichen f〉
\newcommand{\Rn}{\Bbb{R}^n}    % Sonderzeichen f〉
\newcommand{\K}{\Bbb{K}}    % Sonderzeichen f〉
\newcommand{\Z}{\Bbb{Z}}    % Sonderzeichen f〉
\newcommand{\Q}{\Bbb{Q}}    % Sonderzeichen f〉
\newcommand{\E}{\Bbb{E}}    % Sonderzeichen f〉


\newcommand{\opa}{\mbox{\rm Pr}^*}


\newcommand{\So}[1]{{\sf SO(\mbox{$#1$})}}
\newcommand{\On}[1]{{\sf ON(\mbox{$#1$})}}
\newcommand{\Gl}[1]{{\sf GL(\mbox{$#1$})}}

\newcommand{\ca}[1]{\mbox{${\mathcal #1}$}}
\newcommand{\rem}[1]{}

\newcommand{\iso}{=\hspace{-0.8em}^{^1}\hspace{0.4em}}
\newcommand{\isoin}{\hookrightarrow\hspace{-1em}^{^1}\hspace{0.6em}}


\newenvironment{abstrakt}{\small}{}

\newenvironment{beispiel}{\small\it}{}


\def \B         {{\cal B}}                % Borelsche s-Algebra

\def \Om        { \Omega }
\def \om        { \omega }
\def \lb        { \lambda }
\def \a         { \alpha }
\def \b         { \beta }
\def \g         { \gamma }
\def \e         { \varepsilon }
\def \r         { \rho }
\def \s         { \sigma }
\def \vs        { \varsigma }
\def \t         { \tau }
\def \et        { \tilde{\eta}}
\def \d         { \delta }
\def \D         { \Delta }
\def \p         { \varphi }
\def \x         { \xi }
\def \P         { \Phi }


\newcommand{\ld}{\log_2}
\newcommand{\loq}{\log_m}
\newcommand{\ed}{\exp_2}

\newcommand{\edot}{\!\cdot\!}
\newcommand{\dsp}{S\edot P}
\newcommand{\jin}{j\in\{0,+\}}
\newcommand{\iin}{i\in\{-1,1\}}


\newcommand{\Sim}{\ \sim \ }
\newcommand{\Preceq}{\ \preceq \ }
\newcommand{\Succeq}{\ \succeq \ }
\newcommand{\Leq}{\ \leq \ }
\newcommand{\Geq}{\ \geq \ }
\newcommand{\Eq}{\ = \ }
\newcommand{\Defi}{\ := \ }


\newcommand{\Lfloor}{\left\lfloor}
\newcommand{\Rfloor}{\right\rfloor}



\newcommand{\Bew}{\noindent{\bf Proof\ }}
\newcommand{\Bewo}[1]{\noindent{\bf Proof #1\ }}
\newcommand{\eBew}{$\hfill\BlackBox$\\[2mm]}

\newcommand{\sn}{\s_{n-1}}

\newcommand{\Int}{\int\limits}
\newcommand{\Sum}{\sum\limits}

\DeclareMathOperator{\co}{co}
\DeclareMathOperator{\sign}{sign}
\DeclareMathOperator{\diam}{diam}

\newcommand{\modc}[2]{\mbox{$\omega (#1,#2)$}}


\newcommand{\im}[1]{\mbox{${\rm {im}} \ #1$}}
\newcommand{\Ker}[1]{\mbox{${\mathfrak {Ker}} \ #1$}}
\newcommand{\rank}[1]{{\rm rank} \ #1}
%\newcommand{\rank}[1]{\mbox{\rm rank $#1$}}
\newcommand{\codim}[1]{\mbox{\rm codim $#1$}}
\newcommand{\vol}{\mbox{\rm vol}}
\DeclareMathOperator{\spann}{span}
\newcommand{\card}{\mbox{\rm card }}
\newcommand{\dn}[2]{\mbox{$d_{\| .\|}(#1,#2)$}}

\newcommand{\ol}[1]{\overline{#1}}

\DeclareMathOperator{\vcdim}{VC-dim}
\DeclareMathOperator{\pot}{pot}


\newcommand{\id}{\mbox{${\rm {id}}$}}

\newcommand{\posku}[2]{\mbox{$\ca W_{#1}(#2)$}}
\newcommand{\einku}[2]{\mbox{$\ca W_{#1}^1(#2)$}}

\newcommand{\btens}{\bigotimes}
\newcommand{\tens}{\otimes}
\newcommand{\xtens}[1]{\otimes_{_{\triangle_{#1}}}}
\newcommand{\afalt}{\circledast}


%Normen und Rеme

\newcommand{\lt}[1]{\mbox{${\ell}_2 (#1)$}}

\newcommand{\lto}{\mbox{${\ell}_2$}}
%Der Raum Lo

\newcommand{\Lo}{\mbox{${L}_o (\mu)$}}
\newcommand{\Lom}[1]{\mbox{${L}_o (#1)$}}
\newcommand{\Lox}[1]{\mbox{${L}_o (\mu,#1)$}}
\newcommand{\Loxinf}[1]{{\sf L\mbox{$_o (\mu,#1)$}}}
\newcommand{\Lof}{\mbox{${L}_o (\mu,F)$}}
\newcommand{\onorm}[1]{\mbox{$\left\Vert #1 \right\Vert_{_0}$}}

%Der Raum Lps

\newcommand{\Lps}{\mbox{${L}_{p,\infty} (\mu)$}}
\newcommand{\Lqs}{\mbox{${L}_{q,\infty} (\mu)$}}
\newcommand{\Lpsm}[1]{\mbox{${L}_{p,\infty} (#1)$}}
\newcommand{\Lxsm}[2]{\mbox{${L}_{#1,\infty} (#2)$}}
\newcommand{\psno}[1]{\mbox{$\left\Vert #1 \right\Vert_{p,\infty}$}}
\newcommand{\qsno}[1]{\mbox{$\left\Vert #1 \right\Vert_{q,\infty}$}}


% Der Raum Lp

\newcommand{\Lp}{\mbox{${L}_{p} (\mu)$}}
\newcommand{\Lpm}[1]{\mbox{${L}_{p} (#1)$}}
\newcommand{\Lx}[1]{\mbox{${L}_{#1} (\mu)$}}
\newcommand{\Lxn}[1]{\mbox{${L}_{#1} (\nu)$}}
\newcommand{\Lpx}[3]{\mbox{${L}_{#1} (#2,#3)$}}
\newcommand{\Le}{\mbox{${L}_{1} (\mu)$}}
\newcommand{\Li}{\mbox{${L}_{\infty} (\mu)$}}
\newcommand{\Lpmx}[2]{\mbox{${L}_{#1} (#2)$}}
\newcommand{\Lpmxge}[2]{\mbox{${L}_{#1}^{^{\geq 0}} (#2)$}}
\newcommand{\Lpmxpo}[2]{\mbox{${L}_{#1}^{^{> 0}} (#2)$}}
\newcommand{\wLpx}[3]{\mbox{$w^*$-$L_{#1} (#2,#3')$}}
\newcommand{\wsLpx}[3]{\mbox{$w^*$-$\ca L_{#1} (#2,#3')$}}
\newcommand{\wN}[3]{\mbox{$w^*$-$\ca N_{#1} (#2,#3')$}}


\newcommand{\pnorm}[1]{\mbox{$\left\Vert #1 \right\Vert_{p}$}}
\newcommand{\norm}[1] {\Bigl\Vert #1 \Bigr\Vert}
\newcommand{\Norm}[1] {\left\Vert #1 \right\Vert}
\newcommand{\snorm}[1] {\Vert #1 \Vert}

\newcommand{\enorm}[1]{\mbox{$\left\Vert #1 \right\Vert_{E}$}}
\newcommand{\fnorm}[1]{\mbox{$\left\Vert #1 \right\Vert_{F}$}}
\newcommand{\xnorm}[3]{\mbox{$\left\Vert #1 \right\Vert_{{#2}}^{#3}$}}
\newcommand{\ynorm}[2]{\mbox{$\left\Vert #1 \right\Vert_{{#2}}$}}
\newcommand{\betr}[1]{\mbox{$\left\vert #1 \right\vert$}}

\newcommand{\M}[4]{\mbox{$\ca M (\Lpx #1 G #3,\Lpx #2 G #4)$}}

\newcommand{\einorm}[1]{\mbox{$\left\Vert #1 \right\Vert_{1}$}}
\newcommand{\inorm}[1]{\mbox{$\left\Vert #1 \right\Vert_{\infty}$}}
\newcommand{\isnorm}[1]{\mbox{$\bigl\Vert #1 \bigr\Vert_{\infty}$}}

\newcommand{\Coo}[1]{\mbox{${C}_{oo} (#1)$ }}
\newcommand{\supp}[1]{\mbox{{\rm supp} $#1$}}
\newcommand{\sL}[2]{\mbox{${\cal L}(#1,#2)$ }}
\newcommand{\sLp}[1]{\mbox{${\cal L}_p(\mu)$ }}


\newcommand{\vwu}[1]{\norm {#1} _r}
\newcommand{\vwurs}[1]{\norm {#1} _{r'}}
\newcommand{\mzr}[1]{{\sf m}_r(#1)}
\newcommand{\mzx}[2]{{\sf m}_{#2}(#1)}




\begin{document}


\jmlrheading{2}{2001}{67-93}{08/01}{12/01}{Steinwart}
\ShortHeadings{On the Consistency of Support Vector Machines}{Steinwart}
\firstpageno{67}


\title{On the Influence of the Kernel on the Consistency of Support Vector Machines}
\author{\name Ingo Steinwart \email steinwart@minet.uni-jena.de\\
	\addr Mathematisches Institut\\
	      Friedrich-Schiller-Universit\"at\\
	      Ernst-Abbe-Platz 1-4\\
	      07743 Jena, Germany}
\editor{Bernhard Sch\"olkopf (for \url{HTTP://WWW.KERNEL-MACHINES.ORG})}





\maketitle



\begin{abstract}%
In this article we study the generalization abilities
of several classifiers of support vector machine (SVM) type using a certain class
of kernels that we call universal. It is shown that the soft margin algorithms with universal kernels
are consistent for
a large class of classification problems including some kind of noisy tasks provided that the 
regularization parameter is chosen well. In particular we derive a simple
sufficient condition for this parameter in the case of Gaussian RBF kernels.
On the one hand our considerations are based on an investigation of an 
approximation property---the so-called universality---of the used kernels 
that ensures that all continuous functions can be approximated by certain kernel expressions.
This approximation property
also gives a new insight into the role of kernels in these and other
algorithms. On the other hand the results are achieved by a precise study of the
underlying optimization problems of the classifiers.
Furthermore, we show consistency for the maximal margin classifier as well as for the soft margin SVM's
in the presence of large margins. In this case it turns out that also constant regularization parameters
ensure consistency for the soft margin SVM's. Finally we prove that even for simple, noise free 
classification problems SVM's with polynomial kernels can behave arbitrarily badly.
\end{abstract}


\begin{keywords}
Computational learning theory, pattern recognition, PAC model,
support vector machines, kernel methods
\end{keywords}



\section{Introduction}



Support vector machines comprise a class of learning algorithms
originally introduced for pattern recognition problems.
Although their development was motivated by results of statistical
learning theory the known bounds on their generalization performance
are not fully satisfactory.
In particular, the influence of the chosen kernel is far from being
completely understood.
The aim of this paper is to give a new insight into the role of the kernels.
Our considerations are mainly based on a certain
approximation property of various standard kernels that generate
function classes with infinite VC-dimension. Since in this case classical
Vapnik-Chervonenkis theory fails to be applicable for support
vector machines, other concepts
such as data dependent structural risk minimization, e.g.\ in terms of the observed
margin, were introduced \citep*[cf.][chap.~2]{SBWA,BST,CST}.
The latter usually needs large margins on the training sets to provide good bounds.
It is, however, open which distributions and kernels guarantee this
assumption. A systematical study of this question is the starting point of
this paper. The resulting techniques allow us to show new bounds
for the generalization performance of several standard support vector classifiers.

We begin with a description of the problem of pattern
recognition \citep*[cf.][]{Vap,CST}.
Let $(X,d)$ be a compact metric space\footnote{For mathematical notions see Section 2.},
$Y:= \{-1,1\}$ and $P$ be a probability measure on $X\times Y$,
where $X$ is equipped with the Borel $\s$-algebra.
By disintegration \citep[cf.][Lem.~1.2.1.]{Dud} there exists
a map $x\mapsto P(\,.\,|x)$ from $X$ into the set of all probability measures on $Y$
such that $P$ is the joint distribution of $(P(\,.\,|x))_x$ and of the
marginal distribution $P_X$ of $P$ on $X$.
We call $P(\,.\,|.)$, which is in fact a regular conditional probability, the {\em supervisor}.
A classifier is an algorithm that constructs to every {\em training set}
$T=((x_1,y_1),\dots,(x_n,y_n))\in (X\times Y)^n$ a {\em decision function}
$f_T:X \to Y$. In our context it
is always assumed that $T$
is i.i.d.\ according to $P$, which itself is unknown.
Then the decision function $f_T:X\to Y$ constructed by the classifier should guarantee
a small probability for the misclassification
of an example $(x,y)$ randomly generated according to $P$. Here, misclassification means
$f(x) \neq y$.
To make this precise, for a measurable function
$f:X\to \{-1,1\}$ we define the risk of $f$ by
$$
\ca R_{P}(f) \Defi \Int_{X\times Y} \eins_{\{f(x) \neq y\}}\ P(dx,dy) \Eq
P\bigl(\{(x,y): f(x)\neq y\}\bigr)\ .
$$
When considering noisy supervisors we cannot expect that we obtain zero risk.
Indeed, for
\begin{eqnarray*}
B_1(P) & := & \bigl\{x\in X\ : \ P(y=1|x) > P(y=-1|x)\, \bigr\}\\
B_{-1}(P) & := & \bigl\{x\in X\ : \ P(y=1|x) < P(y=-1|x)\, \bigr\}\\
B_0(P) & := & \bigl\{x\in X\ : \ P(y=1|x) = P(y=-1|x)\, \bigr\}
\end{eqnarray*}
and a function $f_0:X\to \{-1,1\}$ with
$f_0(x)=1$ if $x\in B_1(P)$ and $f_0(x)=-1$ if $x\in B_{-1}(P)$
we have \citep[cf.][Thm.~2.1.]{DGL}
\begin{equation}\label{Bayesrisk}
 \ca R_{P}(f_0)  \Eq
\inf\bigl\{\ca R_{P}(f) \ :\  f\!:\!X\to \{-1,1\} \mbox{ measurable}\bigr\}
\Eq
\Int_X p(x)\, P_X(dx)\ ,
\end{equation}
where the {\em noise level} $p:X\to \R$ is defined by
$p(x):= P(y=-1|x)$ for $x\in B_1(P)$, $p(x):= P(y=1|x)$ for $x\in B_{-1}(P)$
and $p(x)=1/2$ otherwise. Equation (\ref{Bayesrisk}) shows that
no function can yield less risk than $f_0$.
The function $f_0$ is called  an optimal Bayes decision rule and we write
$\ca R_{P} := \ca R_{P}(f_0)$.
Now, a classifier $\ca C$ should guarantee with high probability that $\ca R_{P}(f_T)$
is close to $\ca R_{P}$ provided that $T$ is
large enough. Here, $f_T$ denotes the decision function
constructed by $\ca C$ on the basis of $T$.
Asymptotically, this means that
\begin{equation*}
\ca R_{P}(f_T) \to \ca R_{P}
\end{equation*}
should hold in probability if $|T|\to\infty$.
In this case the algorithm $\ca C$ is called {\em consistent} for $P$ \citep[cf.][Def.~6.1]{DGL}.
If a classifier is consistent for all distributions on $X\times Y$ it is said to
be universally consistent. Although several algorithms such as the $k$-nearest neighbour
classifier
for $k\to \infty$ and $k/|T| \to 0$ \citep[cf.][Thm.~6.4]{DGL} are universally
consistent it is an open question whether support vector machines are universally
consistent for a particular choice of the free parameters.

In this article we show
that at least for a restricted class of distributions SVM's are consistent
provided that the parameters are chosen in a specific manner. In particular our results
cover both the noise free case and the case of constant noise level,
i.e $p \equiv p^*$, $p^*\in [0,1/2)$.
Our results are based on an investigation of a certain approximation property of kernels.
Recall that the ansatz of SVM's is to solve specific optimization problems
over the class of functions $\{\langle w,\P(.)\rangle : w\in H\}$
(or $\{\langle w,\P(.)\rangle + b : w\in H, b\in \R\}$),
where $\P:X\to H$ is a feature map of the used kernel.
If this function class is dense in $C(X)$ we shall call the corresponding kernel {\em universal}
(cf.\ Def.\ \ref{Defuniv}).
Roughly speaking this notion enables us to approximate the Bayes decision rule in probability, a fact that is
frequently used in our proofs of consistency. Using the approximation theorem of
Stone-Weierstra\ss\ we show that kernels that can be expanded in certain types of
Taylor or Fourier series are universal (cf.\ Cor.\ \ref{taylorexp} and Cor.\ \ref{fourierexp}).
In particular it turns out that the Gaussian RBF kernel is universal (cf.\ Ex. \ref{gaussexp}).
Besides the importance of the notion of universality in the context of
consistency it also turns out that this concept has strong implications
for the geometric interpretation of the shape of the feature map 
(cf.\ Cor.\ \ref{sepcpct} and the following remark).

Since the class of functions implemented by an SVM with universal kernel
is very rich the problem of overfitting can always occur in the presence of noise.
Thus it is very important to know how to chose the regularization parameter. The second part of this work is
devoted to this question. Here, we show in particular that for a soft margin SVM
with Gaussian RBF kernel on $X\subset \R^d$ the regularization sequence
$c_n=n^{\b-1}$, $0<\b<1/d$, yields consistency for all problems with constant noise level
(cf.\ Cor.\ \ref{Cor1} and Cor.\ \ref{Cor3}). These results are of special
interest since they show at the first time that SVM's are able to learn noisy
problems arbitrarily well.
Moreover, we prove that for problems that ensure a large margin it suffices to use universal kernels
and a regularization parameter that is {\em independent} of the training set size
(cf.\ Thm.\ \ref{HS3}, Thm.\ \ref{HS4} and Thm.\ \ref{HS7}). For this class of
problems we also prove consistency for the maximal margin classifier
(cf.\ Thm.\ \ref{HS8}). Finally, it turns out that even for these
simple, noise free classification problems SVM's with polynomial kernels
can behave arbitrarily badly (cf.\ Prop.\ \ref{Pro1}).

This work is organized as follows:
we introduce some mathematical notions in Section \ref{prelim}.
In the third section
we study the concept of universal kernels.
The following sections are devoted to applications of these kernels to support vector
machines. We begin with the 2-soft margin classifier in Section 4. Here, consistency results
for both noisy distributions and problems ensuring a large margin are proved. In the
fifth section
we show that these results also hold for the 1-soft margin algorithm. In the last section
we discuss the maximal margin classifier in the presence of large margins.







\section{Preliminaries}\label{prelim}





For a set $X$, a metric $d$ on $X$ is a function
$d:X\times X\to [0,\infty)$ such that for all $x,y,z\in X$ we have
$d(x,y)=d(y,x)$ and $d(x,y)  \leq  d(x,z)+d(z,y)$ as well as $d(x,y) =0$ if and
only if $x=y$.
We denote
the closed ball with radius $\e$ and centre $x$ by $B_d(x,\e):=\{y\in X:d(x,y)\leq \e\}$.
The covering numbers of $X$ are defined by
$$
\ca N((X,d),\e) \Defi \min\Bigl\{n\in \N\cup \{\infty\}\ : \
\exists x_1,\dots,x_n \mbox{ with }
X\subset \bigcup_{i=1}^n B_d(x_i,\e)\Bigr\}
$$
for all $\e>0$. The space $(X,d)$ is precompact if and only if $\ca N((X,d),\e)$
is finite for all $\e>0$.
Moreover, $X$ is called compact if every open covering of $X$ has a finite
subcovering.
If the space $(X,d)$ is complete, i.e.\ every Cauchy
sequence converges in $X$, then $X$ is compact if and only if
$X$ is precompact.
For given $A,B\subset X$ we denote the distance of $A$ and $B$
by
$$
d(A,B) \ :=\ \inf_{\substack{x\in A\\ y\in B}} d(x,y).
$$
We often use that if $A$ is closed, $B$ is compact and both sets are disjoint then
$d(A,B)>0$ holds. 

A $\s$-algebra on a set $X$ is a set of subsets of $X$ that contains $\emptyset$
and is closed under elementary, countable set operations such as complements and countable
intersections. For a metric space $(X,d)$ the Borel $\s$-algebra is
the smallest $\s$-algebra that contains all open sets. Let $\ca A$ be a $\s$-algebra on $X$.
A subset $A$ of $X$ is called measurable if $A\in \ca A$.
We say that
a function $f:X\to \R$ is measurable if the pre-image of every Borel measurable $B\subset \R$
is in $\ca A$. Basic examples are the functions $\eins_A$, where $A\in \ca A$ and
$\eins_A(x)=1$ if $x\in A$ and $\eins_A(x)=0$ otherwise.
A probability measure $P :\ca A\to \Rp$
is a $\s$-additive function with $P(\emptyset)=0$ and $P(X)=1$.
If $\ca A$ is a Borel $\s$-algebra we call $P$ a Borel probability measure.
In this case $P$ is said to be regular, if for
all Borel measurable $B\subset X$ we have
$$
P(B) \Eq \sup \{P(K): K\subset B \mbox{, $K$ compact}\}\ .
$$
If $(X,d)$ is compact, then every Borel probability measure on $X$
is regular \citep[cf.][p.~176]{Dud}.

In this paper $H$ always denotes a Hilbert space, i.e.\ a complete, normed
vector space endowed with a dot product $\langle.,.\rangle$ giving rise to
its norm via $\snorm x = \sqrt{\langle x,x\rangle}$.
Let $B_H:=\{x\in H: \snorm x \leq 1\}$ be the closed unit ball of $H$ and
$S_H:=\{x\in H: \snorm x = 1\}$ be its sphere.
Recall, that every separable Hilbert space is isometrically isomorphic to
the space of all 2-summable sequences $\lto$.

A commutative algebra $A$ is a vector space equipped with an
additional associative and commutative multiplication $\cdot:A\times A\to A$
such that
\begin{eqnarray*}
x\edot (y+z) & = & x\edot y+x\edot z\\
\lb (x\edot y) & = & (\lb x)\edot y
\end{eqnarray*}
holds for all $x,y,z\in A$ and $\lb\in \R$.
A classical example of an algebra is the space $C(X)$ of all
continuous functions $f:X\to \R$ on the compact metric space $(X,d)$
endowed with the usual supremum norm
$$
\inorm f \Defi \sup_{x\in X} |f(x)|\ .
$$
The following well-known approximation theorem of
Stone-Weierstra\ss \  \citep[cf.][Cor.~4.3.5.]{Ped} states
that certain subalgebras of $C(X)$ generate the whole space.
This result will be the key tool when considering approximation
properties of kernels in the next section:




\begin{theorem}\label{StoneWeier}
Let $(X,d)$ be a compact metric space and $A\subset C(X)$ be an algebra.
Then $A$ is dense in $C(X)$ if both
$A$ does not vanish, i.e.\ for all $x\in X$ there exists
an $f\in A$ with $f(x)\neq 0$, and $A$ separates points, i.e.\
for all $x,y\in X$ with $x\neq y$ there exists an
$f\in A$ with $f(x)\neq f(y)$.
\end{theorem}






\section{Kernels}\label{kernels}


In the following let $(X,d)$ be a metric space.
A function $k:X\times X\to \R$ is called a kernel on $X$ if there exists a Hilbert
space $H$ and a map $\P:X\to H$ with
$$
k(x,y) \ =\  \langle\P(x),\P(y)\rangle
$$
for all $x,y\in X$. We call $\P$ a {\em feature map} and $H$ a {\em feature space} of $k$.
Note, that both $H$ and $\P$ are far from being unique.
However, for a given kernel there exists a canonical feature space (with associated
feature map), which is the so-called reproducing kernel Hilbert space (RKHS)
\citep*[cf.][Ch.~3]{CST}. Since our ansatz is mainly based on specific series
expansions of certain kernels we do not need to consider these spaces.

Let $k$ be a kernel on $X$ and $\P:X\to H$ be a feature map of $k$.
A function $f:X\to \R$ is {\em induced} by $k$
if there exists an element $w\in H$ such that
$f=\langle w,\P(.)\rangle$.
The next lemma shows that this
definition is independent of $\P$ and $H$:




\begin{lemma}\label{sameHyper}
Let $k:X\times X\to \R$ be a kernel and $\P_1:X\to H_1$, $\P_2:X\to H_2$ be two
feature maps of $k$. Then for all $w_1\in H_1$ there exist $w_2\in H_2$ with
$\snorm {w_2} \leq \snorm{w_1}$ and
\begin{displaymath}
\langle w_1,\P_1(x)\rangle \ = \ \langle w_2,\P_2(x)\rangle\eqno{\mbox{for all }x\in X.}
\end{displaymath}\\[-8mm]
\end{lemma}

\begin{proof} 
Let $H_1^*:= \overline{\spann {\P_1(X)}}$ and $\tilde H_1$ its orthogonal
complement in $H_1$. Then $w_1\in H_1$ can be written as $w_1=w_1^*+\tilde w_1$
with $w_1^*\in H_1^*$ and $\tilde w_1\in \tilde H_1$.
Given an $x\in X$ we have $\langle \tilde w_1,\P_1(x)\rangle =0$ and therefore
we obtain
$\langle w_1^*,\P_1(x)\rangle  =  \langle w_1,\P_1(x)\rangle$
for all $x\in X$.
Now by the definition of $H_1^*$ there exists a sequence $(w_n^{(1)})\subset \spann {\P_1(X)}$
with $w_n^{(1)}= \sum_{m=1}^{m_n} \lb_m^{(n)} \P_1(x_m^{(n)})$ and
$w_1^*= \sum_{n=1}^\infty w_n^{(1)}$.
Then for $w_n^{(2)}:= \sum_{m=1}^{m_n} \lb_m^{(n)} \P_2(x_m^{(n)})$ and $l_2\geq l_1\geq 1$
we obtain
\begin{eqnarray*}
 \norm{\sum_{n=l_1}^{l_2}w_n^{(1)}}^2
& =& \sum_{n=l_1}^{l_2}\sum_{m=1}^{m_n}  \sum_{i=l_1}^{l_2}\sum_{j=1}^{m_i}
\ \lb_m^{(n)} \lb_j^{(i)}\langle \P_1(x_m^{(n)}),\P_1(x_j^{(i)})\rangle\\
& =& \sum_{n=l_1}^{l_2}\sum_{m=1}^{m_n}  \sum_{i=l_1}^{l_2}\sum_{j=1}^{m_i}
\ \lb_m^{(n)} \lb_j^{(i)}\langle \P_2(x_m^{(n)}),\P_2(x_j^{(i)})\rangle\\
& = & \norm{\sum_{n=l_1}^{l_2}w_n^{(2)}}^2 \ .
\end{eqnarray*}
Therefore, $(\sum_{n=1}^m w_n^{(2)})_{m\geq 1}$ is a Cauchy sequence and hence converges
to $w_2:= \sum_{n=1}^\infty w_n^{(2)} \in H_2$.
Clearly, we then have $\snorm{w_2}=\snorm{w_1^*}\leq\snorm{w_1}$.
Moreover, an easy calculation similar to the consideration above shows
$\langle w_1,\P_1(x)\rangle = \langle  w_2,\P_2(x)\rangle$ for all $x\in X$. 
\end{proof}



\noindent In the following we only consider continuous kernels. The
following lemma provides some useful properties of this class:




\begin{lemma}\label{contkernel}
Let $k$ be a kernel on the metric space $(X,d)$
and $\P:X\to H$ be a feature map of $k$.
Then $k$ is continuous if and only if $\P$ is continuous. In this case
$$
d_k (x,y)\Defi\Norm{\P(x)-\P(y)}
$$
defines a semi-metric on $X$ such that the identity map $\id:(X,d)\to (X,d_k)$ is continuous.
If $\P$ is injective $d_k$ is even a metric.
\end{lemma}

\begin{proof} Let us first suppose that $k$ is continuous. Since
$$
d_k(x,y)\Eq\sqrt{k(x,x)-2k(x,y)+k(y,y)}
$$
we observe that $d_k(x,.):(X,d)\to \R$ is continuous for every $x\in X$.
In particular, $\{y\in X : d_k(x,y)<\e\}$
is open with respect to $d$ and therefore $\id:(X,d)\to (X,d_k)$ is continuous.
Furthermore, $\P:(X,d_k)\to H$ is continuous and hence $\P:(X,d)\to H$ is also continuous.\\
Conversely, assume that $\P$ is continuous. Since for all $x,x',y,y'\in X$
we have
\begin{eqnarray*}
|k(x,y)-k(x',y')| & \leq &
|\langle \P(x),\P(y)-\P(y')\rangle| + |\langle \P(x)-\P(x'),\P(y')\rangle|\\
& \leq & \Norm{\P(x)}\edot \Norm{\P(y)\!-\!\P(y')} + \Norm{\P(y')}\edot \Norm{\P(x)\!-\!\P(x')} 
\end{eqnarray*}
it is easily verified that $k$ is also continuous.\end{proof}





%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%                 separierende und universelle Kerne

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%



\noindent The metric $d_k$ enjoys the property that every induced function $\langle w,\P(.)\rangle$
is Lipschitz continuous with respect to $d_k$ and the Lipschitz constant is
bounded from above by $\snorm w$. This fact turns out to be very important
in the proof of Theorem \ref{HS1} since it allows us to control the behaviour of
solutions of SVM's on subsets of small diameters. 

From the last lemma we know in particular that
for a continuous kernel every induced function is continuous. The following
definition plays a central role throughout this paper:

\begin{definition}\label{Defuniv}
A continuous kernel $k$ on a compact metric space $(X,d)$ is called {\em universal} if
the space of all functions induced by $k$ is dense in $C(X)$, i.e.
for every
function $f\in C(X)$ and every $\e>0$ there exists a function $g$ induced by $k$
with
$$
\inorm{f-g} \Leq \e\ .
$$
\end{definition}


\noindent We also need a weaker concept. Let
$\P:X\to H$ be a feature map of $k$
and $A$, $B$ be disjoint subsets of $X$. We say that $k$
{\em separates $A$ and $B$ with margin $\g\geq 0$} if
$\P(A)$ and $\P(B)$ can be separated by a hyperplane with margin $\g$, i.e.\
if there exists a pair $(w,b)\in S_H\times \R$ such that
$$
\begin{array}{rcll}
\langle w,\P(x)\rangle + b & >  &\g  & \qquad \qquad \mbox{for all } x\in A \mbox{ and}\\
\langle w,\P(y)\rangle + b & <  &-\g  & \qquad \qquad \mbox{for all } y\in B\ .
\end{array}
$$
If $\g=0$ we simply say that $k$ separates $A$ and $B$. In this case
the restriction $w\in S_H$ is superfluous.
By Lemma \ref{sameHyper} both definitions are independent of the feature map $\P$.
We say that
the kernel $k$ separates all finite, resp.\ compact subsets if it separates all
finite, resp.\ compact disjoint subsets of $X$.
Note, that if $k$ separates {\em compact} sets $A$ and $B$ then it automatically
separates them with a suitable margin $\g>0$.
Moreover, it was shown in \citet[][Ex.~3.13]{Ste} that there exists a continuous kernel
that separates all finite subsets but fails to separate all compact subsets.

Before we investigate which kernels are universal we collect some useful properties
of these kernels. Firstly, let $(X,d)$ and $(X',d')$
be compact metric spaces, $k$ be a universal kernel on $X$
and $\iota:X'\to X$ be a continuous and injective map.
Then one easily checks that $k(\iota(.),\iota(.))$ is a universal
kernel on $X'$. Moreover, we
have $k(x,x)>0$ for all $x\in X$ since $k(y,y) = 0$ implies $g(y)=0$
for all induced functions $g$.
Since all feature maps of $k$ are continuous and $X$ is compact we may also restrict ourselves
to separable feature spaces of $k$.
The next proposition is fundamental for our considerations of
support vector machines:



\begin{proposition}\label{approxstep}
Let $(X,d)$ be a compact metric space and $k$ be a universal
kernel on $X$. Then for all compact and mutually disjoint
subsets $K_1,\dots,K_n \subset X$, all $\a_1,\dots,\a_n \in \R$
and all $\e>0$ there exists a function $g$ induced by $k$ with $\inorm g \leq \max_i |\a_i| + \e$ such that
$$
\norm{g_{|K} - \Sum_{i=1}^n \a_i \eins_{K_i}}_\infty \leq \e\ ,
$$
where $K:= \bigcup_{i=1}^n K_i$ and $g_{|K}$ denotes the restriction of $g$ to $K$.
\end{proposition}

\begin{proof} Since $d(K_i,K_j) > 0$ for all $i\neq j$ we obtain
$\sum_{i=1}^n \a_i \eins_{K_i} \in C(K)$.
Since this function can be extended to a continuous function $f$ on $X$
with
$\inorm f \leq \max_i |\a_i|$
(by the Lemma of Urysohn or by a direct construction with the help of $d$)
the assertion follows. \end{proof}



\begin{corollary}\label{sepcpct}
Every universal kernel separates all compact subsets.
\end{corollary}

\begin{proof} Let $(X,d)$ be a compact metric space and $k$ be a universal kernel on $X$ with
feature map $\P:X\to H$.
Given two compact and disjoint subsets $K_1$ and $K_{-1}$ of $X$
there exists an induced function $g=\langle w,\P(.)\rangle$
with $\inorm{g_{|K_{-1}\cup K_1}- (\eins_{K_1}-\eins_{K_{-1}})} < 1/2$.
This implies that
$({\Norm w}^{-1} w,0)$
separates $K_1$ and $K_{-1}$ with margin $\frac 1{2\Norm w}$. \end{proof}

\noindent Although the previous corollary is an almost trivial consequence
of the notion of universality it has surprising implications for
the geometric interpretation of the shape of the feature map:
let us suppose that we have a finite subset $\{x_1,\dots,x_n\}$ of $X$. Then the
above corollary ensures that for every sequence of signs $y_1,\dots,y_n$ the
corresponding training set can be correctly separated by a hyperplane in the
feature space. Moreover, this can even be done
by a hyperplane that has almost the same distance to every point of $\{x_1,\dots,x_n\}$.
Therefore, any finite dimensional interpretation of the geometric situation
in a feature space of a universal kernel must fail. In particular this holds for
2- or 3-dimensional drawings.
(Actually, the shape of the feature map is even more
complicated since not only all finite subsets but every
pair of compact disjoint subsets can be separated.)

The following corollary ensures in particular that the semi-metric $d_k$
induced by a universal kernel $k$ is in fact a metric:

\begin{corollary}
Every feature map of a universal kernel is injective.
\end{corollary}

\begin{proof} Finite subsets are compact and thus the assertion follows by the previous
corollary. \end{proof}


\begin{proposition}\label{normkern}
Let $(X,d)$ be a compact metric space and $k$ be a universal kernel on $X$.
Then
$$
k^*(x,y)\Defi \frac{k(x,y)}{\sqrt{k(x,x)k(y,y)}}
$$
defines a universal kernel on $X$.
\end{proposition}


\begin{proof} Let $\P:X\to H$ be a feature map of $k$ and $\a(x):= k(x,x)^{-1/2}$.
Clearly, $\a\P:X\to H$ is a feature map of $k$ and thus $k$ is a kernel.
To show that $k^*$ is universal we fix a function $f\in C(X)$ and $\e>0$.
For $a:=\inorm{\a}$ we then get an induced function $g=\langle w,\P(.)\rangle$
with $\isnorm{ \a^{-1} f -g} \leq \frac \e a$. This yields
$$
\inorm{f-\langle w ,\a\P(.)\rangle}
\Leq \inorm{\a}\  \isnorm{\a^{-1} f -g} \Leq \e 
$$
and thus the assertion is proved. \end{proof}








\noindent Up to now we do not know whether there exist universal kernels.
To attack this question we begin with a simple criterion that makes it possible to
check whether a given kernel is universal:




\begin{theorem}\label{kernuniv}
Let $(X,d)$ be a compact metric space and $k$ be a continuous kernel on $X$ with
$k(x,x)>0$ for all $x\in X$. Suppose that we have an injective feature map
$\P:X\to \lto$ of $k$ with $\P(x)=(\P_n(x))_{n\in \N}$.
If $A:=\spann{\{\P_n :  n\in \N\}}$ is an algebra then $k$ is universal .
\end{theorem}



\begin{proof}
Because of $k(x,x)>0$ for all $x\in X$ the algebra $A$
does not vanish. Since $k$ is continuous every
$\P_n:X\to \R$ is continuous by Lemma \ref{contkernel} and hence $A\subset C(X)$.
Moreover, $A$ is even dense in $C(X)$ since
the injectivity of $\P$ implies that $A$
separates points and thus
Theorem \ref{StoneWeier} can be applied. Now we fix $f\in C(X)$ and $\e>0$.
Then there exists a function
\begin{displaymath}
g=\sum_{j=1}^n\lb_j\cdot(\P_{n_j})  \ \in \ A
\end{displaymath}
such that $\inorm{f-g} \leq \e$. However, if we define
$w_n:= \lb_j$ for $n=n_j$ and $w_n:= 0$ otherwise,
we have $w:=(w_n) \in \lto$ and $\langle w,\P(.)\rangle =g$. \end{proof}



\noindent The following corollaries give various examples of universal kernels.
We begin with kernels that can be expressed by a Taylor series:


\begin{corollary}\label{taylorexp}
Let $0<r\leq \infty$ and
$f:(-r,r)\to \R$ be a $C^\infty$-function that can be expanded into its Taylor series
in $0$, i.e.\
\begin{displaymath}
f(x) \Eq \sum_{n=0}^{\infty} a_n x^n \eqno{\mbox{for all } x\in (-r,r)\ .}
\end{displaymath}
Let $X:= \{x\in \R^d : \ynorm x 2 < \sqrt{\mbox{$r$}}\}$. If we have
$a_n > 0$ for all $n\geq 0$ then $k(x,y):=f(\langle x,y\rangle)$
defines a universal kernel on every compact subset of $X$.
\end{corollary}



\begin{proof} Since $|\langle x, y\rangle| \leq \ynorm x 2 \ynorm y 2 < r$ for all $x,y\in X$ we see that
$k$ is well-defined.
We also have
\begin{eqnarray*}
k(x,y)
& = & \sum_{n=0}^\infty a_n\ \Bigl(\sum_{k=1}^d x_k y_k\Bigr)^n\\
& = & \sum_{n=0}^\infty a_n\ \sum_{\substack{k_1+\dots+k_d=n\\k_1,\dots,k_d\geq 0}}
c_{k_1,\dots,k_d} \prod_{i=1}^d (x_iy_i)^{k_i}\\
& = & \sum_{k_1,\dots,k_d\geq 0} a_{k_1+\dots+k_d} c_{k_1,\dots,k_d}
\prod_{i=1}^d x_i^{k_i} \prod_{i=1}^d y_i^{k_i} \ ,
\end{eqnarray*}
where $c_{k_1,\dots,k_d}:= (\prod_{i=1}^d k_i!)^{-1}{(\sum_{i=1}^d k_i)!}$
\citep[cf.\ also][Lem.~2.1]{Pog}. Note, that
the series can be rearranged since it is absolutely
summable.
In particular, for $x=y$ we obtain that $\P:X  \to  \lt{\N_0^d}$ is well defined by
$$
\P(x) \Defi
\Bigl(\sqrt{a_{k_1+\dots+k_d} c_{k_1,\dots,k_d}} \prod_{i=1}^d x_i^{k_i}\Bigr)_{k_1,\dots,k_d\geq 0}\ .
$$
The above equation also shows that
$k(x,y)= \langle \P(x),\P(y) \rangle$ holds for all $x,y\in X$
and hence $k$ is indeed a kernel.
Moreover, $a_0>0$ implies $k(x,x)>0$ for all $x\in X$ and
trivially, $\P$ is injective. Since
$A:=\spann{\{\P_{k_1,\dots,k_d} :  k_1,\dots,k_d\geq 0\}}$
is an algebra
we thus obtain by Theorem \ref{kernuniv} that $k$ is universal. \end{proof}


\noindent Instead of Taylor series
one can also consider Fourier expansions. The result reads as follows:


\begin{corollary}\label{fourierexp}
Let $f:[0,2\pi] \to \R$ be a continuous function that can
be expanded
in a pointwise absolutely convergent Fourier series of the form
\begin{equation}\label{fourierh1}
f(t) \Eq \sum_{n=0}^\infty a_n \cos(n t)\ .
\end{equation}
If $a_n>0$ holds for all $n\geq 0$ then $k(x,y):=\prod_{i=1}^d
f(|x_i-y_i|)$
defines a universal kernel on every compact subset of $[0,2\pi)^d$.
\end{corollary}

\noindent Recall, that every function
$f:[0,2\pi] \to \R$ that can be extended to a continuous, symmetric,
periodic and piecewise continuously differentiable function on $\R$ has
a Fourier series of the form (\ref{fourierh1}).\\


\begin{proof} By induction and the Cauchy product of series we may restrict ourselves to $d=1$.
Then
\begin{displaymath}
k(x,y) = a_0  + \! \sum_{n=1}^\infty a_n \sin(n x)\sin(n y)
+\! \sum_{n=1}^\infty a_n \cos(n x)\cos(n y)
\end{displaymath}
holds for all $x,y\in [0,2\pi)$
and hence $\P=(\P_n)_{n\geq 0}$ defined by $\P_0(x):= a_0$ and
$\P_{2n-1}(x):= \sqrt{a_n} \sin (nx)$,
$\P_{2n}(x):= \sqrt{a_n} \cos (nx)$ for $n\geq 1$
is an injective feature map of $k$ with image in $\lto$.
Moreover,
$
A:=\spann \bigl(\{\sqrt{a_n} \sin(n\cdot . ) : n\geq 1\} \cup
 \{\sqrt{a_0} \cos(n \cdot .) :  n\geq 0\}\bigr)
$
is an algebra and since $a_0>0$ implies $k(x,x)>0$ for all $x\in X$
we obtain that $k$ is universal. \end{proof}





\noindent The following examples show that various well-known kernels are universal:




\begin{example}\label{gaussexp}
{\em The kernels $\exp(-\s^2 \xnorm{.-.} 2 2)$ and $\exp(\langle .,.\rangle)$ are
universal on every compact subset of $\R^d$.}
\end{example}
\begin{proof} The universality of $\exp(\langle .,.\rangle)$ is due to Corollary \ref{taylorexp}.
Therefore, by Proposition \ref{normkern} and
$\exp(-\s^2 \xnorm{x-y} 2 2) \Eq \exp(- \xnorm{\s x} 2 2) \exp(- \xnorm{\s y} 2 2)
\exp(\langle \sqrt 2 \s x,\sqrt 2 \s y\rangle)$
the assertion follows for the RBF kernel. \end{proof}




\begin{example}
{\em Let $X:=\{x\in \R^d :  \ynorm x 2 < \mbox{$1$}\}$ and $\a>0$.
Then V. Vovk's \citep[cf.][p.~15]{SSW} infinite polynomial kernel
$k(x,y):= (1-\langle x,y \rangle )^{-\a}$, $x,y\in X$, is universal on every compact subset of $X$.}
\end{example}
\begin{proof} To check the assertion
we use that $(1-t)^{-\a} = \sum_{n=0}^\infty \binom {-\a} n (-1)^n t^n$
holds for $|t|<1$. Since $\binom {-\a} n (-1)^n > 0$ for all $n\geq 0$,
the assertion then follows by
Corollary \ref{taylorexp}. \end{proof}







\begin{example}
{\em Let $0<q<1$ and $f(t):= (1-q^2)/{(2-4q\cos t +2q^2)}$, $t\in \R$.
Then the stronger regularized Fourier kernel
$k(x,y):=\prod_{i=1}^d f(x_i-y_i)$
considered by \citet[][p.~470]{Vap} and \citet[][p.~15]{SSW} is universal on every compact subset of $[0,2\pi)^d$.}
\end{example}
\begin{proof} The assertion can be seen using Corollary \ref{fourierexp}
and $f(t)=1/2 + \sum_{n=1}^\infty q^n \cos(nt)$
\citep*[cf.][p.~68]{G-R}. \end{proof}






\begin{example}
{\em Let $0<q<\infty$ and
$f(t):=  \pi \cosh\bigl(({\pi-|t|})/ q\bigr)/\bigl({2q\sinh(\pi/q)}\bigr)$
for all $t$ with $-2\pi\leq t\leq 2\pi$. Then the weaker regularized Fourier kernel
$k(x,y):=\prod_{i=1}^d f(x_i-y_i)$
considered by \citet[][p.~470/1]{Vap} and \citet[][p.~15]{SSW} is
universal on every compact subset of $[0,2\pi)^d$.}
\end{example}
\begin{proof}
To obtain the assertion we use
$f(t)=1/2 + \sum_{n=1}^\infty \cos(nt)/({1+q^2n^2})$
\citep*[cf.][p.~68]{G-R}. \end{proof}


















\section{The 2-norm soft margin classifier}\label{2smc}


Let $k$ be a kernel on $X$ and
$\P:X\to H$ be a feature map of $k$. For a training set
$T = ((x_1,y_1),\dots,(x_n,y_n)) \in (X \times Y)^n$ and $c_n>0$
we denote the unique
solution of the optimization problem
\begin{equation}\label{prim2sno}
\begin{array}{rll}
\!\!\!\!\!\!\mbox{minimize} & \qquad
\ca W(w,b,\xi) := \langle w,w\rangle + c_n\Sum_{i=1}^n\x_i^2 \qquad  &\mbox{over } w,b,\x\\
\!\!\!\!\!\!\mbox{subject to} & \qquad y_i(\langle w,\P(x_i)\rangle +b) \geq 1-\x_i,   &
i=1,\dots,n
\end{array}
\end{equation}
by $(w_{T}^{2,k,c_n},b_T^{2,k,c_n})\in H\times \R$. An algorithm $\ca C_k^{2,(c_n)}$
that provides the decision function
$$
f_T^{2,k,c_n}(x):= \sign(\langle w_T^{2,k,c_n},\P(x)\rangle +b_T^{2,k,c_n})\ , \qquad \qquad x\in X
$$
for every training set $T$ is called a 2-norm soft margin classifier (2-SMC)
with kernel $k$ and parameter sequence $(c_n)$.
Note, that in order to have a small set of free parameters one usually fixes
$c_n := c$ for all $n\geq 1$.
In this section it turns out that this is not suitable for problems that do no guarantee a large
margin. Instead one should use sequences $c_n = c n^{\b-1}$ where $\b>0$ is a parameter a-priori
determined by the kernel and $c$ is a new free parameter (cf.\ Cor.\ \ref{Cor1}). Of course, for fixed
training set sizes both parameterizations are equivalent, i.e.\ they can
be transformed into each other. 

By Lemma \ref{sameHyper} the decision function is
independent of the choice of the feature map $\P$. Moreover, $f_T^{2,k,c_n}$
can be expressed by
\begin{displaymath}
f_T^{2,k,c_n}(x) \Eq \sum_{i=1}^n y_i \a_i k(x_i,x) +b_T^{2,k,c_n},
\end{displaymath}
where $\a_i\geq 0$ are suitable constants depending on $T$ and $b_T^{2,k,c_n}$
can also be computed with the help of the kernel \citep*[cf.][]{CST,Vap,SHSW}.
Note, that if $k$ is a kernel on $X$
which separates all finite sets and
$X$ has infinitely many elements then the function class represented by the 2-SMC has infinite
VC-dimension.
For more information on this we refer to \citet*[][Ch.~4]{Vap},
\citet*[][Ch.~4]{CST} and \citet*[][Ch.~2.6]{V-W}.

Given a Borel probability measure $P$ on $X\times Y$ with noise level $p$ we denote the
nondeterministic part of the supervisor by
$X^+:=\{x\in X :  p(x)>0\}$. If $P_X(X^+)>0$ we write
$q^*:= \inf_{x\in X} p(x)$ and $p^* := \sup_{x\in X}p(x)$. Due to technical
reasons we define $q^*:= p^* := 1/4$ otherwise.
We begin with a preliminary result:


\begin{theorem}\label{HS1}
Let $(X,d)$ be a compact metric space and $k$ be a universal kernel on $X$.
Then for all Borel probability measures $P$ on $X\times Y$ with $q^*,p^* \in (0, 1/2)$
and all $\e>0$ there exist
$c^*>0$ and $\d^*>0$ such that for all $c\geq c^*$, $0<\d\leq\d^*$ and all $n\geq 1$ we have
\begin{displaymath}
\opa \Bigl(\Bigl\{T\in (X\times Y)^n :  \ca R_P
(f^{2,k,c/n}_{T})\leq \ca R_P + 4 \frac{p^*-q^*}{1-2q^*}P_X(X^+) + \e\Bigr\}\Bigr)
\Geq 1 - 3M e^{-2 \left(\frac \d M\right)^2 n}\ ,
\end{displaymath}
where $M := \ca N\bigl((X,d_k),\frac \d {\sqrt c}\bigr)$ is the covering number of $X$
with respect to the metric $d_k$ which is induced by the kernel $k$. Moreover,
$\opa$ denotes the outer probability measure
of $P^n$.
\end{theorem}

\noindent Note, that in order to avoid the (probably very difficult) question whether the sets
$$
\bigl\{T\in (X\times Y)^n :  \ca R_P (f^{2,k,c/n}_{T})\leq \a\bigr\}
$$
are measurable
we consider the outer probability measure, only.

Since the proof of Theorem \ref{HS1} is very technical
we like to explain the basic idea of the proof firstly.
Let us suppose that the supervisor has a constant level of noise $p\in [0,1/2)$.
Moreover, we assume that we have
an induced function $\langle w^*,\P(.)\rangle$
which has the constant values $1-2p$, resp. $-(1-2p)$ on $B_1(P)$, resp. $B_{-1}(P)$.
Now let us take a ``representative'' training set $T$ of length $n$.
Then one easily checks (cf.\ estimate (\ref{above})) that
$$
\langle w_T^{k,c/n},w_T^{k,c/n}\rangle + \frac c n\sum_{l=1}^n \x_l^2 \ \
\lesssim
\ \
\langle w^*,w^*\rangle + 4cp(1-p)
$$
holds. Here $\lesssim$ means that the relation $\leq$ only holds ``approximately''.
On the other hand, by the continuity of the decision function $w_T^{k,c/n}$
a misclassified (compared with the optimal Bayes decision rule)
element $z$ forces the sum of those slack variables,
which belong to samples in the ``neighbourhood'' of $z$, to be
``approximately'' greater than their
cardinality (cf.\ inequality (\ref{below2})).
Conversely, for a correctly classified element the corresponding
sum of the slack variables is ``approximately'' larger than $4p(1-p)$ times their
cardinality (cf.\ inequality (\ref{below3})).
Combining these considerations we obtain
\begin{eqnarray*}
c(1-2p)^2P_X(E)+4cp(1-p) & = & c\bigl(P_X(E) + 4p(1-p)P_X(X\setminus E)\bigr)\\
& \lesssim &
 \langle w_T^{k,c/n},w_T^{k,c/n}\rangle + \frac c n\sum_{l=1}^n \x_l^2 \\
& \lesssim &
\langle w^*,w^*\rangle + 4cp(1-p) \ ,
\end{eqnarray*}
where $E$ denotes the set of misclassified (compared with the optimal Bayes decision rule) elements.
Thus $P_X(E)$ must be ``small'' if we have chosen $c$ ``large enough''. 

The difficulty of the proof below is firstly, to transfer the idea to the general case
and secondly, to give exact formulations of ``representative'', ``neighbourhood'' and
``approximately''.\\



\begin{proofo}{of Theorem \ref{HS1}} Without loss of generality we may assume $\e\in(0,1]$.
Let $X^0:= X\setminus X^+$ be the deterministic part of the supervisor and
$X_{-1}^0 := X^0 \cap B_{-1}(P)$, $X_{1}^0 := X^0 \cap B_{1}(P)$ be the parts of the classes
$B_{-1}(P)$, $B_1(P)$
in $X^0$. Furthermore, let $X_{-1}^+ := X^+ \cap B_{-1}(P)$ and $X_{1}^+ := X^+ \cap B_{1}(P)$
be the parts of the classes
$B_{-1}(P)$ and $B_1(P)$
in $X^+$.
We define
$\d^* := \min\{2p^*,\frac \e {74}q^*  (1-2q^*)\}$ and fix $\d\leq \d^*$.
Since $P_X$ is regular \citep[cf.][p.~176]{Dud}
there exist compact subsets
$K_i^j$ of $X_i^j$ with $P_X(X_i^j \setminus K_i^j) \leq \d /4$ for $\iin$ and $\jin$.
Moreover, for a fixed feature map $\P:X\to H$ of $k$ Proposition \ref {approxstep}
ensures the existence of an element $w^*\in H$ with
\begin{equation*}
\langle w^*,\P(x) \rangle \quad\in\quad
    \begin{cases}
        [1,1+\d] & \mbox{ if } x\in K_1^0 \\
    [-(1+\d),-1] & \mbox{ if } x\in K_{-1}^0 \\
        [1-2p^*-\d, 1-2p^*] & \mbox{ if } x\in K_1^+ \\
    [-(1-2p^*),-(1-2p^*-\d)] & \mbox{ if } x\in K_{-1}^+ \\
        [-(1+\d),1+\d] & \mbox{ otherwise. }
        \end{cases}
\end{equation*}
We define $c^* := \frac 2 {\e(1-2q^*)} \xnorm{w^*} 2 2$ and for fixed $c\geq c^*$ let
$\s := \frac \d{\sqrt c}$. By Lemma \ref{part} we then obtain partitions $\tilde {\ca P}_i^j$ of
$K_i^j$ with $\diam_{d_k}(A) \leq \s$ for all $A\in \tilde {\ca P}_i^j$ and
$$
\Bigl|\bigcup_{\substack{\iin\\ \jin}} \tilde {\ca P}_i^j\Bigr| \Leq \ca N\bigl((X,d_k),\s\bigr)\ = \ M\ .
$$
Let $\ca P_i^j := \{ A \in \tilde {\ca P}_i^j : P_X(A) \geq \frac \d {q^*M}\}$ for $\iin$ and $\jin$.
To construct ``representative'' training sets we define
\begin{align*}
F_{n,A} & :=  \Bigl\{\bigl((x_1,y_1),\dots,(x_n,y_n)\bigr) \ : \
\bigl|\{l:x_l\in A\}\bigr|
\geq \Bigl(P_X(A)-\frac \d M\Bigr)\, n\Bigr\}
\intertext{for all $A\in \ca P_i^j$, $\iin$ and $\jin$. Moreover, for $A\in \ca P_i^+$, $\iin$ let}
F^+_{n,A} & :=  \Bigl\{\bigl((x_1,y_1),\dots,(x_n,y_n)\bigr) \ : \
\bigl|\{l:x_l\in A \mbox{ and } y_l=i\}\bigr|
\geq \Bigl((1-p^*)P_X(A)-\frac \d M\Bigr)\, n\Bigr\}\\
F^-_{n,A} & :=  \Bigl\{\bigl((x_1,y_1),\dots,(x_n,y_n)\bigr) \ : \
\bigl|\{l:x_l\in A \mbox{ and } y_l\neq i\}\bigr|
\geq \Bigl(q^*P_X(A)-\frac \d M\Bigr)\, n\Bigr\}\\
F_n & :=  \bigcap_{A\in {\cal P}^0_{-1}\cup {\cal P}^0_1} F_{n,A}
\quad \cap
\bigcap_{A\in {\cal P}^+_{-1}\cup {\cal P}^+_1} \bigl(F_{n,A} \cap F_{n,A}^+ \cap F_{n,A}^-\bigr)\ .
\end{align*}
Lemma \ref{prob} yields $P^n(F_n) \geq 1- 3M e^{-2 \left(\frac \d M\right)^2 n}$ and thus it suffices
to show that
$$
\ca R_P
(f^{2,k,c/n}_{T})\leq \ca R_P + 4 \frac{p^*-q^*}{1-2q^*}P_X(X^+) + \e
$$
holds for all $T\in F_n$.
Therefore, let us fix a training set $T=\bigl((x_1,y_1),\dots,(x_n,y_n)\bigr)\in F_n$.
For $c_n:= c/n$ we denote the solution of (\ref{prim2sno})
by $(w_T,b_T,\xi_T)$. Our first step is to estimate $\ca W(w_T,b_T,\xi_T)$
from above by comparing it with $\ca W(w^*,0,\xi^*)$, where $\xi^*$ is an admissible slack
vector of $(w^*,0)$. Hence we have to construct $\xi^*$. For this let us first assume that we have a
sample $(x_l,y_l) \in K_1^0$. Then we observe that
$$
y_l\langle w^*,\P(x_l)\rangle \Eq \langle w^*, \P(x_l)\rangle \Geq 1
$$
and thus we may define $\xi_l^*:= 0$. Analogous considerations yield
\begin{equation*}
 \xi_l^*  \quad\Defi\quad
    \begin{cases}
        0 & \mbox{ if } x_l\in K_i^0 \\
                2p^*+\d & \mbox{ if } x_l\in K_i^+, y_l=i \\
                2-2p^* & \mbox{ if } x_l\in K_i^+, y_l\neq i \\
        2+\d & \mbox{ otherwise. }
        \end{cases}
\end{equation*}
Moreover, our construction of $F_n$ guarantees that there are at most
$$
n - n\sum_{\substack{\iin \\ \jin}} \sum_{A\in {\cal P}_i^j} \Bigl(P_X(A)-\frac \d M\Bigr)
\Leq n \Bigl( 1-\sum_{\substack{\iin \\ \jin}} P_X(X_i^j) +\Bigl(2+ \frac 1 {q^*}\Bigr)\d\Bigr)
\Eq \frac 2 {q^*}\d n
$$
samples which are not elements of a suitable $K_i^j$. Furthermore, since there are at most
$$
n-n\sum_{\iin} \sum_{A\in {\cal P}_i^0} \Bigl(P_X(A)-\frac \d M\Bigr)
\Leq n\Bigl(P_X(X^+) +\frac 2 {q^*}\d\Bigr)
$$
samples in $K^+:= K_1^+\cup K_{-1}^+$ we also obtain that there are at most
$$
n\Bigl(P_X(X^+) +\frac 2 {q^*}\d\Bigr) -
n\sum_{\iin} \sum_{A\in {\cal P}_i^+} \Bigl((1-p^*)P_X(A)-\frac \d M\Bigr)
\Leq n \Bigl(p^* P_X(X^+) + \frac 4 {q^*}\d\Bigr)
$$
samples in $K^+$ which have incorrect labels. Since these have larger slack variables with respect to
$(w^*,0)$ than
the correctly labeled samples in $K^+$ we obtain
\begin{eqnarray}\nonumber \label{above}
\!\!\!\!&&\!\! \ca W(w_T,b_T,\xi_T)\\ \nonumber \!\!\!\!& \leq &\!\! \ca W(w^*,0,\xi^*) \\ \nonumber
\!\!\!\!&\leq & \!\!\langle w^*,w^*\rangle + \frac c n \sum_{\iin} \sum_{\substack{x_l\in K_i^+\\y_l=i}} (\xi_l^*)^2
+ \frac c n \sum_{\iin} \sum_{\substack{x_l\in K_i^+\\y_l\neq i}} (\xi_l^*)^2 + 2\d c(2+\d)^2 \\ \nonumber
\!\!\!\!&\leq &\!\! \langle w^*,w^*\rangle + c (1-p^*)P_X(X^+) (2p^*+\d)^2 +
c\Bigl(p^* (P_X(X^+) + \frac 4 {q^*}\d\Bigr) (2-2p^*)^2 +   2 \d c(2+\d)^2 \\
\!\!\!\!&\leq &\!\! \langle w^*,w^*\rangle + c \Bigl( 4p^*(1-p^*)P_X(X^+) + \frac {27} {q^*}\d\Bigr)\ .
\end{eqnarray}
For later purposes we note that we also have
$\ca W(w_T,b_T,\xi_T) \leq \ca W\bigl(0,0,(1,\dots,1)\bigr) \leq c$ and thus $\Norm{w_T}_2 \leq \sqrt c$.
In the second step of the proof we estimate $\ca W(w_T,b_T,\xi_T)$ from below. For this let us denote
the set of misclassified points in $X_i^j$ by
$$
E_i^j \Defi \{x\in X_i^j : f_T(x) \neq i\} \ .
$$
For brevity's sake we also write $E^j := E^j_1\cup E_{-1}^j$ and $E:= E^0 \cup E^+$.
Let us first consider an $A\in \ca P_i^0$ with $A\cap E\neq \emptyset$. Without loss
of generality we may assume that $i=1$. Then for $x_l\in A$ and $z\in A\cap E$ we obtain
\begin{eqnarray*}
1-\x_T(l)  &\leq& y_l\bigl(\langle  w_T, \P(x_l)\rangle +  b_T\bigr)\\
&= &  \langle  w_T,\P(x_l)-\P(z)\rangle  +
\langle  w_T,\P(z)\rangle +  b_T\\
&\leq & \snorm{ w_T}\ d_k(x_l,z)\\
 &\leq& \d ,
\end{eqnarray*}
i.e. $\xi_T^2(l) \geq (1-\d)^2 \geq 1-2\d$. Since the same estimate holds
in the case $i=-1$ our construction of $F_n$ implies
\begin{equation}\label{below1}
\frac 1 n \sum_{\iin}\sum_{\substack{A\in {\cal P}_i^0 \\ A\cap E\neq \emptyset}}
\sum_{x_l\in A} \xi_T^2(l) \Geq
\sum_{\iin}\sum_{\substack{A\in {\cal P}_i^0 \\ A\cap E\neq \emptyset}}
(1-2\d)\Bigl(P_X(A)-\frac \d M\Bigr) \Geq P_X(E^0) - \frac 3 {q^*}\d\ .
\end{equation}
Now let us consider an $A\in \ca P_i^+$ with $A\cap E\neq \emptyset$.
Without loss
of generality we may assume that $i=1$ again. Then for fixed $z\in A\cap E$ and
$a:= -\bigl(\langle w_T,\P(z)\rangle + b_T\bigr)\geq 0$ we obtain
\begin{equation*}
 \xi_T(l)  \quad\Geq\quad
    \begin{cases}
        1-\d+a & \mbox{ for } x_l\in A \mbox{ with } y_l=1\\
        \max\{0,1-\d-a\} & \mbox{ for } x_l\in A \mbox{ with } y_l=-1\\
        \end{cases}
\end{equation*}
analogously to the above considerations. We first treat the case $1-\d-a\geq 0$.
Since there are at least $\bigl(P_X(A) - \d/M\bigr)n$ samples in $A$ and at least
$\bigl((1-p^*)P_X(A) - \d/M\bigr)n$ correctly labeled samples in $A$
we get
\begin{eqnarray*}
\frac 1 n \sum_{x_l\in A} \xi_T^2(l) & \geq&
(1-\d+a)^2 \Bigl((1-p^*)P_X(A)-\frac \d M\Bigr) +
(1-\d-a)^2 p^* P_X(A) \\
& \geq & P_X(A) \bigl((1-\d+a)^2(1-p^*) + (1-\d-a)^2p^*\bigr) - \frac {4\d}M\ .
\end{eqnarray*}
Now an easy minimization with respect to $a\in [0,1-\d]$ yields
$$
\frac 1 n \sum_{x_l\in A} \xi_T^2(l) \Geq
P_X(A) \bigl((1-\d)^2(1-p^*) + (1-\d)^2p^*\bigr) - \frac {4\d}M \Geq
(1-2\d)P_X(A) -\frac {4\d}M \ .
$$
On the other hand, if $1-\d-a < 0$ we have $1-\d+a > 2-2\d$ and thus the same inequality
follows:
$$
\frac 1 n \sum_{x_l\in A} \xi_T^2(l) \Geq
(1-\d+a)^2 \Bigl((1-p^*)P_X(A)-\frac \d M\Bigr) \Geq (1-2\d)P_X(A) -\frac {4\d}M\ .
$$
Therefore, we obtain
\begin{equation}\label{below2}
\frac 1 n \sum_{\iin}\sum_{\substack{A\in {\cal P}_i^+ \\ A\cap E\neq \emptyset}}
\sum_{x_l\in A} \xi_T^2(l) \Geq
\sum_{\iin}\sum_{\substack{A\in {\cal P}_i^+ \\ A\cap E\neq \emptyset}}
\Bigl((1-2\d)P_X(A)-\frac {4\d} M\Bigr) \ .
\end{equation}
Finally, an analogous consideration yields
\begin{equation}\label{below3}
\frac 1 n \sum_{\iin}\sum_{\substack{A\in {\cal P}_i^+ \\ A\cap E= \emptyset}}
\sum_{x_l\in A} \xi_T^2(l) \Geq
\sum_{\iin}\sum_{\substack{A\in {\cal P}_i^+ \\ A\cap E=\emptyset}}
\Bigl((1-2\d)4q^*(1-q^*)P_X(A)-\frac {4\d} M\Bigr) \ .
\end{equation}
Now, the estimates (\ref{below2}) and (\ref{below3}) imply
\begin{eqnarray*}
\!\!\!\!&& \!\!\frac 1 n \sum_{\iin}\sum_{\substack{A\in {\cal P}_i^+ \\ A\cap E\neq \emptyset}}
\sum_{x_l\in A} \xi_T^2(l) +
\frac 1 n \sum_{\iin}\sum_{\substack{A\in {\cal P}_i^+ \\ A\cap E= \emptyset}}
\sum_{x_l\in A} \xi_T^2(l) \\
\!\!\!\!& \geq &\!\!
\sum_{\iin}\sum_{\substack{A\in {\cal P}_i^+ \\ A\cap E\neq \emptyset}}
\Bigl((1-2\d)P_X(A)-\frac {4\d} M\Bigr) +\!\!
\sum_{\iin}\sum_{\substack{A\in {\cal P}_i^+ \\ A\cap E=\emptyset}}
\Bigl((1-2\d)4q^*(1-q^*)P_X(A)-\frac {4\d} M\Bigr)\\
\!\!\!\!& \geq &\!\!
\sum_{\iin}\sum_{\substack{A\in {\cal P}_i^+ \\ A\cap E\neq \emptyset}}
(1-2\d)(1-2q^*)^2 P_X(A) +
\sum_{\substack{\iin \\ A\in {\cal P}_i^+ }}
(1-2\d)4q^*(1-q^*)P_X(A) - 4\d\\
\!\!\!\!& \geq &\!\!
(1-2\d)(1-2q^*)^2 P_X(E^+) + (1-2\d)4q^*(1-q^*) P_X(X^+) - 6\d - \frac 2 {q^*}\d\\
\!\!\!\!& \geq &\!\!
(1-2q^*)^2 P_X(E^+) + 4q^*(1-q^*) P_X(X^+) - \frac 7 {q^*}\d\ .
\end{eqnarray*}
The latter inequality together with (\ref{below1}) yields
$$
\frac 1 n \sum_{l=1}^n \xi_T^2(l)
\geq P_X(E^0) + (1-2q^*)^2 P_X(E^+) + 4q^*(1-q^*) P_X(X^+) - \frac {10} {q^*}\d\ .
$$
Combining this estimate with (\ref{above}) we now obtain
\begin{eqnarray}\nonumber\label{togeth1}
&&\langle w^*,w^*\rangle\\ & \geq &
c\, \Bigl(P_X(E^0) + (1-2q^*)^2 P_X(E^+) +
4q^*(1-q^*) P_X(X^+) - 4p^*(1-p^*)P_X(X^+) - \frac {37} {q^*}\d\Bigr)\nonumber\\
& \geq & c^* \Bigl(P_X(E^0) + (1-2q^*)^2 P_X(E^+)
- 4(p^*-q^*)P_X(X^+) - \frac {37} {q^*}\d\Bigr) \ .
\end{eqnarray}
Moreover, a simple calculation shows
$$
\ca R_P(f_T^{2,k,c/n}) \Eq \ca R_P + \Int_E (1-2p)\, dP_X \Leq
\ca R_P + \frac 1 {1-2q^*} \Bigl( P_X(E^0) + (1-2q^*)^2P_X(E^+)\Bigr)
$$
and thus
$P_X(E^0) + (1-2q^*)^2P_X(E^+) \geq (1-2q^*) (\ca R_P(f_T^{2,k,c/n}) -\ca R_P)$.
With this and (\ref{togeth1}) we find
$$
\langle w^*,w^*\rangle \Geq \frac {2\, \langle w^*,w^*\rangle}{\e(1-2q^*)}
\Bigl( (1-2q^*) \bigl(\ca R_P(f_T^{2,k,c/n})-\ca R_P\bigr) - 4(p^*-q^*)P_X(X^+) - \frac {37} {q^*}\d\Bigr)\ .
$$
The latter inequality finally implies
\begin{eqnarray*}
\ca R_P(f_T^{2,k,c/n}) - \ca R_P &\leq&
\frac \e 2 + 4 \frac {p^*-q^*}{1-2q^*} P_X(X^+) + \frac {37}{(1-2q^*)q^*} \,
\frac {\e(1-2q^*)q^*}{74} \\ & = &
4 \frac {p^*-q^*}{1-2q^*} P_X(X^+) + \e\ .
\end{eqnarray*}
Thus the assertion follows.
\end{proofo}
\newpage


\noindent We have to prove the remaining lemmas now. We begin with the lemma that constructs the
partitions $\ca P_i^j$ of the above proof:

\begin{lemma}\label{part}
Using the notations of the proof of Theorem \ref{HS1} there exist partitions $\ca P_i^j$ of
$K_i^j$, $\iin$, $\jin$, such that $\diam_{d_k}(A) \leq \s$ for all $A\in \ca P_i^j$, $\iin$, $\jin$,
and
\begin{equation}\label{cardpart}
\Bigl|\bigcup_{\substack{\iin\\ \jin}} \ca P_i^j\Bigr| \Leq \ca N\bigl((X,d_k),\s\bigr)\ .
\end{equation}
\end{lemma}

\begin{proof} By the definition of the covering numbers there exists a partition $\ca P$ of $X$ with
$\diam_{d_k}(A) \leq \s$ for all $A\in \ca P$ and $|\ca P| \leq \ca N \bigl((X,d_k),\s\bigr)$.
Let us define $\tilde{\cal P}_i^j := \{ A\in \ca P: A\cap K_i^j \neq \emptyset\}$
and $\ca P_i^j := \{ A\cap K_i^j : A \in \tilde{\cal P}_i^j\}$.
Therefore, to prove
(\ref{cardpart}) we have to show that the $\tilde {\cal P}_i^j$'s are mutually disjoint.
Assume the converse,
i.e. there exists $A\in \tilde {\cal P}_i^j \cap \tilde {\cal P}_{i'}^{j'}$
with $i\neq i'$ or $j \neq j'$.
By the definition of the partitions there exist $z_1,z_2\in A$ with $z_1\in K_i^j$ and
$z_2\in K_{i'}^{j'}$. Now on the one hand, we obtain
$$
|\langle w^*,\P(z_1)-\P(z_2)\rangle| \Leq \Norm{w^*} d_k(z_1,z_2) \Leq
\Norm{w^*} \s \Leq \Norm{w^*} \frac \d{\sqrt{c^*}}\ <\ \d
$$
but on the other hand we also have
$$
|\langle w^*,\P(z_1)-\P(z_2)\rangle| \Eq |\langle w^*,\P(z_1)\rangle - \langle w^*,\P(z_2)\rangle|
\Geq 1-(1-2p^*) \Geq \d^* \Geq \d\ .
$$
Therefore the assertion follows.
\end{proof}



\begin{lemma}\label{prob}
Using the notations of the proof of Theorem \ref{HS1} we have
$$
P^n(F_n) \geq 1- 3M e^{-2 \left(\frac \d M\right)^2 n}\ .
$$
\end{lemma}

\begin{proof} Let us recall Hoeffding's inequality \citep[cf.][Thm.~8.1]{DGL} which in particular
states that for all i.i.d.\ random variables $z_i:(\Om,\ca A,Q) \to \{0,1\}$ and all $\e>0$, $n\geq 1$
we have
$$
Q^n \Bigl( \sum_{i=1}^n z_i \leq n(q-\e)\Bigr) \Leq e^{-2\e^2n}\ ,
$$
where $q:= Q(z_i=1)$. Thus for $A\in \ca P_i^+$ we get
\begin{eqnarray*}
&&P^n(F_{n,A}^+)\\ & \geq &  P^n\Bigl(\!\Bigl\{\bigl((x_1,y_1),\dots,(x_n,y_n)\bigr)  :
\bigl|\{l:x_l\in A, y_l=i\}\bigr|
\geq \Bigl(\int_A(1-p)\, dP_X-\frac \d M\Bigr)\, n\Bigr\}\!\Bigl)\\
& \geq &
1-  P^n\Bigl(\!\Bigl\{\bigl((x_1,y_1),\dots,(x_n,y_n)\bigr)  :
\bigl|\{l:x_l\in A, y_l=i\}\bigr|
\leq \Bigl(\int_A(1-p)\, dP_X-\frac \d M\Bigr)\, n\Bigr\}\!\Bigl)\\
& \geq & 1- e^{-2\left(\frac \d M\right)^2n}\ ,
\end{eqnarray*}
i.e. $P^n(Z^n\setminus F_{n,A}^+) \leq e^{-2\left(\frac \d M\right)^2n}$, where $Z := X\times Y$.
Analogously, we obtain $P^n(Z^n\setminus F_{n,A}^-) \leq e^{-2\left(\frac \d M\right)^2n}$ for
all $A\in \ca P_i^+$ and $P^n(Z^n\setminus F_{n,A}) \leq e^{-2\left(\frac \d M\right)^2n}$
for
all $A\in \ca P_i^j$.
These estimates yield
\begin{eqnarray*}
&&P^n(F^n)\\ & = & P^n\Bigl(  \bigcap_{A\in {\cal P}^0_{-1}\cup {\cal P}^0_1} F_{n,A}
\quad \cap
\bigcap_{A\in {\cal P}^+_{-1}\cup {\cal P}^+_1} \bigl(F_{n,A} \cap F_{n,A}^+ \cap F_{n,A}^-\bigr) \Bigr)\\
& = & 1-
P^n\Bigl(  \bigcup_{A\in {\cal P}^0_{-1}\!\cup\!\!\!\!\!\! {\cal P}^0_1} \bigl(Z^n\setminus F_{n,A}\bigl)
\quad \cup\!
\bigcup_{A\in {\cal P}^+_{-1}\cup {\cal P}^+_1}\! \bigl(\bigl(Z^n\setminus F_{n,A}\bigl)
 \cup \bigl(Z^n\setminus F_{n,A}^+\bigl) \cup \bigl(Z^n\setminus F_{n,A}^-\bigl)\bigr)\!\Bigr)\\
& \geq & 1-
\sum_{\substack{A\in {\cal P}^j_{-1}\cup {\cal P}^j_1\\ \jin }} P^n\bigl(Z^n\setminus F_{n,A}\bigl)
- \sum_{\substack{A\in {\cal P}^+_{-1}\cup {\cal P}^+_1}} P^n\bigl(Z^n\setminus F_{n,A}^+\bigl)
- \sum_{\substack{A\in {\cal P}^+_{-1}\cup {\cal P}^+_1}} P^n\bigl(Z^n\setminus F_{n,A}^-\bigl)\\
& \geq & 1 -  3M e^{-2 \left(\frac \d M\right)^2 n}\ .
\end{eqnarray*}
\end{proof}



\noindent With the help of Theorem \ref{HS1} we can now investigate how to choose the parameter sequence
$(c_n)$ for a given universal kernel:




\begin{theorem}\label{HS2}
Let $(X,d)$ be a compact metric space and $k$ be a universal kernel on $X$
such that the covering numbers of $(X,d_k)$ fulfill
$\ca N\bigl((X,d_k),\e\bigr)\in \ca O(\e^{-\a})$ for some $\a>0$.
Suppose that we have a positive sequence $(c_n)$ with $(c_n)\in \ca O(n^{\b-1})$ for some
$0<\b<\frac 1 \a$ and $n c_n \to \infty$.
Then for all Borel probability measures $P$ on $X\times Y$ with $q^*,p^*\in (0,1/2)$ and all $\e>0$ we have
\begin{displaymath}
\lim_{n\to \infty}\opa \Bigl(\Bigl\{T\in (X\times Y)^n :  \ca R_P
(f^{2,k,c_n}_{T})\leq \ca R_P + 4 \frac{p^*-q^*}{1-2q^*}P_X(X^+) + \e\Bigr\}\Bigr)
\Eq  1\ .
\end{displaymath}
\end{theorem}

\begin{proof} Let $\g := \frac{1-\a\b}{4(1+\a)} > 0$ and $\d_n:= n^{-\g}$. By Theorem \ref{HS1}
there exist $c^*>0$ and $\d^*>0$ such that for all $c\geq c^*$, $0<\d\leq \d^*$ and all
$n\geq 1$ we have
\begin{displaymath}
\opa \Bigl(\Bigl\{T\in (X\times Y)^n :  \ca R_P
(f^{2,k,c/n}_{T})\leq \ca R_P + 4 \frac{p^*-q^*}{1-2q^*}P_X(X^+) + \e\Bigr\}\Bigr)
\Geq 1 - 3M e^{-2 \left(\frac \d M\right)^2 n}\ ,
\end{displaymath}
where $M := \ca N\bigl((X,d_k),\frac \d {\sqrt c}\bigr)$. Since $\d_n\to 0$ and
$nc_n\to \infty$ we may fix an $n_0\geq 1$ such that for all $n\geq n_0$ we have both
$\d_n\leq \d^*$ and $nc_n \geq c^*$. This yields
\begin{displaymath}
\opa \Bigl(\Bigl\{T\in (X\times Y)^n :  \ca R_P
(f^{2,k,c_n}_{T})\leq \ca R_P + 4 \frac{p^*-q^*}{1-2q^*}P_X(X^+) + \e\Bigr\}\Bigr)
\Geq 1 - 3M_n e^{-2 \left(\frac {\d_n} {M_n}\right)^2 n}\ ,
\end{displaymath}
where $M_n := \ca N\bigl((X,d_k),\frac {\d_n} {\sqrt {nc_n}}\bigr)$.
This implies $M_n\in \ca O(n^{\a(\g+\b/2)})$ and hence we easily check that
$M_n e^{- \left(\frac {\d_n} {M_n}\right)^2 n}\in \ca O(e^{-n^{2(1+\a)\g}})$ holds.
Thus the assertion follows. \end{proof}


\begin{corollary}
Under the assumptions of Theorem \ref{HS2} the 2-SMC with sequence $(c_n)$ is
consistent for all Borel probability measures $P$ on $X\times Y$ with $q^*=p^*< 1/2$.
In particular this holds for all Borel probability measures with constant noise level.
\end{corollary}

\begin{corollary}\label{Cor1}
Let $X\subset \R^d$ be compact and $k$ be a Gaussian RBF kernel on $X$. Let $0<\b<\frac 1 d$
and $(c_n)$ be a positive sequence with $(c_n)\in \ca O(n^{\b-1})$
and $n c_n \to \infty$. Then the 2-SMC with kernel $k$ and
sequence $(c_n)$ is consistent for all Borel probability measures $P$ on $X\times Y$ with $q^*=p^*< 1/2$.
\end{corollary}

\begin{proof} Let $\s>0$ and $k(x,y) := \exp(-\s^2\xnorm{x-y} 2 2)$. Since $1-e^{-t} \leq t$
for all $t\geq 0$ we observe
$$
d_k(x,y) \Eq \sqrt{2-2\exp(-\s^2\xnorm{x-y} 2 2)} \Leq
\sqrt 2 \s \ynorm{x-y} 2 \ .
$$
This yields $\ca N\bigl((X,d_k),\e\bigr)
\leq \ca N\bigl((X,\ynorm . 2),\frac \e{\sqrt 2 \s}\bigr)$ and thus
$\ca N\bigl((X,d_k),\e\bigr)\in \ca O(\e^{-d})$ \citep*[cf.][p.~9]{C-S}.\end{proof}




\noindent For the classification problems we have considered up to now we usually may not expect that
we obtain a large margin for sample sizes growing to infinity. In the following we restrict
ourselves to distributions that guarantee a fixed and strictly positive margin for all
training sets. Of course, these classification problems must have a deterministic
supervisor, i.e.\ a noise level that vanishes (almost) everywhere. In general, additional
assumptions are required. Using universal kernel these reduce to a simple geometric condition:



\begin{theorem}\label{HS3}
Let $(X,d)$ be a compact metric space and $k$ be a universal kernel on $X$.
Suppose that we have a Borel probability measure $P$ on $X\times Y$ with a deterministic supervisor and with classes
$B_{-1}(P),B_1(P)$ which have strictly positive distance, i.e.
$d\bigl(B_{-1}(P),B_1(P)\bigr)>0$. Then $k$ separates $B_{-1}(P)$ and $B_1(P)$ with margin $\g>0$ and for all
$c>0$, $\e>0$ and $n\geq mM$ we have
\begin{displaymath}
\opa\bigl(\{T\in (X\times Y)^n :  \ca R_{P,S}
(f^{2,k,c}_{T}) \leq \e \}\bigr) \Geq 1-M\ e^{-\frac {\e n}{2 M}+m}\ .
\end{displaymath}
Here, $M:= \ca N\bigr((X,d_k),\g/2\bigl)$ is the covering number
of $(X,d_k)$ and $m:=\bigl\lfloor \frac 4 {c\g^2} \bigr\rfloor + 1$.
\end{theorem}



\begin{proof} Since $(X,d_k)$ is precompact and $d\bigl(B_{-1}(P),B_1(P)\bigr)>0$
both $B_i(P)$ are compact, too. Thus
they can be separated with margin $\g>0$ by Corollary \ref{sepcpct}. In analogue to Lemma \ref{part}
we now construct partitions $\tilde{\ca  P}_i$ of $B_i(P)$, $\iin$ such that
$\diam_{d_k}(A) \leq \g/2$ for all $A\in \tilde{\ca  P}_i$ and
$\bigl|\tilde{\ca  P}_{-1} \cup \tilde{\ca  P}_1\bigr| \leq M$.
We define
${\cal P}_i := \{A\in \tilde{{\cal P}}_i: P_X(A) >\frac \e M\}$
for $\iin$.
Moreover, for $n\geq mM$ and
$A\in {\cal P}_{-1}\cup {\cal P}_1$ let
\begin{eqnarray*}
F_{n,A} & := &  \Bigl\{\bigl((x_1,y_1),\dots,(x_n,y_n)\bigr)\in (X\times Y)^n \ : \
\bigl|\{l:x_l\in A\}\bigr| \geq m\Bigr\}\\
F_n & := & \bigcap_{A\in {\cal P}_{-1}\cup {\cal P}_1} F_{n,A}\ .
\end{eqnarray*}
For $n\geq mM$ the Chernoff-Okamoto inequality \citep[see e.g.][]{Dud2} then yields
\begin{eqnarray*}
P^n\bigl((X\times Y)^n\setminus F_{n,A}\bigr)
&\leq &
\exp\Bigl(- \frac{\bigl(n \frac \e M - (m-1)\bigr)^2}{2n\frac \e M\bigl(1-\frac \e M\bigr)}\Bigr)\\
& \leq &
\exp\Bigl(- \frac{n^2\bigl(\frac \e M\bigr)^2 - 2 n \frac \e M (m-1) + (m-1)^2}{2n\frac \e M}\Bigr)\\\
& \leq & \exp\bigl(- \frac {\e n}{2 M}+ m\bigr)
\end{eqnarray*}
and thus $P^n(F_n) \geq 1-M\ e^{-\frac {\e n}{2 M}+m}$ for all $n\geq mM$.
Hence it suffices to show that for all
$T=\bigl((x_1,y_1),\dots,(x_n,y_n)\bigr) \in F_n$ the decision function $f^{2,k,c}_T$
classifies $\bigcup_{A \in {\cal P}_1} A$ and $\bigcup_{A\in {\cal P}_{-1}} A$
correctly. To see this we fix a feature map $\P:X\to H$ of $k$.
For brevity's sake the unique solution of problem
(\ref{prim2sno}) is denoted by $(w_T,b_T,\xi_T)$.
Furthermore, $(w^*,b^*)\in S_H\times \R$ is a hyperplane that separates
$\P\bigl(B_1(P)\bigr)$ and $\P\bigl(B_{-1}(P)\bigr)$ with margin $\g>0$.
Then we have
$y_l\bigl(\langle{w^*},\P(x_l)\rangle +{b^*}\bigr)\geq \g$
for all $l=1,\dots,n$ and therefore we obtain
\begin{displaymath}
\langle w_T,w_T\rangle + c\sum_{l=1}^n\x_l^2 \Leq
\Bigl\langle \frac {w^*}\g,\frac {w^*}\g\Bigr\rangle \Eq \frac 1 {\g^2}\ .
\end{displaymath}
In particular this implies $\snorm{w_T}\leq  1/ \g$.
Now, let us suppose that there exists a misclassified point $z$. Without loss of
generality we may assume that $z\in \bigcup_{A \in {\cal P}_1} A$.
Hence there
is an $A\in {\cal P}_1$ with $z\in A$ and for this there exist mutually
different indexes $l_1,\dots,l_m$ such that $x_{l_j}\in A$.
In particular this yields $d_k(x_{l_j},z)\leq  \g/ 2$ for all $j=1,\dots,m$.
Since $\langle w_T,\P(z)\rangle +b_T \leq 0$ we thus obtain
$$
1-\x_{l_j}  \Leq  \langle w_T,\P(x_{l_j})\rangle +b_T  \Eq
\langle w_T,\P(x_{l_j})-\P(z)\rangle + \langle w_T,\P(z)\rangle +b_T
\Leq  \snorm{w_T}\ d_k(x_{l_j},z)
\Leq \frac 1 2 .
$$
Hence we have $\x_{l_j}\geq 1/2$ and this leads to the contradiction
$$
\frac 1 {\g^2} \Geq \langle w_T,w_T\rangle + c\sum_{l=1}^n\x_l^2
\Geq c \sum_{j=1}^m \x_{l_j}^2 \Geq \frac{cm} 4
\Eq \frac c 4 \Bigl(\Bigl \lfloor \frac 4 {c\g^2} \Bigr\rfloor + 1   \Bigr)
\ > \ \frac  1 {\g^2} \ .
$$
Therefore the assertion follows. \end{proof}

\noindent Theorem \ref{HS3} shows that the 2-SMC works well with fixed weight factor $c$ whenever
it treats a classification problem that ensures a large margin. We believe that these
distributions are also the only ones for which a fixed $c$ is suitable. Our conjecture is based on the
observation that the constant $c$ controls the Lipschitz constant of the solution of
(\ref{prim2sno}) with respect to the metric $d_k$:
if we have a classification problem that does not guarantee a large margin the Lipschitz constant
may grow like $n$. The proofs of this section indicate that this may be too fast
since for large sample sizes the solution need not be ``almost'' constant on each element
of the partitions, i.e.
overfitting may occur.

In the proof of the above theorem we only used elements of the partitions $\ca P_i$
whose probability was larger than or equal to $\frac \e M$. If we extend our considerations
to all elements with strictly
positive probability we obtain the following theorem:




\begin{theorem}\label{HS4}
Let $(X,d)$ be a compact metric space and $k$ a universal kernel on $X$.
Suppose that we have a Borel probability measure $P$ on $X\times Y$ with a deterministic supervisor
and with classes
$B_{-1}(P),B_1(P)$ which have strictly positive distance, i.e.
$d\bigl(B_{-1}(P),B_1(P)\bigr)>0$. Then $k$ separates $B_{-1}(P)$ and $B_1(P)$ with margin $\g>0$ and for all
$c>0$ there exists a constant $\rho > 0$ such that for all
$n\geq \bigl(\bigl\lfloor \frac 4 {c\g^2} \bigr\rfloor + 1\bigr)\ca N\bigr((X,d_k),\g/2\bigl) $
we have
\begin{displaymath}
\opa\bigl(\{T\in (X\times Y)^n :  \ca R_{P,S}
(f^{2,k,c}_{T}) = 0 \}\bigr) \Geq 1- e^{-\rho n}\ ,
\end{displaymath}
\end{theorem}



\noindent Up to now we have only treated universal kernels. One may ask whether other classes
of kernels are also suitable to treat with the classification problems considered
in this work. One type often used are polynomial kernels of the form $(\langle .,.\rangle +c)^m$,
$c\geq 0$, $m\in \N$, on a subset $X$ of $\Rn$. For these kernels the functions generated
by the 2-SMC are polynomials on $X$ of degree up to $m$.  Thus the next proposition shows
that these kernels are not even capable to solve the problems of Theorems \ref{HS3} and
\ref{HS4}:




\begin{proposition}\label{Pro1}
Let ${\cal P}_n^d$ be the set of all polynomials on $X:=[0,1]^d$ whose degree is less than $n+1$.
Then for all $\e>0$ there exists a Borel probability measure $P$ on $X\times Y$ with
$d\bigl(B_1(P),B_{-1}(P)\bigr)>0$, $\ca R_{P}=0$ and
$$
\inf\{\ca R_{P}(\sign f) \ : \ f\in {\cal P}_n^d\} \Geq  \frac 1 2 -\e\ .
$$
\end{proposition}



\begin{proof} We first treat the case $d=1$. We fix an integer $m\geq  (3n+2)/\e$
and let $I_i:= [(i+\e)/ m,(i+1-\e)/m]$, $i=0,\dots,m-1$.
Denoting the Lebesgue measure on $I_i$ by $\lb_{I_i}$ let
$P_X:= ({1-2\e})^{-1}\sum_{i=0}^{m-1}\lb_{I_i}$.
Moreover, we define a
deterministic supervisor $P(.|.)$ by $P(y=1|x) := 1$ for $x\in I_{2i}$,
$i=0,\dots,\lfloor ( {m+1})/ 2\rfloor$, and $P(y=-1|x) := 1$ otherwise.
For a fixed polynomial $f\in {\cal P}_n^d$ we denote its mutually different and ordered
zeroes in $(0,1)$ by $x_1< \ldots <x_k$, $k\leq n$.
For brevity's sake let $x_0:=0$ and $x_{k+1}:=1$. Finally, we define
$a_j := \bigl|\bigl\{ i:
I_i \subset [x_j,x_{j+1}] \bigr\}\bigr|$
for $j=0,\dots,k$. Then by an easy observation we get
$\sum_{j=0}^k a_j \Geq m-k \Geq m-n$.
Moreover, at most $\lfloor( {a_j+1})/ 2\rfloor$
intervals $I_i$ are correctly classified on $[x_j,x_{j+1}]$ by the function $\sign f$.
Hence at least
$$
\sum_{j=0}^k \Bigl\lfloor \frac {a_j} 2 \Bigr\rfloor \geq
\sum_{j=0}^k \Bigl( \frac {a_j} 2 -1\Bigr) \geq
\frac{m-n} 2 - (k+1) \geq \frac{m-3n-2} 2
$$
intervals $I_i$ are not correctly classified on $[0,1]$ by $\sign f$.
Since $P_X(I_i)= 1/m$ we thus obtain
$$
\ca R_{P} (\sign f) \Geq \frac 1 m\sum_{j=0}^k \Bigl\lfloor \frac {a_j} 2 \Bigr\rfloor \Geq
 \frac 1 2 -\frac{3n+2} m \Geq \frac 1 2 -\e\ .
$$
To treat the case $d>1$ we have to embed $[0,1]$ into $[0,1]^d$
via $t\mapsto (t,0,\dots,0)$. Considering the above distribution $P$ embedded
into $[0,1]^d$ we then get the assertion since polynomials in $d$
variables on $[0,1]$ embedded into $[0,1]^d$ as above are essentially polynomials
in one variable. \end{proof}





\section{The 1-norm soft margin classifier}\label{1smc}

We now consider the 1-norm soft margin classifier. Again we fix a
kernel $k$ on $X$ with feature map
$\P:X\to H$. For a training set
$T = ((x_1,y_1),\dots,(x_n,y_n)) \in (X\times Y)^n$ and $c_n>0$
we denote a
solution of the optimization problem
\begin{equation}\label{prim1sno}
\begin{array}{rll}
\!\!\!\!\!\!\mbox{minimize} & \qquad
 \langle w,w\rangle + c_n\Sum_{i=1}^n\x_i   & \qquad \mbox{over } w,b,\x\\
\!\!\!\!\!\!\mbox{subject to} & \qquad y_i(\langle w,\P(x_i)\rangle +b) \geq 1-\x_i, & \qquad
i=1,\dots,n\\
& \qquad \x_i \geq 0, & \qquad i=1,\dots,n
\end{array}
\end{equation}
by $(w_{T}^{1,k,c_n},b_T^{1,k,c_n})\in H\times \R$. An algorithm $\ca C_k^{1,(c_n)}$
that provides the decision function
$$
f_T^{1,k,c}(x):= \sign(\langle w_T^{1,k,c_n},\P(x)\rangle +b_T^{1,k,c_n})\ , \qquad \qquad x\in X
$$
for every training set $T$
is called a 1-norm soft margin classifier (1-SMC) with kernel $k$ and parameter sequence
$(c_n)$. For an introduction to the 1-SMC
as well as for implementation techniques we refer to \citet*[][Ch.~6 and 7]{CST} and
\citet[][Ch.~10]{Vap}

\citet*{BC} proved that in general the optimization problem (\ref{prim1sno})
has no unique solution. Although the 1-SMC only has to construct
an arbitrary {\em solution} we show in this section that it enjoys all the properties
of the 2-SMC proven in this work. We begin with a statement which is analogous
to Theorem \ref{HS1}:



\begin{theorem}\label{HS5}
Let $(X,d)$ be a compact metric space and $k$ be a universal kernel on $X$.
Then for all Borel probability measures $P$ on $X\times Y$ with $q^*,p^*\in (0,1/2)$
and all $\e>0$ there exist
$c^*>0$ and $\d^*>0$ such that for all $c\geq c^*$, $0<\d\leq\d^*$ and all $n\geq 1$ we have
\begin{displaymath}
\opa \Bigl(\Bigl\{T\in (X\times Y)^n :  \ca R_P
(f^{1,k,c/n}_{T})\leq \ca R_P + 4 \frac{p^*-q^*}{1-2q^*}P_X(X^+) + \e\Bigr\}\Bigr)
\Geq 1 - 3M e^{-2 \left(\frac \d M\right)^2 n}\ ,
\end{displaymath}
where $M := 4 \ca N\bigl((X,d_k),\frac \d {\sqrt c}\bigr)$ is essentially the covering number of $X$
with respect to the metric $d_k$ which is induced by the kernel $k$.
\end{theorem}

\begin{proofs}
Since the proof is very similar to that of
Theorem \ref{HS1} we only point out the main differences besides adjusting the constants:
Firstly the vector $w^* \in H$ has to be chosen in a different manner, namely it has to fulfill 
\begin{equation*}
\langle w^*,\P(x) \rangle \quad\in\quad
    \begin{cases}
        i\,[1,1+\d] & \mbox{ if } x\in K_i^j \\
        [-(1+\d),1+\d] & \mbox{ otherwise. }
        \end{cases}
\end{equation*}
This condition also enforces the second modification since Lemma \ref{part} does not hold anymore in this 
setting. Indeed, we cannot guarantee that the definition of the $\tilde {\ca P}_i^j$'s in Lemma \ref{part}
implies
that they are mutually disjoint. Instead, we only obtain 
$|\tilde {\ca P}_i^j| \leq N\bigl((X,d_k),\frac \d {\sqrt c}\bigr)$ and thus
\begin{equation*}
\Bigl|\bigcup_{\substack{\iin\\ \jin}} \ca P_i^j\Bigr| \Leq 4\ca N\bigl((X,d_k),\s\bigr)\ .
\end{equation*}
The latter explains the definition of $M$, which is different to that of Theorem \ref{HS1}. \end{proofs} 





\noindent Following the proof of Theorem \ref{HS2} we obtain an analogous result for the 1-SMC:

\begin{theorem}\label{HS6}
Let $(X,d_k)$ be a compact metric space and $k$ be a universal kernel on $X$
such that the covering numbers of $(X,d_k)$ fulfill
$\ca N\bigl((X,d_k),\e\bigr)\in \ca O(\e^{-\a})$ for some $\a>0$.
Suppose that we have a positive sequence $(c_n)$ with $(c_n)\in \ca O(n^{\b-1})$ for some
$0<\b<\frac 1 \a$ and $n c_n \to \infty$.
Then for all Borel probability measures $P$ on $X\times Y$ with $q^*,p^*\in (0,1/2)$ and all $\e>0$ we have
\begin{displaymath}
\lim_{n\to \infty}\opa \Bigl(\Bigl\{T\in (X\times Y)^n :  \ca R_P
(f^{1,k,c_n}_{T})\leq \ca R_P + 4 \frac{p^*-q^*}{1-2q^*}P_X(X^+) + \e\Bigr\}\Bigr)
\Eq  1
\end{displaymath}
\end{theorem}


\noindent To complete our considerations of noisy problems
we mention that for the 1-SMC with Gaussian RBF kernel
we obtain the following consistency result which has already been proved for the 2-SMC:


\begin{corollary}\label{Cor3}
Let $X\subset \R^d$ be compact and $k$ be a Gaussian RBF kernel on $X$. Let $0<\b<\frac 1 d$
and $(c_n)$ be a positive sequence with $(c_n)\in \ca O(n^{\b-1})$ and 
$n c_n \to \infty$. Then the 1-SMC with kernel $k$ and
sequence $(c_n)$ is consistent for all Borel probability measures $P$ on $X\times Y$ with $q^*=p^*< 1/2$.
\end{corollary}




\noindent In the presence of large margins it turns out that we may fix the weight factor analogously
to the 2-SMC. For brevity's sake we only state a result that is similar to
Theorem \ref{HS3}. However, a result that is analogous to Theorem \ref{HS4} also holds.


\begin{theorem}\label{HS7}
Let $(X,d)$ be a compact metric space and $k$ a universal kernel on $X$.
Suppose that we have a Borel probability measure $P$ on $X\times Y$ with a deterministic supervisor and with classes
$B_{-1}(P),B_1(P)$ which have strictly positive distance, i.e.
$d\bigl(B_{-1}(P),B_1(P)\bigr)>0$. Then $k$ separates $B_{-1}(P)$ and $B_1(P)$ with margin $\g>0$ and for all
$c>0$, $\e>0$ and $n\geq mM$
we have
\begin{displaymath}
\opa\bigl(\{T\in (X\times Y)^n :  \ca R_{P,S}
(f^{1,k,c}_{T}) \leq \e \}\bigr) \Geq 1-M\ e^{-\frac {\e n}{2 M}+m}\ ,
\end{displaymath}
where $M:= \ca N\bigr((X,d_k),\g/2\bigl)$ 
is the covering number of $(X,d_k)$ and $m:=\bigl\lfloor \frac 2 {c\g^2} \bigr\rfloor + 1$.
\end{theorem}


\begin{proofs} 
The proof is completely analogous to that of Theorem \ref{HS3}. However, since 
the slack variables are not squared we obtain 
$$
 c \sum_{j=1}^m \xi_{l_j} \Geq \frac {cm}2
$$ 
in the last estimate of the proof of Theorem \ref{HS3} and this yields the slightly better value of 
$m$. \end{proofs}







\noindent Finally we mention that using polynomial kernels Proposition \ref{Pro1}
can also be applied. Thus problems that cannot be treated well with a fixed polynomial
kernel and the 2-SMC cannot be treated well with the 1-SMC and
the same kernel, either (and vice versa).






\section{The maximal margin hyperplane classifier}\label{mmh}





We finally consider the maximal margin classifier. Again we fix a
kernel $k$ on $X$ with feature map
$\P:X\to H$. For a training set
$T = ((x_1,y_1),\dots,(x_n,y_n)) \in (X\times Y)^n$
we denote the unique
solution of the optimization problem
\begin{equation}
\begin{array}{rll}
\!\!\!\!\!\!\mbox{minimize} & \qquad
 \langle w,w\rangle \qquad  &\mbox{over } w,b\\
\!\!\!\!\!\!\mbox{subject to} & \qquad y_i(\langle w,\P(x_i)\rangle +b) \geq 1, \qquad &
i=1,\dots,n
\end{array}
\end{equation}
by $(w_{T}^k,b_T^k)\in H\times \R$. An algorithm $\ca C_k$
that provides the decision function
$$
f_T^k(x):= \sign(\langle w_T^k,\P(x)\rangle +b_T^k)\ , \qquad \qquad x\in X
$$
for every training set $T$
is called a maximal margin classifier (MMC) with kernel $k$. For an introduction to the MMC
as well as for implementation techniques and a geometric motivation
we again refer to \citet*[][Ch.~6 and 7]{CST} and \citet*[][Ch.~10]{Vap}.

The MMC is assumed to work poorly in the absence of large margins \citep*[cf.][]{TZ}. Thus we only
consider the setting of Theorem \ref{HS3}. We begin with a result similar to Theorem \ref{HS3} and Theorem
\ref{HS7}:


\begin{theorem}\label{HS8}
Let $(X,d)$ be a compact metric space and $k$ a universal kernel on $X$.
Suppose that we have a Borel probability measure $P$ on $X\times Y$ with a deterministic supervisor and with classes
$B_{-1}(P),B_1(P)$ which have strictly positive distance, i.e.
$d\bigl(B_{-1}(P),B_1(P)\bigr)>0$. Then $k$ separates $B_{-1}(P)$ and $B_1(P)$ with margin $\g>0$ and
for all $\e>0$ and $n\geq M:= \ca N\bigr((X,d_k),\g/2\bigl)$
we have
\begin{displaymath}
\opa\bigl(\{T\in (X\times Y)^n :  \ca R_{P,S}
(f_{T}^k) \leq \e \}\bigr) \Geq 1-M\ e^{n \ln\left(1-\frac \e {2M}\right) }\ .
\end{displaymath}
\end{theorem}


\begin{proofs} 
We repeat the construction of the proof of Theorem \ref{HS3}
with $m:= 1$. An easy calculation then shows
$$
P^n(F_n) \Geq 1 - M\ e^{n \ln\left(1-\frac \e {2M}\right) }\ .
$$
Now suppose that for $T\in F_n$ we have an element $z\in  \bigcup_{A \in {\cal P}_1} A$
misclassified by $f_T^k$.
Hence there exist an $A\in {\cal P}_1$ with $z\in A$ and a sample $(x_l,y_l)$ of $T$
with $x_l \in A$. Then the following estimate yields a contradiction:
$$
0 \Geq \langle w_T^k,\P(z)\rangle +b_T^k \Eq
\langle w_T^k,\P(z)-\P(x_{l_j})\rangle + \langle w_T^k,\P(x_{l_j})\rangle +b_T^k
\Geq  \snorm{w_T^k}\ d_k(x_{l_j},z) + 1
\Geq \frac 1 2\ .
$$
Therefore the assertion follows. 
\end{proofs}


\noindent The above result and its proof indicate that in the presence of large margins it may be
more suitable to use the MMC instead of a soft margin algorithm. We mention that an
estimate which is very similar to Theorem \ref{HS8} can be obtained using data-dependent
margin-based bounds of \citet{SBWA}. To compare both we first state a corollary:


\begin{corollary}\label{Cor4}
Let $k_\s$ be the Gaussian RBF kernel with parameter $\s$ on the unit ball $X:= B_{\ell_2^d}$
of the $d$-dimensional Euclidean space. Let $P$ be a
Borel probability measure on $X\times Y$ which can be separated
by $k_\s$ with margin $\g>0$. Then for all $\d\in (0,1)$ and all $n\geq 2 \bigl(\frac{16\s}\g\bigr)^d$ we
have
$$
\opa \Bigl(\Bigl\{T\in (X\times Y)^n :  \ca R_{P,S}
(f_{T}^k) \leq \frac {4d} n \Bigl(\frac {16\s} \g \Bigr)^d \Bigl(d \ln \frac{16\s}\g + \ln \frac 2 \d\Bigr)
\Bigr\}\Bigr) \Geq 1-\d\ .
$$
\end{corollary}


\begin{proof} As already shown in the proof of Corollary \ref{Cor1} we have
$$
\ca N\bigl((X,d_k),\e\bigr)
\Leq \ca N\Bigl((X,\ynorm . 2),\frac \e{\sqrt 2 \s}\Bigr) \Leq
2 \cdot 5^d \Bigl(\frac \e {\sqrt 2 \s}\Bigr)^{-d} \Leq 2 (8\s)^d \e^{-d}
$$
and thus $M:= \ca N\bigr((X,d_k),\g/2\bigl)\ \leq\ 2 (16\s)^d \g^{-d}$. Now let
$\e:= 2M (1-\bigl(\frac \d M\bigr)^{1/n})$ for $n\geq 2 (16\s)^d \g^{-d}$,
i.e. $\d=M\bigl( 1-\frac \e {2M}\bigr)^n$.
Since $\e<\frac {2M} n \ln \frac M \d$ Theorem \ref{HS8} yields the assertion. \end{proof}





\noindent Corollary \ref{Cor4}
shows that the learning curve of the MMC
is of order $\ca O(n^{-1})$ provided that $P$ guarantees a large margin.
These conditions also allow the application of margin-based bounds
on generalization proved by \citet{SBWA} \citep*[cf.\ also][]{BST,CST}.
We then obtain that the learning curve is of order
$\ca O(n^{-1} (\log n)^2)$. However, to compare both results one also has
to consider the constants that arise since the sample size $n$ only varies in
a typical range. Here we observe that the influence
of the margin $\g$ is essentially of order $\ca O(\g^{-2})$ in the estimates
of \citet{SBWA} while the Corollary \ref{Cor4}
shows that our estimates are essentially influenced by order
$\ca O(\g^{-d})$, where $d$ is the dimension of the {\em input} space $X$.
We thus suppose that only for small input dimensions $d$ our estimates
are more suitable to treat realistic sample sizes.












\section{Conclusions}

The aim of this paper has been to investigate which kind of distributions
could be classified well by support vector machines (SVM's).
It has turned out that the ability of the kernel to approximate arbitrary
continuous functions plays a fundamental role for
this question. Since the resulting function classes represented by the classifier
are very large there always exists the risk of {\em overfitting}.
However, using soft margin support vector machines 
with specific sequences of regularization parameters
this risk can be controlled
at least for simple noise models, e.g. models with constant noise level.
In particular, the restriction to large margin problems in \citet*[][p.~442]{TZ} has been
significantly weakened.

Since the ansatz of this paper is new many questions remain open, and are
worth for further investigations. Firstly it is interesting whether the soft
margin algorithms yield arbitrarily good generalization for {\em all
distributions.} Up to now our results only provide consistency if the noise level
is constant. However, approximating an arbitrary noise level by step functions
it seems possible that the soft margin algorithms with certain parameter sequences
are universally consistent.
Of course, the universality of
kernels, which roughly speaking
enables us to do ``almost everything'' with induced functions on compact subsets would play
a crucial role for this ansatz, too. However, a solution is certainly also based
on a deeper investigation of the underlying optimization problems in order to remove the restriction
$q^*,p^* \in (0,1/2)$.
Moreover, we suppose that the universality of
kernels is also very important for related algorithms such as $\nu$-SVM's,
linear programming SVM's and leave-one-out SVM's as well as all kind
of SVM's used for regression, distribution estimation, etc..
However, since each treatment of one of these algorithms needs its own investigation of
the corresponding optimization problem many open questions
remain.\\[2mm]
%

\noindent{\bf Remark }
{\em Using the techniques of this work the author was recently 
able to prove universal consistency for the 1-SMC  \citep[cf.][]{Ste2}.}





\acks{We would like to acknowledge support for this project
from the Deutsche Forschungsgemeinschaft (DFG grant Ca 179/4-1). }






\begin{thebibliography}{99}
\bibitem[Bartlett et al.(1999)Bartlett \& Shawe-Taylor]{BST} { P. Bartlett and J. Shawe-Taylor}.
         Generalization performance of support vector machines and
         other pattern classifiers. In B. Sch\"olkopf, C.J.C. Burges, and A.J. Smola, editors,
         {\em Advances in
         Kernel Methods - Support Vector Learning}, pages 43-54, Cambridge, MA, 1999. MIT Press. 
\bibitem[Burges et al.(2000)Burges \& Crisp]{BC} { C.J.C. Burges and D.J. Crisp}.
	Uniqueness of the SVM solution. In M.I. Jordan,
	T.K. Leen, and K.-R. M\"uller, editors,
	{\em Advances in Neural Information Processing 12}, pages 223-229, Cambridge, MA, 2000. MIT Press.
\bibitem[Cristianini et al.(2000)Cristianini \& Shawe-Taylor]{CST} { N. Cristianini and J. Shawe-Taylor}.
	{\em An Introduction to
        Support Vector Machines and other
        kernel-based learning methods}. Cambridge University Press, Cambridge, 2000.
\bibitem[Carl et al.(1990)Carl \& Stephani]{C-S} { B. Carl and I. Stephani}. {\em Entropy, compactness and the 
        approximation of operators}. Cambridge University Press, Cambridge, 1990.
\bibitem[Devroye et al.(1997)]{DGL} { L. Devroye, L. Gy\"orfi and G. Lugosi}. {\em A probabilistic theory of pattern 
        recognition}. Springer, New York, 1997.
\bibitem[Dudley(1978)]{Dud2} { R.M. Dudley}. Central limit theorems for empirical measures,
        {\it Ann. Probab.}, { 6}(6):899-929, 1978. 
\bibitem[Dudley(1989)]{Dud} { R.M. Dudley}. {\em Real Analysis and Probability}.
        Chapman \& Hall, New York, 1989.
\bibitem[Gradstein et al.(1981)Gradstein \& Ryshik]{G-R} { S. Gradstein and I.M. Ryshik}. 
	{\em Summen- Produkt- und Integraltafeln *
        Tables of series, products and integrals}. Verlag Harri Deutsch, Frankfurt am Main, 1981.
\bibitem[Pedersen(1988)]{Ped} { G.K. Pedersen}. {\em Analysis Now}. Springer, Berlin, 1988.
\bibitem[Poggio(1975)]{Pog} { T. Poggio}. On optimal nonlinear associative recall. {\it Biological Cybernetics,}
        { 19}(4):201-209, 1975.  
\bibitem[Saunders et al.(1998)]{SSW} { C. Saunders, M.O. Stitson, J. Weston, L. Bottou, B. Sch\"olkopf, and A. Smola}.
        Support vector machine --- reference manual.
        {Technical Report} {CSD-TR-98-03}, Department of Computer Science,
        Royal Holloway, University of London, Egham, UK. 
        Available electronically at
        \url{http://www.dcs.rhbnc.ac.uk/research/compint/areas/comp_learn/sv/pub/report98-03.ps}, 1998.
\bibitem[Sch\"olkopf et al.(2001)]{SHSW} { B. Sch\"olkopf, R. Herbrich, A.J. Smola, and R.C. Williamson}.
        A generalized representer theorem. In {\it Proceedings of the 14th Annual 
	Conference on Computational Learning Theory, Lecture Notes in Artificial Intelligence 2111}, 
	pages 416-426, 2001. 
\bibitem[Shawe-Taylor et al.(1998)]{SBWA} { J. Shawe-Taylor, P.L. Bartlett, R.C. Williamson and M. Anthony}.
        Structural risk minimization over data-dependent hierarchies.
        {\it IEEE Trans. Inf. Theory} {44}(5):1926-1940, 1998.
\bibitem[Steinwart(2001a)]{Ste} { I. Steinwart}. On the influence of the kernel on the generalization
        ability of support vector machines. {Technical Report} {01-01}, Fakult\"at f\"ur
        Mathematik und Informatik, Friedrich-Schiller-Universit\"at, Jena, Germany, available electronically at
        \url{http://www.minet.uni-jena.de/Math-Net/reports/sources/2001/01-01report.ps}, 2001a.
\bibitem[Steinwart(2001b)]{Ste2} { I. Steinwart}. On the generalization ability of support vector machines.
	Submitted to {\em J. Complexity}, 2001b.
\bibitem[Tong Zhang(2001)]{TZ} { Tong Zhang}. A leave-one-out cross validation bound for kernel methods with
        applications in learning. In {\it Proceedings of the 14th Annual 
	Conference on Computational Learning Theory, Lecture Notes in Artificial Intelligence 2111}, 
	pages 427-443, 2001.
\bibitem[van der Vaart et al.(1996)van der Vaart \& Wellner]{V-W} { A.W. van der Vaart and J.A. Wellner}. 
	{\em Weak Convergence and Empirical Processes}.
        Springer, New York, 1996.
\bibitem[Vapnik(1998)]{Vap} { V.N. Vapnik}. {\em Statistical Learning Theory}. Wiley, New York, 1998.
\end{thebibliography}


\end{document}