
\documentclass[twoside,11pt]{article}

% Any additional packages needed should be included after jmlr2e.
% Note that jmlr2e.sty includes epsfig, amssymb, natbib and graphicx,
% and defines many common macros, such as 'proof' and 'example'.
%
% It also sets the bibliographystyle to plainnat; for more information on
% natbib citation styles, see the natbib documentation, a copy of which
% is archived at http://www.jmlr.org/format/natbib.pdf

\usepackage{jmlr2e}
\usepackage{latexsym}
%\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{graphics}
\usepackage{latexsym}
\usepackage{array}
%\usepackage{epsfig}
%\usepackage{nips00e,times}

%\DeclareMathAlphabet{\Bi}{OT1}{cmm}{b}{it}

\iffalse
\topmargin -0.5in
\oddsidemargin 0in
\evensidemargin 0in
%\textheight 9.1in
%\textwidth 6.7in
\textheight 8.9in
\textwidth 6.5in
\parskip 0.05in
%\parskip 0.0in
\setcounter{page}{1}
\pagestyle{plain}
\fi


%\newtheorem{remark}{Remark}
%\newtheorem{definition}{Definition}
%\newtheorem{theorem}{Theorem}
%\newtheorem{proposition}[theorem]{Proposition}
%\newtheorem{lemma}[theorem]{Lemma}
%\newtheorem{corollary}[theorem]{Corollary}
%\newtheorem{fact}[theorem]{Fact}

\newcommand{\note}[1]{{\marginpar{#1}}}
%\newcommand{\note}[1]{{{\bf #1}}}
%\newcommand{\note}[1]{}
\newcommand{\equref}[1]{(\ref{#1})}
\renewcommand{\vec}[1]{\mbox{\boldmath $#1$}}

\newcommand{\R}{{\mathcal{R}}}
\newcommand{\W}{{\mathcal{W}}}
\newcommand{\X}{{\mathcal{X}}}
\newcommand{\M}{{\mathcal{M}}}
\newcommand{\PP}{{\mathcal{P}}}
\newcommand{\D}{{\mathcal{D}}}
\newcommand{\E}{{\mathcal{E}}}

\newcommand{\diff}{d}
%\newcommand{\x}{{\bf x}}
\newcommand{\x}{\vec{x}}
\newcommand{\xt}{\x_t}
\newcommand{\xti}{x_{t,i}}
\newcommand{\xtp}{\x_{t+1}}
\newcommand{\xh}{\hat{\x}}

\renewcommand{\u}{{\vec{u}}}
\newcommand{\z}{{\vec{z}}}
\newcommand{\y}{{\vec{y}}}
%\renewcommand{\u}{{\bf u}}
%\newcommand{\w}{{\bf w}}
\newcommand{\w}{{\vec{w}}}
\newcommand{\wt}{\w_t}
\newcommand{\wti}{w_{t,i}}
\newcommand{\wT}{\w_T}
\newcommand{\wtp}{\w_{t+1}}
\newcommand{\wtpi}{w_{t+1,i}}
\newcommand{\wav}{{\widehat \w}_{T+1}}

\newcommand{\Tj}{T^{(j)}}
\newcommand{\Tpj}{T^{'(j)}}
\newcommand{\rhoj}{\rho^{(j)}}
\newcommand{\dej}{\delta^{(j)}}
\newcommand{\Bj}{B^{(j)}}

\newcommand{\gt}{g_t}

\newcommand{\yt}{y_t}
\newcommand{\ytp}{y_{t+1}}

\newcommand{\yh}{\hat{y}}
\newcommand{\yht}{\yh_t}
\newcommand{\yhtp}{\yh_{t+1}}

\newcommand{\at}{a_t}
\newcommand{\ah}{\hat{a}}
\newcommand{\aht}{\ah_t}
\newcommand{\ahtp}{\ah_{t+1}}

\newcommand{\aerr}{{\rm attr}}
\newcommand{\inte}{\mbox{int}\;}
\newcommand{\ttheta}{\mbox {{\boldmath $\theta$}}}
\newcommand{\zzeta}{\mbox {{\boldmath $\zeta$}}}
\newcommand{\rr}{\mbox {{\boldmath $r$}}}
\newcommand{\gbold}{\mbox {{\boldmath $g$}}}
\newcommand{\vv}{\mbox {{\boldmath $v$}}}
\newcommand{\zero}{\mbox {{\boldmath $0$}}}
\newcommand{\zz}{{\bf z}}

\newcommand{\fm}{{\vec{f}^{-1}}}
%\newcommand{\f}{{\vec{f}}}
\newcommand{\f}{{\bf{f}}}
\newcommand{\finv}{{\bf{f^{-1}}}}
\newcommand{\Pfinv}{P_{\bf{f^{-1}}}}
\newcommand{\ff}{{\bf f}}

\newcommand{\marg}{\bar{m}_{\u,r}(\M)}
\newcommand{\argmin}{\mbox{argmin}}

\renewcommand{\d}{d}
\newcommand{\df}{\d_\f}

\newcommand{\sfrac}[2]{\mbox{$\frac{#1}{#2}$}}
\newcommand{\half}{\sfrac{1}{2}}

\newcommand{\us}{\langle \u,s\rangle}
\newcommand{\uo}{\langle \u,0\rangle}
\newcommand{\sign}{{\mbox{sign}}}
\newcommand{\Dtgo}{D_{\gamma}(\u;(\xt,y_t))}
\newcommand{\ga}{{\hat \gamma}_{\u,\M}}
\newcommand{\Xinf}{X_{\infty}}
\newcommand{\lma}{{\sc alma}$_p$}
\newcommand{\lmatwo}{{\sc alma}$_2$}
\newcommand{\lmasix}{{\sc alma}$_6$}
\newcommand{\lmaten}{{\sc alma}$_{10}$}

\newcommand{\Xpm}{X_{p,\M}}


\jmlrheading{2}{2001}{213-242}{3/01}{12/01}{Claudio Gentile}
\ShortHeadings{Approximate Maximal Margin Classification}{Gentile}
\firstpageno{213}



\begin{document}

\title{A New Approximate Maximal Margin Classification Algorithm}

\author{\name Claudio Gentile \email gentile@dsi.unimi.it\\
                      \addr Dipartimento di Scienze dell'Informazione\\
                            Universita' di Milano\\ 
                            Via Comelico 39,\\
                            20135 Milano, Italy
}


\editor{Nello Cristianini, John Shawe-Taylor and Bob Williamson}




\maketitle

\begin{abstract}%
A new incremental learning algorithm is described which 
approximates the maximal margin hyperplane w.r.t. norm $p \geq 2$ for 
a set of linearly separable data.
Our algorithm, called \lma\ (Approximate Large Margin algorithm w.r.t. norm $p$),
takes $O\left(\frac{(p-1)}{\alpha^2\,\gamma^2}\right)$
corrections to separate the data with $p$-norm margin larger than $(1-\alpha)\,\gamma$,
where $\gamma$ is the (normalized) $p$-norm margin of the data.
\lma\ avoids quadratic (or higher-order) programming methods. It is
very easy to implement and is as fast as on-line algorithms, such as
Rosenblatt's Perceptron algorithm.
We performed extensive experiments on both real-world and artificial datasets.
We compared \lmatwo\ (i.e., \lma\ with $p = 2$) to standard 
Support vector Machines (SVM) and to 
two incremental algorithms: the Perceptron algorithm and Li and Long's ROMMA.
The accuracy levels achieved by \lmatwo\ are superior to those
achieved by the Perceptron algorithm and ROMMA, but slightly inferior to 
SVM's. On the other hand, \lmatwo\ is quite faster and easier 
to implement than standard SVM training algorithms.
%We also tested \lma\ with $p > 2$ on artificial data.
When learning sparse target vectors, \lma\ with $p > 2$ largely 
outperforms Perceptron-like algorithms, such as \lmatwo.
\end{abstract}

\begin{keywords}
Binary Classification, Large Margin, Support Vector Machines, On-line Learning
\end{keywords}


\section{Introduction}

Vapnik's Support Vector Machines (SVM) are a statistical model of data 
that simultaneously minimizes model complexity and data fitting error
\citep{v-book-98}.
SVM have attracted a lot of interest and have spurred voluminous 
work in Machine Learning, both theoretical and experimental.
The remarkable generalization ability exhibited by SVM 
can be explained through margin-based VC theory 
(e.g., Shawe-Taylor et al., 1998; Anthony and Bartlett, 1999; Vapnik, 1998;
Cristianini and Shawe-Taylor, 2000, and references therein).

At the core of SVM lies the problem of finding the so-called
{\em maximal margin hyperplane}. Briefly, 
given a set linearly separable data, the maximal margin
hyperplane classifies all data correctly and maximizes the
minimal distance (the margin) between the data and the hyperplane.
If euclidean norm is used to measure the distance then
computing the maximal margin hyperplane corresponds
to the, by now classical, SVM training problem 
\citep{cv-svn-95}. 
This task is naturally formulated as a quadratic programming problem.
If an arbitrary norm $p$ is used then such a task 
turns to a more general mathematical programming problem
(e.g., Mangasarian, 1997; Nachbar et al., 1993) 
to be solved by general purpose (and computationally intensive)
optimization methods. This more general task
naturally arises in feature selection problems when the target to be learned 
is sparse (i.e., when the target has many irrelevant features).
This is often the case in a number of natural language processing problems
(e.g., Golding and Roth, 1996; Dagan et al., 1997).

A fair amount of recent work on SVM centers on finding
simple and efficient methods to solve maximal margin hyperplane
problems (e.g., Osuna et al., 1997; Joachims, 1998; Friess et al., 1998;
Platt, 1998; Kowalczyk, 1999; Keerthi et al., 1999; Li and Long, 1999).
This paper follows that trend, giving two main contributions.
The first contribution is a new efficient
algorithm which {\em approximates} the maximal margin hyperplane 
w.r.t. norm $p$ to any given accuracy. We call this algorithm \lma\
(Approximate Large Margin algorithm w.r.t. norm $p$). \lma\ is naturally viewed as
an {\em on-line} algorithm, i.e., as an algorithm which processes 
the examples one at a time. 
A distinguishing feature of \lma\ is that its relevant parameters
(such as the learning rate) are dynamically adjusted over time.
In this sense, \lma\ is a refinement of the on-line algorithms 
recently introduced by Auer et al. (2001).

On-line algorithms are useful when the examples become available to
the learning algorithm one at a time, but also when the training
set is too large to consider all examples at once.
\lmatwo\ (i.e., \lma\ with $p=2$)
is a perceptron-like algorithm;
the operations it performs can be expressed as dot products, so that we
can replace them by kernel functions (Aizerman et al., 1964).
\lmatwo\ approximately solves the SVM training problem, 
avoiding quadratic programming.
Unlike previous approaches (Cortes and Vapnik, 1995; Osuna et al., 1997; Joachims, 1998; 
Friess et al., 1998; Platt, 1998), 
our algorithm operates directly on (an approximation to)
the primal maximal margin problem, instead of its (Wolfe) dual.
% Via this approach, we are able to avoid computationally intensive
% mathematical programming methods. 
\lma\ is more similar to algorithms such as Li and Long's ROMMA (Li and Long, 1999)
and the one analyzed by \cite{k-lmp-98} and Keerthi et al. (1999). 
However, it seems those algorithms have been specifically designed for euclidean norm.
Unlike those algorithms, \lma\ remains computationally efficient when measuring the
margin through a generic norm $p$.

As far as theoretical performance is concerned,
\lmatwo\ achieves essentially the same bound on the number of corrections
as the one obtained by a version of
Li and Long's ROMMA. 
In the case when $p$ is logarithmic in the dimension of 
the instance space (Gentile and Littlestone, 1999)
\lma\ yields results similar to multiplicative
algorithms, such as Littlestone's Winnow \citep{l-liaanla-88}
and the Weighted Majority algorithm 
(Littlestone and Warmuth, 1994; Grove et al., 2001).
The associated margin-dependent generalization bounds 
are very close to those obtained by estimators based on
linear programming (e.g., Mangasarian, 1968; 
Anthony and Bartlett, 1999, Chap. 14).


% ********************

% Given the availability  recent trend of Machine Learning practice to
 
% The power of SVM is fully shown when they are 
% used in conjunction with kernel functions
% \cite{abr-tfpf-64}. A kernel function implicitly maps original
% input space into a high dimensional feature space through a
% non-linear mapping, thus turning a non-linear problem into a 
% linear one.



The second contribution of this paper is an experimental investigation of 
\lma\ on both real-world and artificial datasets. 
In our experiments we emphasized the accuracy performance achieved by 
\lma\ as a fully on-line algorithm, 
i.e., after just one sweep through the training examples.
Following Freund and Schapire (1999), the hypotheses produced
by \lma\ during training are combined via Helmbold and Warmuth's (1995) 
leave-one out scheme to make a {\em voted} hypothesis. 
We ran  \lmatwo\ with kernels on the real-world
datasets and \lma\, with $p > 2$ {\em without} kernels on artificial datasets.
The real-world datasets are well-known 
Optical Character Recognition (OCR) benchmarks. 
On these datasets we followed the experimental setting described by 
Cortes and Vapnik (1995), Freund and Schapire (1999), 
Li and Long (1999) and Platt et al. (1999). 
We compared our algorithm to standard SVM, to the Perceptron algorithm
and to ROMMA.
We found that \lmatwo\ generalizes quite better
than both ROMMA and the Perceptron algorithm, but slightly worse than
SVM. On the other hand, \lmatwo\ is as fast and easy to implement
as the other Perceptron-like algorithms. Hence, compared to standard
algorithms, training SVM with \lmatwo\ saves a considerable amount of time.

In the experiments with the artificial datasets we have been mainly interested
in comparing the accuracy of Perceptron-like algorithms, such as 
\lmatwo, to the accuracy of non-Perceptron-like algorithms, 
such as \lma\ with $p > 2$.
When learning sparse target vectors with \lma, the performance gap between 
$p = 2$  and $p > 2$ is big. 
This is mainly due to
the different convergence speed of the two kinds of algorithms
(see also the paper by Kivinen et al., 1997).

The next section defines our major notation and recalls some basic preliminaries.
In Section \ref{s:anal} we describe \lma\ and claim its theoretical
properties. Section \ref{s:exp} describes our experiments.
Concluding remarks and open problems are given in the last section.



\section{Preliminaries and notation}\label{s:prnot}
This section defines our major notation and recalls some basic preliminaries.
\sloppypar{
An {\em example} is a pair $(\x,y)$, where $\x$ is an {\em instance} belonging
to a given {\em instance space} $\X \subseteq \R^n$ 
and $y \in \{-1,+1\}$ is the binary {\em label}
associated with $\x$. 
A weight vector $\w = (w_1, ..., w_n) \in \R^n$ represents an $n$-dimensional
hyperplane passing through the origin. It is natural to associate with $\w$ a 
linear threshold classifier with threshold zero: 
$\w\,:\, \x\rightarrow \sign(\w\cdot\x) = 1$ if $\w\cdot\x \geq 0$ 
and $= -1$ otherwise.
When $p \geq 1$
we denote by $||\w||_p$ the $p$-norm of $\w$, i.e.,
$||\w||_p$ $= \left(\sum_{i=1}^n |w_i|^p \right)^{1/p}$
(also, $||\w||_{\infty}$ 
$= \lim_{p\rightarrow \infty} \left(\sum_{i=1}^n |w_i|^p \right)^{1/p}$
$= \max_i |w_i|$).
We say that $q$ is {\em dual} to $p$\ 
if\ $\frac{1}{p} + \frac{1}{q} = 1$ holds.
For instance, the 1-norm is dual to the
$\infty$-norm and the 2-norm is self-dual. 
In this paper we assume that $p$ and $q$ are some pair of dual
values, with $p \geq 2$. We use $p$-norms for instances and 
$q$-norms for weight vectors.
For the sake of simplifying notation throughout this paper we use 
normalized instances $\xh = \x/||\x||_p$, where the norm $p$ will be clear
from the surrounding context.
The (normalized) $p$-norm margin (or just the margin, if $p$ is clear from the
context) of a hyperplane $\w$ with $||\w||_q \leq 1$
on example $(\x,y)$ is defined as $y\,\w\cdot\xh$. If this margin is 
positive\footnote{We assume that $\w\cdot\x = 0$ yields a wrong classification, 
independent of $y$.}
then $\w$ classifies $(\x,y)$ correctly.
Notice that from H\"older's inequality we have 
$|\w\cdot\xh| \leq ||\w||_q\,||\xh||_p \leq 1$. Hence 
$y\,\w\cdot\xh \in [-1,1]$.

% We are using a normalized version of the margin. This simplifies calculations 
% somewhat. We took the idea of using the normalized margin from \cite{ghw-fmts-00}.

Our goal is to approximate the
maximal $p$-norm margin hyperplane for a set of examples (the training set).
For this purpose, we use terminology and analytical tools from the
on-line learning literature.
We focus on an on-line learning model introduced 
by \cite{l-liaanla-88} and \cite{A88}.
An on-line learning algorithm processes the examples one at a time in {\em trials}.
In each trial, the algorithm observes an instance $\x$ and is required 
to predict the label $y$ associated with $\x$. 
We denote the prediction by ${\hat y}$. 
% In this paper ${\hat y}$ is a real number.
The prediction $\yh$ combines
the current instance $\x$ with the current internal state of the algorithm.
In our case this state is essentially a weight vector $\w$, representing the 
algorithm's current hypothesis about the maximal margin hyperplane.
After the prediction is made, the true
value of $y$ is revealed and the algorithm suffers a {\em loss}, measuring the 
``distance'' between the prediction ${\hat y}$ and the label $y$.
Then the algorithm updates its internal state.

In this paper the prediction $\yh$ can be seen as
the linear function $\yh = \w\cdot\x$ and
the loss is a margin-based 0-1 Loss: the loss of $\w$ on example $(\x,y)$
is 1 if $y\,\w\cdot\xh \leq (1-\alpha)\,\gamma$ and 0 otherwise, for 
suitably chosen $\alpha, \gamma \in [0,1]$. 
Therefore, if $||\w||_q \leq 1$ the algorithm incurs positive loss
if and only if $\w$ classifies $(\x,y)$ with ($p$-norm) margin not larger
than $(1-\alpha)\,\gamma$.
The on-line algorithms are typically {\em loss driven}, i.e., they do update 
their internal state only in those trials where they suffer a positive 
loss. We call a {\em correction} a trial where this happens.
In the special case when $\alpha = 1$ a correction is a {\em mistaken}
trial and a loss driven algorithm turns to a {\em mistake driven} 
\citep{l-liaanla-88} algorithm.

Throughout the paper we use the subscript 
$t$ for $\x$ and  $y$ to denote the instance and the label processed
in trial $t$. We use the subscript $k$ for those variables,
such as the algorithm's weight vector $\w$, which are updated
only within a correction. In particular, $\w_k$
denotes the algorithm's weight vector after $k-1$ corrections 
(so that $\w_1$ is the initial weight vector). 
The goal of the on-line algorithm is to bound the cumulative loss
(i.e., the total number of corrections or mistakes)
it suffers on an arbitrary sequence of examples $S = ((\x_1,y_1),...,(\x_T,y_T))$.
Consider the special case when $S$ is linearly separable
with margin $\gamma$. If we pick $\alpha < 1$ then 
a bounded loss clearly implies convergence in a finite number of steps
to (an approximation of) the maximal margin hyperplane for $S$.
When a training set is linearly separable with margin $\gamma$ and
hyperplane $\w$ is such that $y\,\w\cdot\xh \geq (1-\alpha)\,\gamma$ 
for any $(\x,y)$ in the training set we sometimes say that $\w$ is an 
{\em $\alpha$-approximation} to the maximal margin hyperplane (for that
training set).

\begin{remark}
Our definition of margin is restricted to zero-threshold linear classifiers, 
i.e., to hyperplanes passing through the origin. 
The usual definition of margin in SVM literature \citep{cv-svn-95}
actually considers the more general non-zero threshold linear classifiers.
The threshold of an SVM maximal margin hyperplane
is sometimes called the {\em bias} term of the SVM.
Restricting margin analyses to zero-threshold hyperplanes loses only a 
constant factor (e.g., Cristianini and Shawe-Taylor, 2000). 
Nonetheless, in practical applications such a constant
factor might make a significant difference.
\end{remark}




% {\bf maybe say that the sequence S is infinite, maybe throughout the theorem's
% statements...}

% the above goal is clearly related to the problem of 
% approximating the maximal margin hyperplane for $S$.


% the {\em (linear) hinge loss} \cite{gw-lhlam-98} 
% $$L_c(y,\yh) = L_c(y,\w\cdot\x) = \max\{0,c-y\,\w\cdot\x\},\ \  c > 0.$$
% (This is also called the {\em deviation} in \cite{fs-lmcupa-99}.)
% A relevant property of the linear hinge loss is that
% $L_c(y,\w\cdot\x)$ is convex in $\w$ for any fixed
% $y \in \R$, $\x \in \R^n$ and $c \in \R$.



% *********************************************
% The on-line learning analysis we perform follows a well-known pattern
% (e.g., \cite{n-cpp-62,l-liaanla-88,
% cfhhsw-huea-97,gls-gcrfldu-97,kw-rlbmrp-97} 
% and the references therein).
% We define a measure of progress based on the
% mapping $\f$ under consideration.
% %Let $\f$ be as in (\ref{e:link}).
% Our measure of progress $d_{\f}(.,.)$
% is ... dot product ...
% *****************************



\section{The approximate large margin algorithm \lma}\label{s:anal}
\lma\ is a large margin variant
of the $p$-norm Perceptron algorithm\footnote{
The $p$-norm Perceptron algorithm is
a generalization of the classical Perceptron
algorithm (Rosenblatt, 1962; Block, 1962; Novikov, 1962): $p$-norm Perceptron 
is actually Perceptron when $p = 2$. 
% The reader unfamiliar with the $p$-norm
% technology might want to set $p = q = 2$ throughout.
} 
(Grove et al., 2001; Gentile and Littlestone, 1999), and 
is similar in spirit to the variable learning rate algorithms
introduced by Auer et al. (2001).
We analyze \lma\ by giving upper bounds on the number of corrections. 
We do not resort to the proof techniques developed 
by (Auer et al., 2001), as they seem to give rise to suboptimal
results when applied to the algorithm described here.

The theoretical contribution of this paper is 
Theorem \ref{t:main} below.
This theorem has two parts. Part 1 bounds the number of corrections
in the linearly separable case. In the special case when 
$p=2$ this bound is very similar to the one proven by 
Li and Long for a version of ROMMA (called aggressive ROMMA).
Part 2 holds for an arbitrary sequence of examples.
A bound which is very close to the one proven
by Grove et al. (2001) and Gentile and Littlestone (1999) for the 
(constant learning rate)
$p$-norm Perceptron algorithm is obtained as a special case.
%
% The theoretical similarity between our algorithm and ROMMA suggested
% us an experimental comparison 
%

% Despite this theoretical similarity, 
% the experiments we report in Section \ref{s:exp}
% show that using our margin-sensitive algorithm yields a clear 
% increase in performance.

% {\bf pointer to the experiments with multiplicative algs.}



\begin{figure}
\begin{center}
  \fbox{ \hspace*{1em}\begin{minipage}{0.9\textwidth}
      \mbox{}\\
{\bf Algorithm} \lma$(\alpha; B, C)$\\
\ with $\alpha \in (0,1]$, $B$, $C > 0$.\\
{\bf Initialization:}\\ 
Initial weight vector $\w_1 = \zero$; $k = 1$.\\
{\bf For} $t = 1, ..., T$ {\bf do}:
\begin{itemize}
\item Get example $(\x_t,y_t) \in \X\times\{-1,+1\}$;
\item Update weights as follows:
\begin{align}
&{\mbox{Set\ \ }}
\gamma_k = B\,\sqrt{p-1}\,\frac{1}{\sqrt{k}};\notag\\
%\ \eta_k = \sqrt{\frac{2}{(p-1)\,X_p^2}}\,\frac{1}{\sqrt{k}};\notag\\
&\mbox{{\bf If}}\ y_t\,\w_k\cdot\xh_t 
                     \leq (1-\alpha)\,\gamma_k \ \ \mbox{{\bf then}}:
\ \ \eta_k = \frac{C}{\sqrt{p-1}}\,\frac{1}{\sqrt{k}},\notag\\
&\hspace{2.35in}\w'_k = \f^{-1}(\f(\w_k) + \eta_k\,y_t\,\xh_t),\ \notag\\
&\hspace{2.35in}\w_{k+1} = \w'_k/ \max\{1,||\w'_k||_q\},\notag\\
&\hspace{2.35in}k \leftarrow k + 1.\notag
\end{align}
\end{itemize}
\end{minipage}\hspace*{1em}}
\end{center}
\caption{The approximate large margin algorithm \lma.}\label{f:1}
\end{figure}


In order to define our algorithm, we need to recall the following mapping 
$\f$ (Gentile and Littlestone, 1999) (a $p$-indexing for $\f$ is understood):
$\f\,:\,\R^n \rightarrow \R^n$, $\f = (f_1,...,f_n)$, where
\[
f_i(\w) = \frac{\sign(w_i)\,|w_i|^{q-1}}{||\w||_q^{q-2}},\ \ \
\w = (w_1,...,w_n) \in \R^n.
\]
Observe that $p = q = 2$ yields the identify function.
The (unique) inverse $\f^{-1}$ of $\f$ is (Gentile and Littlestone, 1999)
$\f^{-1}\,:\,\R^n \rightarrow \R^n$, $\f^{-1} = (f_1^{-1},...,f_n^{-1})$, where
$$
f_i^{-1}(\ttheta) =
\frac{\sign(\theta_i)\,|\theta_i|^{p-1}}{||\ttheta||_p^{p-2}}, \ \
\ttheta = (\theta_1,...,\theta_n) \in \R^n,
$$
namely, $\f^{-1}$ is obtained from $\f$ by replacing $q$ with $p$.
It is easy to check that $\f$ is the gradient of
the scalar function $\half ||\cdot||_q^2$, while  $\f^{-1}$ is the gradient
of the (dual) function $\half ||\cdot||_p^2$.
The following simple property of $\f$ will be useful.

\begin{lemma}\label{l:normmap}(Gentile and Littlestone, 1999)
The function $\f$ maps vectors with a given $q$-norm to vectors with the 
same $p$-norm, i.e., for any $\w \in \R^n$ we have $||\f(\w)||_p = ||\w||_q$.\ $\BlackBox$
\end{lemma}

\lma\ is described in Figure \ref{f:1}.
The algorithm is parameterized by $\alpha \in (0,1]$, $B > 0$ and $C > 0$.
Parameter $\alpha$ measures the degree of approximation to
the optimal margin hyperplane, while $B$ and $C$ might be considered
as tuning parameters. Their use will be made clear
in Theorem \ref{t:main}. 
Let $\W$ be the $q$-norm unit ball, i.e., 
$\W = \{ \w \in \R^n : ||\w||_q \leq 1 \}$.
\lma\ maintains a vector $\w_k$ of $n$ weights in $\W$.
It starts from $\w_1 = \zero$.
At time $t$ the algorithm processes example $(\x_t,y_t)$.
If the current weight vector $\w_k$ classifies  $(\x_t,y_t)$ with
(normalized) margin not larger than $(1-\alpha)\,\gamma_k$ then a correction occurs.
Here $\gamma_k$ is intended as the current approximation to the 
unknown maximal margin (denoted by $\gamma^*$ in Theorem \ref{t:main}) on the data.
The update rule\footnote{In the degenerate case that $\x_t =\zero$ no update
takes place.} 
has two main steps. The first step gives $\w'_k$ through 
the classical update of a ($p$-norm) perceptron-like algorithm
(notice, however, that the learning rate $\eta_k$ scales with $k$, 
the number of corrections occurred so far). 
The second step gives $\w_{k+1}$ by {\em projecting}\footnote{
From the proof of Theorem \ref{t:main} the reader can see that
the only way we exploit this projection step is through the 
condition $||\w_k||_q \leq 1$ for all $k$. Therefore if we replaced
the update rule $\w_{k+1} = \w'_k/ \max\{1,||\w'_k||_q\}$ in Figure \ref{f:1}
by the simpler rule $\w_{k+1} = \w'_k/||\w'_k||_q$ Theorem \ref{t:main}
would still hold. The advantage of using the former rule is 
computational, as it requires less weight updating.
}
$\w'_k$ onto $\W$:
$\w_{k+1} = \w'_k/||\w'_k||_q$ if $||\w'_k||_q > 1$ and
$\w_{k+1} = \w'_k$ otherwise.
The projection step makes the new weight vector $\w_{k+1}$ belong to $\W$.
Figure \ref{f:2} gives a graphical representation of the update rule.



%************************************************************************

\begin{figure}
\begin{center}
\vspace{-0.65in}
\scalebox{0.75}{\includegraphics{fig_proj.eps}}

\setlength{\unitlength}{0.00083300in}%
%
\begingroup\makeatletter\ifx\SetFigFont\undefined
% extract first six characters in \fmtname
\def\x#1#2#3#4#5#6#7\relax{\def\x{#1#2#3#4#5#6}}%
\expandafter\x\fmtname xxxxxx\relax \def\y{splain}%
\ifx\x\y   % LaTeX or SliTeX?
\gdef\SetFigFont#1#2#3{%
  \ifnum #1<47\tiny\else \ifnum #1<20\small\else
%*******  
\ifnum #1<24\normalsize\else \ifnum #1<29\large\else
  \ifnum #1<34\Large\else \ifnum #1<41\LARGE\else
     \huge\fi\fi\fi\fi\fi\fi
  \csname #3\endcsname}%
\else
\gdef\SetFigFont#1#2#3{\begingroup
  \count@#1\relax \ifnum 25<\count@\count@25\fi
  \def\x{\endgroup\@setsize\SetFigFont{#2pt}}%
  \expandafter\x
    \csname \romannumeral\the\count@ pt\expandafter\endcsname
    \csname @\romannumeral\the\count@ pt\endcsname
  \csname #3\endcsname}%
\fi
\fi\endgroup
\begin{picture}(5322,0)(391,-1220)
\put(3300,2350){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\w_k$}}}
\put(4050,2300){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\w'_k$}}}
\put(3900,1950){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\w_{k+1}$}}}
\put(4250,1150){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$y_t\,\xh_t$}}}
\put(3100,1040){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}o}}}
\end{picture}
\vspace{-0.85in}
\caption{The update rule of \lma\ when $p = q = 2$. The circle is 
a two-dimensional $\W$.}\label{f:2}
\end{center}
\end{figure}

%************************************************************************


It is worth discussing at this point how \lma\ is qualitatively different  
from previous on-line algorithms, such as the ($p$-norm) Perceptron algorithm 
and ROMMA.
First, we notice that \lma\ maintains bounded weight vectors, 
but it uses a decaying learning
rate. This is actually qualitatively similar to the standard ($p$-norm) Perceptron algorithm,
where weight vectors are unbounded but the learning rate is kept constant. 
In fact, in both cases later updating instances
have less influence on the direction of the current weight vector 
than earlier instances.
On the other hand, unlike the ($p$-norm) Perceptron algorithm, \lma\ is sensitive to
margins, i.e., the current weight vector gets updated even if the current margin is  
positive but smaller than desired. 
Compared to aggressive ROMMA, \lma\ requires the accuracy parameter $\alpha$ 
be fixed ahead of time; if $\alpha$ is not close to zero, this tends to make 
\lma's corrections less frequent than aggressive ROMMA's 
(see Section \ref{ss:mnist}).


We now claim the theoretical properties of \lma.
The following theorem has two parts. In  part 1 we treat the separable
case. Here we prove that a special choice of parameters $B$ and $C$ gives
rise to an algorithm which computes an $\alpha$-approximation to 
the maximal margin hyperplane, for any given accurary $\alpha$.
In part 2 we show that if 
a suitable relationship between $B$ and $C$ is satisfied
then a bound on the number of corrections can be proven
in the general (nonseparable) case.
The bound of part 2 is in terms of the margin-based
quantity $\D_{\gamma}(\u;(\x,y)) = \max\{0,\gamma - y\,\u\cdot\xh\}$, 
$\gamma > 0$. (Here a $p$-indexing for $\D_{\gamma}$ is understood).
$\D_{\gamma}$ is called {\em deviation} 
by \cite{fs-lmcupa-99} and {\em linear hinge loss} 
by \cite{gw-lhlam-98}. 

Notice that $B$ and $C$ in part 1
do not meet the requirements given in part 2. 
On the other hand, in the separable case
$B$ and $C$ chosen in part 2
do not yield, for any small $\alpha$, an $\alpha$-approximation 
to the maximal margin hyperplane.


% In the experiments we tried the param. of Th. 1 for $\eta$
% as it behaves a bit better (though it makes slightly more mistakes on any
% single binary classifier.)


\begin{theorem}\label{t:main}
Let $\X = \R^n$,
$\W =  \{\w \in \R^n\,:\, ||\w||_q \leq 1 \}$,
$S = ((\x_1,y_1),...,$ $(\x_T,y_T)) \in (\X \times \{-1,+1\})^T$,
and
$\M$ be the set of corrections of {\rm \lma}$(\alpha; B, C)$ running
on $S$ (i.e., the set of trials $t$ such that
$y_t\,\w_k\cdot\xh_t \leq (1-\alpha)\,\gamma_k$).

1. Let 
$\gamma^* = \max_{\w \in \W} \min_{t = 1,...,T}\, y_t\,\w\cdot\xh_t > 0$.
Then {\rm \lma}$(\alpha; \sqrt{8}/\alpha, \sqrt{2})$ 
achieves the following bound\footnote{We did not optimize the constants here.} 
on $|\M|$:
\begin{equation}\label{e:corrbound}
|\M| \leq \frac{2\,(p-1)}{(\gamma^*)^2}\left(\frac{2}{\alpha}-1\right)^2 
          + \frac{8}{\alpha}-4
= O\left(\frac{p-1}{\alpha^2\,(\gamma^*)^2} \right).
\end{equation}
Furthermore, throughout the run of {\rm \lma}$(\alpha; \sqrt{8}/\alpha, \sqrt{2})$ 
we have $\gamma_k \geq \gamma^*$. Hence (\ref{e:corrbound}) is also 
an upper bound on the number of trials $t$ such that 
$y_t\,\w_k\cdot\xh_t \leq (1-\alpha)\,\gamma^*$.

2. Let the parameters $B$ and $C$ in Figure \ref{f:1}
satisfy the equation\footnote{ 
Notice that $B$ and $C$ in part 1 do not satisfy this equation.}
$$C^2 + 2\,(1-\alpha)\,B\,C = 1.$$ 
Then for any $\u \in \W$, {\rm \lma}$(\alpha; B, C)$  
achieves the following bound on $|\M|$, holding for any  $\gamma > 0$,
where ${\displaystyle \rho^2 = \frac{p-1}{C^2\,\gamma^2}}$:
\[
|\M| \leq \frac{1}{\gamma}\sum_{t\in\M}\Dtgo + \frac{\rho^2}{2} +
\sqrt{\frac{\rho^4}{4} + \frac{\rho^2}{\gamma}\sum_{t\in\M}\Dtgo +\rho^2}.
\]
Observe that when $\alpha = 1$ 
the above inequality turns to a bound on the number of {\em mistaken trials}.
In such a case the value of $\gamma_k$ 
(in particular, the value of $B$)
is immaterial, while $C$ is forced to be 1. 
\end{theorem}
{\em Proof.} We assume throughout this proof that 
the $k$-th correction occurs on example $(\x_t,y_t)$.
We use the following shorthand notation:
$\ttheta_k = \f(\w_k)$, $\ttheta'_k = \f(\w'_k) = \ttheta_k + \eta_k\,y_t\,\xh_t $
and $N_{k+1} = \max\{1,||\w'_k||_q\}$. We now consider the two parts separately.

1. Let $\gamma^*_k =  y_t\,\u\cdot\xh_t$,
where $\u$ is the maximal margin hyperplane for the whole sequence $S$.
We study how fast the quantity $\u\cdot\ttheta_k$ increases from correction to 
correction. 
From the update rule of Figure \ref{f:1} we have
\begin{equation}\label{e:progress}
\u\cdot\ttheta_{k+1} = \frac{\u\cdot\ttheta_k+\eta_k\,y_t\,\u\cdot\xh_t}{N_{k+1}} 
= \frac{\u\cdot\ttheta_k+\eta_k\,\gamma_k^*}{N_{k+1}}.
%\geq \frac{\u\cdot\ttheta_k+\eta_k\,\gamma^*\,||\x_t||_p}{N_{k+1}}. 
\end{equation}
Observe that $\u\cdot\ttheta_k \geq 0$ for any $k$, since 
the data are linearly separable.

We need to find an upper bound on the normalization factor $N_{k+1}$. To this end,
we focus on  the square $N_{k+1}^2$. By virtue of Lemma \ref{l:normmap} 
we can write 
\begin{align}
N_{k+1}^2 &= \max\{1,||\w'_k||_q^2\}\notag\\ 
&= \max\{1,||\ttheta'_k||_p^2\}\notag\\ 
&= \max\{1,||\ttheta_k + \eta_k\,y_t\,\xh_t||_p^2\}.\notag
\end{align}

Furthermore,
\begin{align}
||\ttheta_k + \eta_k\,y_t\,\xh_t||_p^2 &\leq 
||\ttheta_k||_p^2 + \eta_k^2\,(p-1) + 
2\,\eta_k\,y_t\,\f^{-1}(\ttheta_k)\cdot\xh_t\notag\\
&= ||\w_k||_q^2 + \eta_k^2\,(p-1) + 2\,\eta_k\,y_t\,\w_k\cdot\xh_t\notag\\
&\leq 1 + \eta_k^2\,(p-1) + 2\,(1-\alpha)\,\eta_k\,\gamma_k\notag\\
&= 1+ 2\,A/k,\notag 
\end{align}
where the first inequality is essentially proven by Grove et al. (2001)
(see also Lemma 2 in the paper by Gentile and Littlestone, 1999),
the first equality is again an application of 
Lemma \ref{l:normmap}, the second inequality
derives from Figure \ref{f:1}, and the 
last equality derives from Figure \ref{f:1} by setting $A = 4/\alpha-3$.
Thus we conclude that
$$N_{k+1} \leq \sqrt{1 + 2\,A/k}.$$
Now, we set for brevity $m = |\M|$,
$\rho_k = \frac{\sqrt{p-1}}{\gamma^*_k}$ and $\rho = \frac{\sqrt{p-1}}{\gamma^*}$.
Since $\gamma_k^*$ and $\gamma^*$ are $p$-norm margins with 
$\gamma^* \leq \gamma_k$
we have $\rho_k \geq \rho \geq 1$.
We plug the bound on $N_{k+1}$ back into (\ref{e:progress}), unwrap the resulting 
recurrence and take into account that $\w_1 = \ttheta_1 = \zero$. We yield
$$\u\cdot\ttheta_{m+1} \geq \sum_{k=1}^m s_k \prod_{j=k+1}^m r_j,$$
where
\begin{eqnarray*}
r_j &=& \frac{1}{\sqrt{1+2\,A/j}},\\
s_k &=& \frac{1}{\rho_k}\,\frac{1}{\sqrt{A+k/2}}
\end{eqnarray*}
and the product $\prod_{j=k+1}^m r_j$ is assumed to be 1 if $k = m$.
From H\"older's inequality and 
Lemma \ref{l:normmap} it follows that
$$\u\cdot\ttheta_{m+1} \leq ||\u||_q\,||\ttheta_{m+1}||_p = 
||\u||_q\,||\w_{m+1}||_q \leq 1.$$ 
Therefore we have obtained:
\begin{equation}\label{e:sumprod}
1 \geq \sum_{k=1}^m s_k \prod_{j=k+1}^m r_j.
\end{equation}
We use this inequality to compute an upper bound on the number of corrections $m$.
We proceed by lower bounding the RHS of (\ref{e:sumprod}), as a function of $m$. 

We first lower bound $s_k$ by 
$\frac{1}{\rho}\,\frac{1}{\sqrt{A+m/2}}.$
Next, considering the product $\prod_{j=k+1}^m r_j$, we can write
\begin{eqnarray*}
- \ln \prod_{j=k+1}^m r_j 
&=& \frac{1}{2}\sum_{j=k+1}^m \ln\left(1+\frac{2\,A}{j} \right)\\
&\leq& \frac{1}{2} \sum_{j=k+1}^m \frac{2\,A}{j}\\
&\leq& A\,\int_k^m \frac{1}{j}\,d\,j\\ 
&=& A\, \ln\frac{m}{k}.
\end{eqnarray*}
This is equivalent to 
$\prod_{j=k+1}^m r_j \geq \left(\frac{k}{m}\right)^A.$
Therefore (\ref{e:sumprod}) implies
\begin{eqnarray*}
\rho
&\geq& \sum_{k=1}^m \frac{(k/m)^A}{\sqrt{A+m/2}}\\
&\geq& \int_{k=0}^m \frac{(k/m)^A}{\sqrt{A+m/2}}\,dk\\
&=& \frac{1}{A+1}\,\frac{m}{\sqrt{A+m/2}}.
\end{eqnarray*}
Solving for $m$ gives
\begin{eqnarray}
m 
&\leq& \frac{\rho^2\,(A+1)^2}{4} + \sqrt{\frac{\rho^4\,(A+1)^4}{16} + \rho^2\,(A+1)^2\,A}\notag\\
&\leq& \frac{\rho^2\,(A+1)^2}{4} + \rho\,(A+1)^{3/2}\sqrt{\frac{\rho^2\,(A+1)}{16} + 1}\notag\\
&\leq& \frac{\rho^2\,(A+1)^2}{4} + 
       \rho\,(A+1)^{3/2}\left(  \frac{\sqrt{\rho^2\,(A+1)}}{4} + 
                                     \frac{2}{\sqrt{\rho^2\,(A+1)}}\right)\notag\\
&=& \frac{\rho^2\,(A+1)^2}{2} + 2\,(A+1)\label{e:boundA}\\
&=& 2\rho^2\left(\frac{2}{\alpha}-1\right)^2 + \frac{8}{\alpha}-4\notag\\
&=& \frac{2\,(p-1)}{(\gamma^*)^2}\left(\frac{2}{\alpha}-1\right)^2 + \frac{8}{\alpha}-4,\notag
\end{eqnarray}
where the third inequality uses $\sqrt{x+1} \leq \sqrt{x} + \frac{1}{2\sqrt{x}}$, 
for $x > 0$. 
This proves that the number of corrections $m$ 
made by the algorithm is upper bounded as in (\ref{e:corrbound}). 
In order to show that 
this is also an upper bound on the number of trials $t$ 
such that $y_t\,\w_k\cdot\xh_t \leq (1-\alpha)\,\gamma^*$,  
it suffices to prove that $\gamma_k \geq \gamma^*$ for $k = 1,...,m$.
Recalling Figure \ref{f:1}, we see that
\begin{eqnarray*}
\gamma_k  
&=& \frac{\sqrt{8}\,\rho_k\,\gamma_k^*}{\alpha\,\sqrt{k}}\\
&\geq& \frac{\sqrt{8}\,\rho\,\gamma^*}{\alpha\,\sqrt{m}}\\
&\geq& \frac{\sqrt{8}\,\rho\,\gamma^*}{\alpha\,\sqrt{\rho^2\,\frac{(A+1)^2}{2} + 2\,(A+1) }}\\
&\geq& \frac{\gamma^*}{\alpha\,\sqrt{\frac{(A+1)^2}{16}+\frac{A+1}{4}} }\\
&=& \frac{\gamma^*}{\sqrt{1-\frac{\alpha^2}{4}}}\\
&\geq& \gamma^*,
\end{eqnarray*}
where the second inequality is (\ref{e:boundA}), the third inequality 
follows from $\rho \geq 1$ and the last equality follows from the definition of $A$.
This concludes the proof of part 1.

2.  The proof proceeds along the same lines as the proof 
of part 1. Thus we only sketch the main steps. 
Let $\u$ be an arbitrary vector in $\W$. We can write
% Recall the notation we used there. 
% {\bf say somewhere that trail t corresponds to the k-th correction}
% We have
\begin{equation}\label{e:progress2}
N_{k+1}\,\u\cdot\ttheta_{k+1} = \u\cdot\ttheta_k+\eta_k\,y_t\u\cdot\x_t,
\end{equation}
where
\begin{eqnarray*}
N_{k+1}^2 
&=& \max\{1,||\ttheta_k + \eta_k\,y_t\,\xh_t||_p^2 \}\\
&\leq& \max\{1,||\ttheta_k||_p^2 + \eta_k^2\,(p-1) + 
                        2\,(1-\alpha)\,\eta_k\,\gamma_k\\
&\leq& \max\{1,1 + \eta_k^2\,(p-1) + 
                        2\,(1-\alpha)\,\eta_k\,\gamma_k\}\\
&=&    1 + \frac{C^2 + 2\,(1-\alpha)\,B\,C}{k}\\
&=&    1 + \frac{1}{k}.
\end{eqnarray*}
From (\ref{e:progress2}) and the value of $\eta_k$ we obtain
\[
\sqrt{k+1}\,\u\cdot\ttheta_{k+1} \geq \sqrt{k}\;\u\cdot\ttheta_{k}
+ \frac{C}{\sqrt{p-1}}\,y_t\,\u\cdot\xh_t.
\]
Unwrapping, using the two inequalities $\u\cdot\ttheta_{m+1} \leq 1$ and
$y_t\u\cdot\xh_t \geq \gamma - \Dtgo$, and rearraning
yields
\[
\sqrt{m+1}\,\frac{\sqrt{p-1}}{C} + \sum_{t\in \M} \Dtgo \geq m\,\gamma,
\]
holding for any $\gamma > 0$ and any $\u \in \W$.
Solving for $m$ gives the desired inequality. This concludes the proof.\ $\BlackBox$





\iffalse
*****************************************************************
We now modify the algorithm in Figure \ref{f:1} as follows.
We parameterize $\gamma_k$ and $\eta_k$ by introducing the two positive
constants $B$ and $C$.
We set 
\begin{eqnarray}
\gamma_k &=& B\,\sqrt{p-1}\,\frac{1}{\sqrt{k}},\label{e:gamma_par}\\
\eta_k &=& \frac{C}{\sqrt{p-1}\ ||\xt||_p}\,\frac{1}{\sqrt{k}}\label{e:eta_par}.
\end{eqnarray}
We call the resulting algorithm \mdlma, 
where the prefix 'PAR' stands for 'parameterized'.
The original \lma\ in Figure \ref{f:1} 
is obtained by choosing $B = \sqrt{8}/\alpha$ and $C = \sqrt{2}$.

If a suitable relationship between the parameters $B$ and $C$ is satisfied
then we can prove a bound
on the number of corrections which holds in the nonseparable case as well.
The bound of the theorem below is in terms of the margin-based
quantity $\D_{\gamma}(\u;(\x,y)) = \max\{0,\gamma - \frac{y\,\u\cdot\x}{||\x||_p}\}$, 
$\gamma > 0$. A $p$-indexing for $\D_{\gamma}$ is understood.
$\D_{\gamma}$ is called {\em deviation} 
in \cite{fs-lmcupa-99} and {\em linear hinge loss} 
in \cite{gw-lhlam-98}. 

\begin{theorem}\label{t:main2}
Let $\X = \R^n$,
$\W =  \{\w \in \R^n\,:\, ||\w||_q \leq 1 \}$,
$S = ((\x_1,y_1),...,$ $(\x_T,y_T)) \in (\X \times \{-1,+1\})^T$.
Let the parameters $B$ and $C$ in (\ref{e:gamma_par}) and (\ref{e:eta_par})
satisfy the equation\footnote{ 
Notice that the values for $B$ and $C$ in Figure \ref{f:1} 
do not satisfy this relationship.}
$C^2 + 2\,(1-\alpha)\,B\,C = 1$.
Let $\M$ be the set of corrections of {\rm \mdlma}$(\alpha)$ running
on $S$ (i.e., the set of trials $t$ such that
$\frac{y_t\,\w_k\cdot\x_t}{||\x_t||_p} \leq (1-\alpha)\,\gamma_k$).
For any $\u \in \W$, {\rm \mdlma}$(\alpha)$
achieves the following bound on $|\M|$, holding for any  $\gamma > 0$,
where ${\displaystyle \rho^2 = \frac{p-1}{C^2\,\gamma^2}}$:
\[
|\M| \leq \frac{1}{\gamma}\sum_{t\in\M}\Dtgo + \frac{\rho^2}{2} +
\sqrt{\frac{\rho^4}{4} + \frac{\rho^2}{\gamma}\sum_{t\in\M}\Dtgo +\rho^2}.
\]
Observe that when $\alpha = 1$
the above inequality turns to a bound on the number of {\em mistaken trials}.
In such a case the value of $\gamma_k$ 
(in particular, the value of $B$)
is immaterial, while the parameter $C$ is forced to be 1. 
\end{theorem}
%
{\em Proof sketch.} We proceed along the same lines as in the proof 
of Theorem \ref{t:main}. Recall the notation we used there. 
%{\bf say somewhere that trail t corresponds to the k-th correction}
We have
\begin{equation}\label{e:progress2}
N_{k+1}\,\u\cdot\ttheta_{k+1} = \u\cdot\ttheta_k+\eta_k\,y_t\u\cdot\x_t,
\end{equation}
where
\begin{eqnarray*}
N_{k+1}^2 
&=& \max\{1,||\ttheta_k + \eta_k\,y_t\,\x_t||_p^2 \}\\
&\leq& \max\{1,||\ttheta_k||_p^2 + \eta_k^2\,(p-1)\,||\x_t||_p^2 + 
                        2\,(1-\alpha)\,\eta_k\,\gamma_k\,||\x_t||_p\}\\
&\leq& \max\{1,1 + \eta_k^2\,(p-1)\,||\x_t||_p^2 + 
                        2\,(1-\alpha)\,\eta_k\,\gamma_k\,||\x_t||_p\}\\
&=&    1 + \frac{C^2 + 2\,(1-\alpha)\,B\,C}{k}\\
&=&    1 + \frac{1}{k}.
\end{eqnarray*}
From (\ref{e:progress2}) and the value of $\eta_k$ we obtain
\[
\sqrt{k+1}\,\u\cdot\ttheta_{k+1} \geq \sqrt{k}\;\u\cdot\ttheta_{k}
+ \frac{C}{\sqrt{p-1}}\,\frac{y_t\,\u\cdot\x_t}{||\xt||_p}.
\]
Unwrapping, using the inequalities $\u\cdot\ttheta_{m+1} \leq 1$ and
$\frac{y_t\u\cdot\xt}{||\xt||_p} \geq \gamma - \Dtgo$, and rearraning
yields
\[
\sqrt{m+1}\,\frac{\sqrt{p-1}}{C} + \sum_{t\in \M} \Dtgo \geq m\,\gamma,
\]
holding for any $\gamma > 0$ and any $\u \in \W$.
Solving for $m$ gives the desired inequality.\ $\BlackBox$

*******************************************************************
\fi



Some remarks are in order at this point.



\begin{remark}
It is worth emphasizing that the difference between parts 1 and 2 in
Theorem \ref{t:main} is mainly theoretical. For instance, 
the condition\ $C^2 + 2\,(1-\alpha)\,B\,C = 1$\
in Part 2 is satisfied even by 
$B = \frac{1}{2\sqrt{\alpha}}$ and 
$C = \sqrt{\alpha}$, for any
$\alpha \in (0,1]$. 
It is not hard to see that in the separable case this setting yields 
the bounds 
$|\M| \leq \frac{1+\sqrt{5}}{2}\,\frac{p-1}{(\gamma^*)^2\,\alpha}$
and $\gamma_k \geq \gamma^*/3$ for all $k$ (independent of $\alpha$). 
Hence, if we set $\alpha = 1/2$ we obtain a hyperplane whose margin on
the data is at least $\gamma^*/6$ after no more than 
$(1+\sqrt{5})\,\frac{p-1}{(\gamma^*)^2}$ corrections.
This degree of data fitting could be enough for many practical purposes.
In fact, one should not heavily rely on Theorem \ref{t:main} 
to choose the ``best'' tuning for $B$ and $C$ in \lma$(\alpha; B,C)$, 
since many of the constants occurring in the statement of that 
theorem are just an artifact of our analysis. 
% \ $\Box$
% We refer the reader to 
% the next section to see how we tuned $B$ and $C$ in our experiments. \ $\Box$
\end{remark}


\begin{remark}
When $p=2$ the computations performed by \lma\ essentially 
involve only dot products 
(recall that $p=2$ yields $q=2$ and $\f = \f^{-1}$ = identity). 
Thus the generalization of \lmatwo\ to the kernel case is quite standard
(we just replace every dot product between instances by 
a kernel dot product).
In fact, the linear combination $\w_{k+1}\cdot\x$ can be computed recursively,
since 
\[
\w_{k+1}\cdot\x = \frac{\w_k\cdot\x+\eta_k\,y_t\xh_t\cdot\x}{N_{k+1}}.
\]
Here the denominator $N_{k+1}$ equals  $\max\{1,||\w'_k||_2\}$ and the 
norm $||\w'_k||_2$ is again computed recursively by
\[
||\w'_k||_2^2 = ||\w'_{k-1}||_2^2/N_k^2 + 
2\,\eta_k\,y_t\w_k\cdot\xh_t + \eta_k^2,
\]
where the dot product
$\w_k\cdot\xh_t$ is taken from the $k$-th correction (the trial 
where the $k$-th weight update did occur), and the normalization of instances
$\xh = \x/||\x||_2$ is computed as $\xh = \x/\sqrt{\x\cdot\x}$.
% \ $\Box$
\end{remark}



\begin{remark}
\lma\ with $p > 2$ is useful when learning {\em sparse} hyperplanes, namely
those hyperplanes having only a few relevant components. 
Gentile and Littlestone (1999) observe that setting $p = 2\,\ln n$ makes 
a $p$-norm algorithm similar to a purely multiplicative algorithm
such as Winnow and the Weighted Majority algorithm (Littlestone, 1988; Littlestone and 
Warmuth, 1994).
The performance of such algorithms is ruled by a limiting pair of dual norms,
i.e., the infinity norm of instances and the 1-norm
of weight vectors.
Likewise, \lma\ with $p = 2\,\ln n$ becomes a multiplicative approximate 
maximal margin classification algorithm, where the margin is meant to be an 
$\infty$-norm margin. 
To see this, observe that 
$||\x||_p \leq n^{1/p}\,||\x||_{\infty}$ for any $\x \in \R^n$. 
Hence  $p = 2\,\ln n$ yields
$||\x||_{(2\ln n)} \leq \sqrt{e}\,||\x||_{\infty}$. Also,
$||\w||_1 \leq 1$ implies $||\w||_q \leq 1$ for any $q > 1$. 
Thus if $||\w||_1 \leq 1$
the $(2\ln n)$-norm margin $\frac{y\,\w\cdot\x}{||\x||_{(2\ln n)}}$ is actually bounded
from below by the $\infty$-norm margin  
$\frac{y\,\w\cdot\x}{||\x||_{\infty}}$ divided by $\sqrt{e}$. 
The bound in part 1 of Theorem \ref{t:main} becomes 
$|\M| = O\left( \frac{\ln n}{\alpha^2\,(\gamma^*)^2} \right)$, where 
$\gamma^* = \max_{\w\,:\,||\w||_1 \leq 1} \min_{t = 1,...,T}\, 
\frac{y_t\,\w\cdot\x_t}{||\x_t||_{\infty}}$.
The associated margin-based generalization bounds
are very similar to those obtained by classifiers based on
linear programming (Mangasarian, 1968; Anthony and Bartlett, 1999, Chap. 14).
% We tested \lma\ with $p > 2$ on sparse learning problems. The results are
% given in Section \ref{ss:lmap}. 
% $\Box$
\end{remark}


\begin{remark}
\lma\ and its analysis could be modified to handle the case when
the margin is not normalized, i.e., when the margin of hyperplane $\w$ 
on example $(\x,y)$ is defined to be just $y\,\w\cdot\x$. 
We only need to introduce a new variable, call it $X_k$,
which stores the maximal norm of the instances seen in past corrections, 
and then normalize the new instance $\x_t$ in the update rule by $X_k$.
The resulting algorithm and the corresponding analysis would be slightly more
complicated than the one we gave in Theorem \ref{t:main}.
As a matter of fact, our initial experiments with \lma-like algorithms 
were performed with the unnormalized margin version. 
Such experiments, which are not reported in this paper, show that
the unnormalized margin version of \lma\ is fairly sensitive to example ordering
(in particular, the algorithm is quite sensitive to the position of outliers in
the stream of examples).
This is one of the reasons why in this paper we only treat 
the normalized margin version.
\end{remark}


% {\bf Mention margin-based generalization bounds?? Yes! But just mention them:
% 
% -leave-one out 
% 
% -margin based of voted hypotheses (Schapire et al.)
% 
% -Littlestone's } 
 

% *********************************
% 
% This result leaves open the problem of choosing the parameters 
% $\gamma$ and $\alpha$ in \lma. The best tuning of these
% parameters is in general related to features of the learning problem which are not
% necessarily visible to the learner, such as the amount of noise in the
% data. Our final goal is to build large margin classifiers.  
% Thus, if the data are linearly separable with margin $\gamma^*$, the best setting
% seems to be $\gamma \simeq \gamma^*$ with a small $\alpha$. On the other hand,
% when the data are not separable, it is not a-priori clear how to tune the two 
% parameters. This tuning problem is (partially) solved in section \ref{s:oltb}.
% There we assume the data are separable. Hence $\alpha$ turns to be a ``user'' 
% approximation parameter. The extension to the non-separable case needs to be
% done ...
% 
% take advantage of the fact that new hyp is old + something
% 
% **********************************




\section{Experimental results}\label{s:exp}
To see how our algorithm works in practice, we tested it on 
a number of classification datasets, both real-world and artificial.
We used \lmatwo\ with kernels on the real datasets and \lma\ with $p = 2,\ 6,\ 10$
{\em without} kernels on the artificial ones.
The real-world datasets are well-known OCR benchmarks:
the USPS dataset (e.g., Le Cun et al., 1995),
the MNIST dataset\footnote{It can be downloaded from 
Y. LeCun's home page: 
http://www.research.att.com/$\sim$yann/ocr/mnist/.}, and
the UCI Letter dataset (Blake et al., 1998). The artificial datasets
consist of examples generated by some random process 
according to the rules described in Section \ref{ss:lmap}.

For the sake of comparison, we tended to follow 
previous experimental setups, such as those described by
Cortes and Vapnik (1995), Freund and Schapire (1999), 
Friess et al. (1998), Li and Long (1999) and Platt et al. (1999).
We reduced an $N$-class problem to a set of $N$ binary problems,
according to the so-called {\em one-versus-rest} scheme.
That is, we trained the algorithms once for each of the $N$ classes.
When training on the $i$-th class all the examples with label $i$ are
considered positive (labelled $+1$) and all other examples are negative
(labelled $-1$). Classification is made according to the 
maximum output of the $N$ binary classifiers.
There are many other ways of combining binary
classifiers into a multiclass classifier. 
We refer the reader to the work by Dietterich and Bakiri (1995), 
Platt et al. (1999), Allwein et al. (2000) and to references therein. 
Yet another method for facing multiclass classification (which does not
explicitly reduce to binary) is mentioned in Section \ref{s:conc}.

Our experimental results are summarized in Tables 1 through 6 
and in Figures \ref{f:5}, \ref{f:3} and \ref{f:4}.
Following Freund and Schapire (1999), the output of a binary classifier is 
based on either the {\em last} hypothesis produced by the algorithms
(denoted by ``last'' throughout this section) 
or Helmbold and Warmuth's (1995)
leave-one-out {\em voted} hypothesis (denoted by ``voted''). 
In our experiments we actually used the variant called {\em average} 
by \cite{fs-lmcupa-99}. We denote it
by ``average'' or ``avg'', for brevity. 
This variant gave rise to slight accuracy improvements
compared to ``voted''.\footnote{Freund and Schapire (1999) seem to 
use the average variant without any theoretical justification.
When using margin sensitive classification algorithms, such as 
\lma\ with $\alpha < 1$, one can 
prove a bound on the expected generalization error of ``avg'' 
by first proving a bound 
on the expected hinge loss (Gentile and Warmuth, 2001) and then applying a simple convexity 
argument \citep{kw-avegulp-97}.}
When using ``last'', the output of the $i$-th binary classifier 
on a new instance $\x$ is 
\[
{\mbox{output}}_i(\x) = \w^{(i)}_{m^{(i)}+1}\cdot\x,
\] 
where $\w^{(i)}_{m^{(i)}+1}$ 
is the last weight vector produced during training by the $i$-th classification 
algorithm (namely, after $m^{(i)}$ corrections);
when using ``voted'' or ``avg'' we need to store the sequence of prediction
vectors $\w^{(i)}_1$, $\w^{(i)}_2$, ..., $\w^{(i)}_{m^{(i)}+1}$, 
as well as the number of trials the corresponding prediction vector 
survives until it gets updated. Let us denote by 
$c^{(i)}_k$ the number of trials the $k$-th weight vector of the $i$-th
classifier survives. Then if we use ``voted''  
the output of the $i$-th classifier on instance $\x$ is 
\[
{\mbox{output}}_i(\x) = \sum_{k = 1}^{m^{(i)}+1} c^{(i)}_k\,\sign(\w^{(i)}_k\cdot\x);
\]
if we use ``avg'' the output of the $i$-th classifier on instance $\x$ is
\[
{\mbox{output}}_i(\x) = \left(\sum_{k = 1}^{m^{(i)}+1} c^{(i)}_k\,\w^{(i)}_k\right)\cdot\x.
\]
In all cases the predicted label ${\hat y}$ associated with $\x$ is
\[
{\hat y} = {\mbox{argmax}}_{i = 1\ldots N}\ {\mbox{output}}_i(\x).
\]

%We refer the reader to \cite{fs-lmcupa-99} for details.

We trained the algorithms by cycling up to 3 times (``epochs'')
over the training set. 
All the results shown in Tables 1--6 and in Figures \ref{f:5}--\ref{f:4}
are averaged over 10 random permutations of
the training sequences. 

In Tables 1--6 the columns marked ``TestErr'' 
give the fraction of misclassified examples
in the test set. The columns marked ``Correct'' (or ``Corr'', for brevity)
give the total
number of corrections occurred in the training phase for the $N$ labels
(recall that for Perceptron and \lma\ with $\alpha = 1$ 
a correction is the same as a mistaken trial).

In Figures \ref{f:5}--\ref{f:4} we plotted a number
of {\em margin distribution} graphs (Schapire et al., 1998) 
yielded when running \lma\ on various datasets. 
For binary classification tasks 
the margin distribution of a (binary) classifier $\w$ with $||\w||_q \leq 1$
is the fraction of examples $(\x,y) \in \X\times \{-1,+1 \}$ 
in the training set whose 
margin $y\,\w\cdot\xh$ is at most $s$, as a function
of $s \in [-1,+1]$. For an $N$-class problem solved via the 
one-versus-rest scheme, it is natural to define the
margin as the difference between the output of the classifier
associated with the correct label and the maximal output of any
other classifier. This value lies in $[-1,+1]$ once 
weight vectors and instances are properly normalized.
Also, the margin is positive if and only if the example
is correctly classified. 

In our experiments no special attention has been paid to 
tune scaling factors and/or noise-control parameters. 
As far as parameters $B$ and $C$ is concerned, we 
have set $C = \sqrt{2}$ (as in Theorem \ref{t:main}, part 1)
and $B = \frac{1}{\alpha}$, where $\alpha$ is chosen in the 
set $\{ 1.0,\ 0.95,\ 0.9,\ 0.8,\ 0.5 \}$.
Notice that with this parameterization the weight update 
condition $y_t\,\w_k\cdot\xh_t \leq (1-\alpha)\,\gamma_k$ in Figure \ref{f:1}
actually becomes $y_t\,\w_k\cdot\xh_t 
\leq \beta\,\frac{\sqrt{p-1}}{\sqrt{k}}$,
where $\beta \in \{ 0,\ 0.0526,\ 0.111,\ 0.25,\ 1 \}$.
Below we are using the shorthand \lma$(\alpha)$ to denote 
\lma$(\alpha;\frac{1}{\alpha},\sqrt{2})$. Thus, for example,
\lma$(1.0)$ denotes \lma$(1;1,\sqrt{2})$.


We made no preprocessing on the data (beyond the implicit preprocessing
performed by the kernels).
All our experiments have been run on a PC with 
a single Pentium$^{\circledR}$ III MMX processor running at 447 Mhz.
The running times we will be mentioning are measured on this machine.

The rest of this section describes the experiments in some detail.


%{\bf say that, as expected, the difference between voted and last
%gets smaller when alpha is shrunk!!}



% In comparing to the state of the art, we tend to emphasize
%,k-lmp-98,ksbm-99






\subsection{Experiments with USPS dataset}
The USPS (US Postal Service) dataset
has 7291 training patterns and 
2007 test patterns. Each pattern is a 16$\times$16 vector
representing a digitalized image of a handwritten digit, along
with a  \{0,1,...,9\}-valued label. 
The components of such vectors lie in $[-1,+1]$.


\begin{figure}
%\begin{minipage}{0.92\textwidth}
%
\begin{center}
\begin{tabular}{p{1.4in}|p{0.5in}p{0.6in}|p{0.5in}p{0.6in}|p{0.5in}p{0.6in}}
%\hline\hline
\ &{\em\hspace{0.25in} 1} &{\em \hspace{-0.15in}Epoch} 
  &{\em\hspace{0.25in} 2} &{\em \hspace{-0.15in}Epochs}
  &{\em\hspace{0.25in} 3} &{\em \hspace{-0.15in}Epochs}\\
\ &TestErr &Correct &TestErr &Correct &TestErr &Correct\\
%\hline
\hline
Perceptron \ \, last &6.03\%  &1148   &5.45\%    &1395      &5.30\%     &1515\\
\hspace{0.87in} avg &5.49\%  &1148   &5.24\%    &1395      &5.00\%     &1515\\
\hline
\lmatwo(1.0)\ \ \  last 
                    &6.42\% &1250   &5.62\%   &1557  &5.38\%   &1713\\
\hspace{0.87in} avg
                    &5.51\% &1250   &4.93\%   &1557  &5.06\%   &1713\\
\hline
\lmatwo(0.95)\,  last   
                    &5.72\% &1752   &5.05\%   &2087  &4.85\%   &2239\\
\hspace{0.87in} avg  
                    &5.18\% &1752    &4.68\%   &2087  &4.73\%   &2239\\
\hline
\lmatwo(0.9)\ \ \  last 
                    &5.43\% &2251  &5.06\%   &2606  &4.90\%   &2746\\
\hspace{0.87in} avg
                    &5.06\% &2251  &4.72\%   &2606  &4.77\%   &2746\\
%\hline\hline
\end{tabular}
\end{center}
\vspace{0.2in}
Table 1: Experimental results on USPS database. We used Gaussian kernels
with width $\sigma = 3.5$. ``TestErr'' denotes the 
fraction of misclassified patterns in the test set, while ``Correct''
denotes the total number of training corrections for the 10 labels. 
\lmatwo($\alpha$) is shorthand for \lmatwo$(\alpha;\frac{1}{\alpha},\sqrt{2})$.
Recall that averaging takes place during the testing phase. Thus the number of 
corrections of ``last'' is the same as the number of corrections of ``avg''.
%\end{minipage}
\end{figure}


\begin{figure}
%\begin{minipage}{0.92\textwidth}
%
\begin{center}
\begin{tabular}{p{1.24in}|m{0.29in}|p{0.29in}|p{0.29in}|p{0.29in}|p{0.29in}|p{0.29in}
                |p{0.29in}|p{0.29in}|p{0.29in}|p{0.29in}}
%\hline\hline
\hspace{0.9in} digit &\ \  0 &\ \ 1 &\ \ 2 &\ \ 3 &\ \ 4 
                     &\ \ 5 &\ \ 6 &\ \ 7 &\ \ 8 &\ \ 9\\
\hline
%\hline
\lmatwo(1.0) &&&&&&&&&&\\
{\em\  1 Epoch}\ \, \, \, Corr   &\ \,99 &\ \,46 &129 &147 &142 &156 &101 &102 &169 &159 \\
\hspace{0.82in} SV               &\ \,99 &\ \,46 &129 &147 &142 &156 &101 &102 &169 &159 \\      
{\em\  2 Epochs}\, \, \, Corr &121 &\ \,58 &164 &183 &174 &195 &127 &128 &206 &201 \\ 
\hspace{0.82in} SV             &121 &\ \,55 &159 &182 &168 &193 &126 &124 &202 &195 \\
{\em\  3 Epochs}\, \, \, Corr  &138 &\ \,67 &174 &204 &193 &208 &145 &137 &228 &219 \\
\hspace{0.82in} SV            &138 &\ \,61 &173 &197 &184 &205 &142 &132 &224 &210 \\      
\hline
\lmatwo(0.95) &&&&&&&&&&\\
{\em\  1 Epoch}\ \, \, \, Corr &149 &\ \,79 &192 &203 &190 &214 &148 &144 &226 &207 \\ 
\hspace{0.82in} SV             &149 &\ \,79 &192 &203 &190 &214 &148 &144 &226 &207 \\ 
{\em\  2 Epochs}\, \, \, Corr  &176 &\ \,94 &226 &242 &226 &255 &174 &168 &277 &249 \\ 
\hspace{0.82in} SV             &175 &\ \,89 &224 &238 &221 &253 &173 &163 &272 &243 \\
{\em\  3 Epochs}\, \, \, Corr  &186 &105 &235 &256 &243 &273 &188 &184 &297 &272 \\
\hspace{0.82in} SV             &185 &\ \,94 &233 &250 &234 &268 &184 &173 &288 &254 \\
\hline
\lmatwo(0.9) &&&&&&&&&&\\
{\em\  1 Epoch}\ \, \, \, Corr   &200 &110 &246 &255 &243 &279 &193 &182 &287 &256 \\ 
\hspace{0.82in} SV               &200 &110 &246 &255 &243 &279 &193 &182 &287 &256 \\
{\em\  2 Epochs}\, \, \, Corr    &224 &128 &284 &294 &286 &320 &224 &209 &331 &306 \\
\hspace{0.82in} SV               &223 &123 &282 &290 &280 &314 &220 &201 &323 &294 \\
{\em\  3 Epochs}\, \, \, Corr    &230 &137 &294 &312 &300 &334 &235 &226 &349 &329 \\
\hspace{0.82in} SV               &229 &126 &288 &305 &289 &328 &227 &214 &337 &309 \\
\hline               
SVM \hspace{0.38in} (SV)         &219 &\ \,91 &316 &309 &288 &340 &213 &206 &304 &250\\
%\hline\hline
\end{tabular}
\end{center}
\vspace{0.2in}
Table 2: Experimental results on USPS database. ``Corr'' denotes 
the total number of training corrections for the 10 labels, while
``SV'' denotes the number  of ``support vectors'' for the 10 labels. 
\lmatwo($\alpha$) is shorthand for \lmatwo$(\alpha;\frac{1}{\alpha},\sqrt{2})$.
Clearly, for one epoch ``Corr'' = ``SV''.
%\end{minipage}
\end{figure}


This is a well-known SVM benchmark. The accuracy results
achieved by SVM range from 4.2\% (obtained by Cortes and Vapnik, 1995, 
after suitable data smoothing) to 4.4\% (reported by Sch\"olkopf et al., 1999)
to 4.7\% (reported by Platt et al., 1999, with no data preprocessing).
The best accuracy results we are aware of are those obtained 
by Simard, et al. (1993). They yield a test error of 2.7\% by using 
a notion of distance between patterns that encodes 
specific prior knowledge about OCR problems, such as invariance to 
translation and rotation.

We ran both the Perceptron algorithm and \lmatwo.
Following Sch\"olkopf et al. (1997, 1999), Platt et al. (1999) and 
Friess et al. (1998),
we used the Gaussian kernel 
$K(\x,\y) = \exp\left(-\frac{||\x-\y||_2^2}{2\,\sigma^2} \right)$.
To choose the best width $\sigma$, we ran the Perceptron algorithm 
and \lmatwo(1.0) for one epoch.
We used 5-fold cross validation on
the training set across the range $[0.5,10.0]$ with step 0.5.
The best $\sigma$ for both algorithms turned out to be $\sigma = 3.5$.
%while the  best $\sigma$ for \lmatwo\ was  $\sigma = 3.0$.
In Tables 1 and 2 only the best ($\sigma = 3.5$)
results are displayed. 
Observe that using a Gaussian kernel makes
the normalization of instances $\xh = \x/||\x||_2$ immaterial, 
since $K(\x,\x) = 1$ for any $\x$.


Table 1 gives test error and number of corrections for the Perceptron
algorithm and \lmatwo\ with different values of $\alpha$. 
Table 2 gives statistics for \lmatwo\ on the ten digits.
Here ``Corr'' denotes the number of corrections while ``SV'' denotes the
number of ``support vectors'', i.e., the number of examples that
are actually involved in computing the prediction function 
for each of the ten classes. 
For the sake of comparison, we also give the number of support
vectors yielded by SVM, as reported by Sch\"olkopf et al. (1999).
The standard deviations related to our averages are reasonably small; 
those concerning test errors are about 0.12\%.

% These experiments show that in all cases the ``avg'' hypothesis 
% is more accurate than the ``last'' hypothesis.

On this dataset \lmatwo(1.0) and the Perceptron algorithm perform
comparably. The accuracy of \lmatwo($\alpha$) improves significantly 
when we shrink the value of $\alpha$, whereas the computed
solution gets less and less sparse. 

As in the experiments performed by Freund and Schapire (1999) and 
Li and Long (1999), the accuracy of the classifiers tends to get better 
as we increase the number of training epochs. 
However, training \lmatwo\ ``avg'' for more than two epochs seems 
to hurt performance somewhat. This might be due to the fast convergence of
this algorithm (notice that the accuracy obtained by SVM is quite close). 





We found the accuracy of 5.06\% for \lmatwo(0.9) ``avg'' 
fairly remarkable, considering
that it has been obtained by sweeping through the examples just once 
for each of the ten classes. Indeed, the algorithm
is quite fast: training for one epoch the ten binary
classifiers of \lmatwo(0.9) takes on average only 6.5 minutes.

The reader might want to compare this performance to 
the similar accuracy of 5.00\% achieved by
the ``average'' Perceptron algorithm run for three epochs.
Despite Perceptron's solution is sparser than \lmatwo(0.9)'s,
it is worth saying that running Perceptron for three epochs 
% does not give rise to an
% online algorithm. This affects running time in a significant way,
% since training Perceptron for three epochs 
takes about twice as long as training \lmatwo(0.9) for one epoch.

We plot in Figure \ref{f:5} some of the margin distribution graphs
obtained. The value of the curves at a given point $s \in [-1,+1]$
gives the fraction of patterns in the training set whose margin 
(after training) is at most $s$. 
\lmatwo(0.9) tends to increase the number of examples 
with a strictly positive margin. This is actually more evident
with the ``average'' variant (plots on the right) than with 
the ``last'' variant (plots on the left).



\begin{figure} 
\centerline{%
\begin{tabular}{c@{\hspace{0.2pc}}c}
\includegraphics[width=3.0in]{usps_margdistr_last.eps} &
\includegraphics[width=3.0in]{usps_margdistr_voted.eps} \\
``last'' & ``average''\\
\end{tabular}}
\caption{Some of the margin distribution functions yielded by 
the Perceptron algorithm and by \lmatwo, run for 1 and 3 epochs
on USPS dataset.} \label{f:5}
\end{figure}






% ********************************
% 
% There are clearly several way to improve the practical performance of
% algorithms such as \lma\ and Perceptron. We briefly mention two
% of them, though implementation issues are not the main concern of
% this paper.
% 
% \begin{itemize}
% \item
% notice big increase from $\alpha = 0.9$ to $\alpha = 0.8$ in number of mistakes
% for digits 8 and 9. Maybe for these digits $\alpha = 0.8$ is too large
% Make more comments like this...
% \item
% {\bf in the multiepoch case say that a smart choice of the points to be 
% examined would probably get improved results.}
% \end{itemize}

% There seems to be a natural trade-off between accuracy
% and sparsity, as measured by the
% number of corrections (or the number of support vectors).
% Indeed, we have obtained a better result on this dataset by
% training \lmatwo(0.9) with width $\sigma$ to 3.0

% ********************************





\subsection{Experiments with the MNIST dataset}\label{ss:mnist}
Each example in MNIST dataset is a  28$\times$28 matrix, along with
a \{0,1,...,9\}-valued label. Each entry in this matrix is 
a value in \{0,1,...,255\}, representing a grey level. 
The database has 60000 training examples and 10000 test examples.

The best accuracy results for this dataset are those obtained by
Le Cun et al. (1995) through boosting on
top of the neural net LeNet4. They reported a test error rate of 0.7\%.
A soft margin SVM achieved an error rate of 1.1\% \citep{cv-svn-95}.

In this subsection we are comparing to SVM, 
the Perceptron algorithm and the
Perceptron-like algorithm ROMMA (Li and Long, 1999).
We followed closely the experimental setting
described by Cortes and Vapnik (1995), Freund and Schapire (1999), 
Li and Long (1999). 
We used a polynomial kernel $K$ of the form 
$K(\x,\y) = (1+\x\cdot \y)^d$. At the time of writing 
the conference version of this paper \citep{g-alma-00} 
we set the degree $d$ of the kernel to 4.
According to Freund and Schapire (1999) this choice was best. 
The same choice was made by Cortes and Vapnik (1995) and Li and Long (1999).
Later on we found that significant improvements could be obtained
by a larger $d$.

We give results for \lmatwo\ with  $d = 4, 5, 6$.
We have not investigated any careful tuning of scaling factors.
In particular, we have not determined the best instance scaling factor
$s$ for our algorithm (this corresponds to using the kernel 
$K(\x,\y) = (1+\x\cdot\y/s)^d$).  
In our experiments we set $s = 255$. This was actually the
best choice made by Li and Long (1999) for the Perceptron algorithm.

The experimental results are given in Tables 3 and 4.
Table 3 gives test error and number of corrections for the Perceptron
algorithm, ROMMA and \lmatwo($\alpha$) with $\alpha = 1.0,\ 0.9,\ 0.8$.

The first four rows of Table 1 summarize some of the results obtained by 
Freund and Schapire (1999), Li and Long (1999) and Li (2000). 
The first two rows refer to the Perceptron algorithm,
%
% \footnote{
% These results have been obtained with no noise control.
% It is not clear to us how to incorporate any noise control mechanism into
% the classical Perceptron algorithm.
% The method employed in \cite{k-lmp-98,ll-romma-99} does not seem helpful 
% in this case, at least for the first epoch.} 
%
while the third and the fourth rows refer
to the original ROMMA\footnote{According to Li and Long (1999),
ROMMA's last hypothesis seems to perform better than
ROMMA's voted hypothesis.} 
and the best noise-controlled version of ROMMA, 
called ``aggressive ROMMA''.
Both the Perceptron algorithm and ROMMA 
have been run with a degree 4 polynomial kernel.
Our own experimental results are given in the subsequent rows.

Table 4 is analogous to Table 2 and refers to \lmatwo\ with a degree 4
polynomial kernel.
%
% for \lmatwo\ on the ten digits.
%Here ``Corr'' denotes the number of corrections while ``SV'' denotes the
% number of ``support vectors'', i.e., the number of examples that
% are actually involved in computing the prediction function 
% for each of the ten classes. 
% For the sake of comparison, 
%
In the last row of Table 4 we give the number of support
vectors yielded by the soft-margin SVM employed by \cite{cv-svn-95}.
Again, the standard deviations about the averages we report in these
tables are not large. Those concerning test errors range in 
(0.03\%, 0.09\%).


Among these Perceptron-like algorithms, \lmatwo\ ``avg''
seems to be the most accurate.
Again, we would like to emphasize the good accuracy
performance yielded by \lmatwo\ on the first epoch.
Notice that if $d = 6$ \lmatwo(0.9) ``avg'' gets 1.48\%.

\lmatwo\ is quite fast.
%To have an idea of running times, consider the following.
Training for one epoch the ten binary classifiers of \lmatwo(1.0)
with $d = 4$ takes on average 2.3 hours and the corresponding testing time 
is on average about 40 minutes; 
training for one epoch the ten binary classifiers of \lmatwo(0.9)
with $d = 6$ takes on average 4.3 hours, while testing takes on 
average 1.2 hours.

% As far as convergence is concerned, 

There seems to be no big difference in accuracy
between \lmatwo(0.9) and \lmatwo(0.8) when $d=4$. 
This suggested us not to run \lmatwo(0.8) with $d > 4$.
Also, we have not run \lmatwo\ for more than 3 epochs. 
But it seems reasonable to expect 
the accuracy of \lmatwo(0.9) ``avg'' and 
\lmatwo(0.8) ``avg'' to get closer and closer 
to the one achieved by SVM.

\begin{figure}
%\begin{minipage}{0.92\textwidth}
%
\begin{center}
\begin{tabular}{p{1.5in}|p{0.5in}p{0.6in}|p{0.5in}p{0.6in}|p{0.5in}p{0.6in}}
%\hline\hline
\ &{\em\hspace{0.25in} 1} &{\em \hspace{-0.15in}Epoch} 
  &{\em\hspace{0.25in} 2} &{\em \hspace{-0.15in}Epochs}
  &{\em\hspace{0.25in} 3} &{\em \hspace{-0.15in}Epochs}\\
\ &TestErr &Correct &TestErr &Correct &TestErr &Correct\\
\hline
%\hline
Perceptron \ \, last &2.71\%  &\ \,7901   &2.14\%    &10421      &2.03\%     &11787\\
\hspace{0.86in} voted &2.23\%  &\ \,7901   &1.86\%    &10421      &1.76\%     &11787\\
\hline
ROMMA (last)
                    &2.48\% &\ \,7963   &1.96\%   &\ \,9995       &1.79\%    &10971\\
agg-ROMMA (last)          
                    &2.05\% &30088  &1.76\%  &44495      &1.67\%    &58583 \\
\hline
\lmatwo(1.0) &&&&&&\\
\hspace{0.5in} $d = 4$\, \ last
                    &2.52\% &\ \,7454   &2.01\%   &\ \,9658  &1.86\%   &10934\\
\hspace{0.99in} avg
                    &1.77\% &\ \,7454   &1.52\%   &\ \,9658  &1.47\%   &10934\\
\hspace{0.5in} $d = 5$\, \ last
                    &2.40\% &\ \,7105   &1.86\%   &\ \,9048  &1.65\%   &10004\\
\hspace{0.99in} avg
                    &1.67\% &\ \,7105   &1.46\%   &\ \,9048  &1.39\%   &10004\\
\hspace{0.5in} $d = 6$\, \ last
                    &2.35\% &\ \,7001   &1.83\%   &\ \,8782  &1.67\%   &\ \,9633\\
\hspace{0.99in} avg
                    &1.64\% &\ \,7001  &1.48\%   &\ \,8782  &1.36\%   &\ \,9633\\
\hline
\lmatwo(0.9) &&&&&&\\
\hspace{0.5in} $d = 4$\, \ last 
                    &2.10\% &\ \,9911   &1.74\%   &12711  &1.64\%   &14244\\
\hspace{0.99in} avg  
                    &1.69\% &\ \,9911    &1.49\%   &12711  &1.40\%   &14244\\
\hspace{0.5in} $d = 5$\, \ last
                    &1.93\% &10373   &1.64\%   &12700  &1.49\%   &13820\\
\hspace{0.99in} avg
                    &1.59\% &10373   &1.39\%   &12700  &1.32\%   &13820\\
\hspace{0.5in} $d = 6$\, \ last
                    &1.84\% &11652   &1.53\%   &13712  &1.45\%  &14598\\
\hspace{0.99in} avg
                    &1.48\% &11652  &1.32\%   &13712  &1.27\%   &14598\\
\hline
\lmatwo(0.8) &&&&&&\\
\hspace{0.5in} $d = 4$\, \ last  
                    &1.98\% &12810  &1.72\%   &16464  &1.60\%   &18528\\
\hspace{0.99in} avg
                    &1.68\% &12810  &1.44\%   &16464  &1.35\%   &18528\\
% \hline
% SC-Relax$_2$ last, $\gamma = 0.02$
%                     &1.83 &32907 &\ \ -- &\ \ \ -- &\ \ -- &\ \ \ -- \\
% SC-Relax$_2$ avg, $\gamma = 0.02$
%                     &1.56 &32907 &\ \ -- &\ \ \ -- &\ \ -- &\ \ \ -- \\
% \hline
% SVM &1.1 &\ \ \ -- &\ \ -- &\ \ \ -- &\ \ -- &\ \ \ -- \\
%\hline\hline
\end{tabular}
\end{center}
\vspace{0.2in}
Table 3: Experimental results on MNIST database. The results have been obtained
through the polynomial kernel
$K(\x,\y) = (1+\x\cdot\y/255)^d$. The Perceptron algorithm and ROMMA use
$d = 4$, while \lmatwo\ uses $d = 4, 5, 6$. ``TestErr'' denotes the 
fraction of misclassified patterns in the test set, while ``Correct''
denotes the total number of training corrections for the 10 labels. 
% \lmatwo($\alpha$) is shorthand for \lmatwo$(\alpha,\frac{1}{\alpha},\sqrt{2})$.
% Recall that voting takes place during the testing phase. Thus the number of 
% corrections of ``last'' is the same as the number of corrections of ``avg''.
%\end{minipage}
\end{figure}


\begin{figure}
%\begin{minipage}{0.92\textwidth}
%
\begin{center}
\begin{tabular}{p{1.1in}|m{0.30in}|p{0.30in}|p{0.30in}|p{0.30in}|p{0.30in}|p{0.30in}
                |p{0.30in}|p{0.30in}|p{0.30in}|p{0.30in}}
%\hline\hline
\hspace{0.75in} digit &\ \  0 &\ \ 1 &\ \ 2 &\ \ 3 &\ \ 4 
                     &\ \ 5 &\ \ 6 &\ \ 7 &\ \ 8 &\ \ 9 \\
%\hline
\hline
\lmatwo(1.0) &&&&&&&&&&\\
{\em\  1 Epoch}\ \, \,  Corr   &\ \,441 &\ \,373 &\ \,738 &\ \,914 &\ \,715 &\ \,792 
                                         &\ \,504 &\ \,727 &1076 &1174 \\
\hspace{0.74in} SV               &\ \,441 &\ \,373 &\ \,738 &\ \,914 &\ \,715 &\ \,792 
                                         &\ \,504 &\ \,727 &1076 &1174 \\
{\em\  2 Epochs}\, \, Corr    &\ \,572 &\ \,501 &\ \,952 &1181 &\ \,915  &1016 &\ \,654 
                                         &\ \,933  &1412 &1522\\
\hspace{0.74in} SV               &\ \,549 &\ \,467 &\ \,932 &1134 &\ \,887  &\ \,993 &\ \,635 
                                         &\ \,902  &1353 &1463\\
{\em\  3 Epochs}\, \, Corr   &\ \,642 &\ \,583 &1071 &1335 &1024 &1143 &\ \,719 
                                         &1076 &1604 &1737\\ 
\hspace{0.74in} SV              &\ \,616 &\ \,517 &1016 &1237 &\ \,978 &1087 &\ \,697
                                         &\ \,995 &1481 &1616\\
\hline
\lmatwo(0.9) &&&&&&&&&&\\
{\em\  1 Epoch}\ \, \,  Corr  &\ \,611 &\ \,492 &\ \,998 &1206 &\ \,970 &1065 
                                         &\ \,695 &\ \,948 &1420 &1506 \\
\hspace{0.74in} SV              &\ \,611 &\ \,492 &\ \,998 &1206 &\ \,970 &1065 
                                         &\ \,695 &\ \,948 &1420 &1506 \\
{\em\  2 Epochs}\, \, Corr   &\ \,776  &\ \,656 &1269 &1548 &1212 &1353 &\ \,876 
                                         &1224 &1829 &1968\\
\hspace{0.74in} SV              &\ \,741  &\ \,597 &1205 &1434 &1153 &1282 &\ \,821 
                                         &1139 &1719 &1832\\
{\em\  3 Epochs}\, \, Corr   &\ \,861 &\ \,753 &1401 &1721 &1354 &1489 &\ \,963 
                                         &1384 &2073 &2245\\
\hspace{0.74in} SV              &\ \,801 &\ \,647 &1301 &1575 &1247 &1394 &\ \,896 
                                         &1252 &1857 &1986\\
\hline
\lmatwo(0.8) &&&&&&&&&&\\
{\em\  1 Epoch}\ \, \, Corr  &\ \,814 &\ \,637 &1303 &1534 &1271 &1395 &\ \,906 
                                         &1214 &1819 &1917\\
\hspace{0.74in} SV              &\ \,814 &\ \,637 &1303 &1534 &1271 &1395 &\ \,906 
                                         &1214 &1819 &1917\\    
{\em\  2 Epochs}\, \, Corr   &1023 &\ \,841 &1652 &1986 &1585 &1755 &1152 
                                         &1586 &2370 &2514\\
\hspace{0.74in} SV              &\ \,953 &\ \,742 &1528 &1817 &1476 &1627 &1064 
                                         &1423 &2134 &2273\\ 
{\em\  3 Epochs}\, \, Corr   &1140 &\ \,965 &1846 &2231 &1768 &1956 
                                      &1279 &1788 &2684 &2871\\
\hspace{0.74in} SV              &1026 &\ \,803 &1642 &1951 &1593 &1744 
                                      &1135 &1532 &2318 &2438\\
\hline
% SC-Relax$_2$ &&&&&&&&&&\\ 
% {\em 1 Epoch}&&&&&&&&&\\
% \hspace{0.1in} $\gamma = 0.02$,\ \ \ Corr 
%                                      &1816 &1408 &3421 &4234 &3188 &3614 
%                                           &1997 &2963 &5055 &5211\\
% \hspace{0.87in} SV                   &1816 &1408 &3421 &4234 &3188 &3614 
%                                           &1997 &2963 &5055 &5211\\                  
% \hline
SVM  \hspace{0.3in} (SV)       &1379 &\ \,989 &1958 &1900 &1224 &2024 &1527 
                                                       &2064 &2332 &2765\\
%\hline\hline
\end{tabular}
\end{center}
\vspace{0.2in}
Table 4: Experimental results on MNIST database, when using a polynomial kernel of
degree $d = 4$. ``Corr'' denotes 
the total number of training corrections for the 10 labels, while
``SV'' denotes the number  of ``support vectors'' for the 10 labels. 
% \lmatwo($\alpha$) is shorthand for \lmatwo$(\alpha,\frac{1}{\alpha},\sqrt{2})$.
% Clearly, for one epoch ``Corr'' = ``SV''.
%\end{minipage}
\end{figure}




\subsection{Experiments with UCI Letter dataset}
The UCI Letter dataset has 20000 patterns divided into
26 classes (the letters `A' through 'Z'). 
The instance vectors have 16 integer attributes
(statistical moments and edge counts)
extracted from raster scan images of machine 
printed letters of 20 different fonts.
The attributes have values in \{0, 1, ..., 15\}.
A standard split is to consider the first 16000 patterns 
as training set and the remaining 4000 patterns as test set.

The best accuracy results we are aware of are those obtained
by \cite{sb-bnn-00} by boosting suitable neural 
network architectures.
They reported an error rate of 1.5\%. According to Platt et al. (1999), 
a Gaussian kernel SVM achieves 2.2\%. This accuracy 
difference might actually be due to different preprocessing.

This is a dataset where Perceptron-like algorithms such as
\lmatwo\ did not work as well as we would have liked. 
We ran both the Perceptron algorithm and \lmatwo. 
Both algorithms exhibited a somewhat slow convergence. Besides, they
both failed to converge to SVM's accuracy level.
We tried to speed up convergence by using 
a ``poly-Gaussian'' kernel of the form
$K(\x,\y) = 
\left(1 + \exp\left(-\frac{||\x-\y||_2^2}{2\,\sigma^2} \right) \right)^d$.
This kernel corresponds to a linear combination of $d$ Gaussian kernels
with different width parameters $\sigma$. 
We set $d = 5$ to make the kernel flexible enough.  
Again, in order to determine the best $\sigma$, 
we ran the Perceptron algorithm and \lmatwo(1.0) for one epoch, using 
4-fold cross-validation on the training set across the range 
[0.5, 10.0], with step 0.5. The best $\sigma$ for Perceptron was 4.0, 
while the best $\sigma$ for \lmatwo\ turned out to be 3.0.
The experiments are summarized in Table 5, where only the best
results are shown.

The conclusions we can draw from this table are similar to those
for Table 1 and Table 3. The main difference is that the test error
achieved by  \lmatwo\ after one epoch (\lmatwo(0.8) ``avg''
achieves 3.60\%) is 
significantly worse than SVM's (2.2\%). The accuracy of  \lmatwo\
tends to improve after the first epoch (it reaches 2.80\% 
after three epochs), but it stabilizes around 2.7\%, no
matter how many epochs one trains the algorithm for.
We observed a similar behavior with the Perceptron algorithm 
(with an even slower convergence).
This phenomenon might be due to the lack of SVM's bias term.

% the fact that, unlike SVMs', 
% the classifiers output by such algorithms do not have a bias term.

% The significant difference in accuracy between the
% first and the subsequent epochs is likely to be due to 
% the relatively large number of classes (26) in this learning
% task. 

As far as running time is concerned, 
training \lmatwo(1.0) for one epoch takes about 2.5 minutes, while 
training \lmatwo(0.8) for one epoch takes about 5.2 minutes.

% while training \lmatwo(0.8) for three epochs\footnote{
% No caching was used to store past kernel evaluations.}
% takes about 22 minutes.


\begin{figure}
%\begin{minipage}{0.92\textwidth}
%
\begin{center}
\begin{tabular}{p{1.5in}|p{0.5in}p{0.6in}|p{0.5in}p{0.6in}|p{0.5in}p{0.6in}}
%\hline\hline
\ &{\em\hspace{0.25in} 1} &{\em \hspace{-0.15in}Epoch} 
  &{\em\hspace{0.25in} 2} &{\em \hspace{-0.15in}Epochs}
  &{\em\hspace{0.25in} 3} &{\em \hspace{-0.15in}Epochs}\\
\ &TestErr &Correct &TestErr &Correct &TestErr &Correct\\
\hline
%\hline
Perceptron\ \ \, last &6.18\%  &\ \,5010   &4.50\%    &\ \,6131      &4.15\%     &\ \,7001\\
\hspace{0.87in}  avg  &4.83\%  &\ \,5010   &3.70\%    &\ \,6131      &3.33\%     &\ \,7001\\
\hline
\lmatwo(1.0)\ \ \ last   &7.00\%  &\ \,5484   &4.92\%    &\ \,6685   &4.45\%     &\ \,7194\\
\hspace{0.87in} avg    &4.82\%  &\ \,5484   &3.87\%    &\ \,6685   &3.47\%     &\ \,7194\\
\hline
\lmatwo(0.9)\ \ \ last   &4.90\% &\ \,8312   &3.85\%   &\ \,9644  &3.50\%   &10178\\
\hspace{0.87in} avg    &3.85\% &\ \,8312   &3.10\%   &\ \,9644  &3.02\%   &10178\\
\hline
\lmatwo(0.8)\ \ \ last   &4.20\% &11258  &3.55\%   &13003  &3.27\%   &13673\\
\hspace{0.87in}   avg  &3.60\% &11258  &2.97\%   &13003  &2.80\%   &13673\\
%\hline\hline
\end{tabular}
\end{center}
\vspace{0.2in}
Table 5: Experimental results on UCI Letter database. We used a ``poly-Gaussian'' 
kernel (see main text). ``TestErr'' denotes the 
fraction of misclassified patterns in the test set, while ``Correct''
denotes the total number of training corrections for the 26 labels. 
% \lmatwo($\alpha$) is shorthand for \lmatwo$(\alpha,\frac{1}{\alpha},\sqrt{2})$.
% Recall that voting takes place during the testing phase. Thus the number of 
% corrections of ``last'' is the same as the number of corrections of ``avg''.
%\end{minipage}
\end{figure}



% It is conceivable that different values of $\alpha$ for the 10 classifiers gets
% improved results.


% {\bf beware!! In this experiments we used the constant 1 under the sqrt of
% the expression for $\gamma_k$ instead of 8!! This is due to the fact that the constant
% 8 is just an artifact of our analysis. Anyway this change of constant is equivalent
% to the choice of a different value for $\alpha$.}





\subsection{Experiments with \lma\ on artificial datasets}\label{ss:lmap}
We tested \lma, $p \geq 2$, without kernels 
on medium-size artificial datasets. The datasets are about binary classification
tasks and have been generated at random according to the following rules.
We first generated a target vector $\u \in \{-1,0,+1 \}^{300}$, where the first $s$
components are selected independently at random in $\{-1,+1 \}$ and the
remaining $300-s$ components are 0. The value $s$ is intended as a measure
of the {\em sparsity} of target vector $\u$.  
In our experiments we set $s = 3,\ 10,\ 100,\ 300$.
We also added labelling noise with rate $\epsilon$. In our experiments
$\epsilon = 0.0,\ 0.05,\ 0.10,\ 0.15$.
For a given target vector $\u$ and a given noise rate $\epsilon$,
we randomly generated 10000 training examples and 10000 test examples.
The instance vectors $\xt$ have 300 components with values chosen in 
$[-1,+1]$. The training set is generated as follows.
We picked $\xt \in [-1,+1]^{300}$ at random.
If $\u\cdot\xt \geq 1$ then a $+1$ label is associated with
$\xt$. If $\u\cdot\xt \leq - 1$ then a $-1$ label is associated with $\xt$.
The labels so obtained are then flipped with probability $\epsilon$.
If $|\u\cdot\xt| < 1$ then $\xt$ is rejected and a new vector $\xt$ is drawn. 
The test set instances $\xt$ are again chosen at random in $[-1,+1]^{300}$, the
corresponding labels equal\footnote{
Clearly, the absence of noise in the test examples 
is not a real loss of generality here, as an independent noise rate $\epsilon$
in the test set would essentially be added to the noise-free test error rates of 
the algorithms.} $\sign(\u\cdot\xt)$. We did not force
a large margin on the test set.



On each of these $4 \times 4 = 16$ datasets we ran
\lma($\alpha$) (both ``last'' and ``avg'') for one epoch, 
with $p = 2,\ 6,\ 10$ and $\alpha = 1.0,\ 0.9,\ 0.8,\ 0.5$.
The accuracy results (test errors) are shown in Table 6.
These experiments had the purpose of investigating the behavior of
\lma\ on extreme scenarios. The differences in performance
are big and sometimes even huge.
In Table 6 we report what we believe are some of the most interesting
results. We picked 6 out of the 16 datasets.
% and we dropped the results for \lma(0.9).
The columns are marked according to the values of $\epsilon$ and $s$. 

On sparse target datasets ($s = 3$ in Table 6) \lmasix\ and \lmaten\ largely 
outperform \lmatwo. On these datasets the accuracy of all algorithms
improves as $\alpha$ is made smaller. Correspondingly, the number of corrections
(not shown in Table 6) increases. Like in the experiments of the previous 
subsections, there is a natural trade-off between the number of corrections 
the algorithms make and the accuracy of the resulting hypotheses.
To give an idea of this trade-off, we report three results: 
on the ``$\epsilon = 0.0,\ s = 3$'' dataset \lmatwo(1.0) makes on
average 142 corrections, while \lmatwo(0.5) makes 2720 
corrections;
on the same dataset,  \lmaten(1.0) makes on average only 14 corrections,
whereas \lmaten(0.5) makes 1050;
on the ``$\epsilon = 0.15,\ s = 3$'' dataset \lmaten(1.0) makes on
average 2656 corrections while \lmaten(0.5) makes on average
3594 corrections.
% \footnote{
% The relatively large number of corrections is
% obviously due to labelling noise.};
The reader might want to compare the accuracy results for these three cases.
For any given value of $\alpha$, there is essentially no difference 
between \lmaten($\alpha$)\ and \lmasix($\alpha$)
(the test errors shown in Table 6 are meaningful up to about 1\%).

On dense target datasets ($s = 300$) \lma($\alpha$) with $p = 2$ is best. 
Again, accuracy improves by shrinking $\alpha$, but it degrades as 
we increase $p$.

In Figure \ref{f:3} we plot the margin distribution graphs obtained on
4 of the 16 datasets generated. In these plots we put emphasis on
the comparison between the two extreme cases $\alpha = 1.0$ and 
$\alpha = 0.5$. 
% The two plots on the left refer to two ``$s = 3$'' datasets,
% while those on the right are about two ``$s = 300$'' datasets.
% The plots on the left appear to be more informative than those on the right,
% as the
The performance gap between \lmatwo($\alpha$) and
\lmasix($\alpha$) on the two ``$s = 3$'' datasets (plots on the left) 
is clearly reflected by the different behavior of the corresponding
margin distribution functions. The plots on the right, on the other hand,
are somewhat less informative. When learning  a dense (``$s = 300$'') 
target, a single training epoch is probably not sufficient to differentiate 
algorithms' performance through their margin properties.


\newcommand{\hb}{\hspace{6pt}}
\newcommand{\hs}{\hspace{5pt}}

\begin{figure}

%\begin{minipage}{0.92\textwidth}
%
\begin{center}
\begin{tabular}{p{1.0in}|m{0.62in}|p{0.62in}|p{0.62in}|p{0.62in}|
                         p{0.62in}|p{0.62in}}
                         &$\epsilon = 0.0$   &$\epsilon = 0.0$ 
                         &$\epsilon = 0.1$   &$\epsilon = 0.1$
                         &$\epsilon = 0.15$  &$\epsilon = 0.15$\\
                        \hspace{0.5in}dataset 
                         &$s = 3$          &$s = 300$
                         &$s = 3$          &$s = 300$
                         &$s = 3$          &$s = 300$\\
                         &(TestErr)          &(TestErr)
                         &(TestErr)          &(TestErr)
                         &(TestErr)          &(TestErr)\\
\hline
\lmatwo(1.0)&&&&&&\\     
\hspace{0.6in}last  & 11.9\% &\hb 8.3\% & 26.6\% &17.2\% & 29.6\% & 22.1\%\\
\hspace{0.6in}avg     & 10.9\% &\hb 5.0\% & 16.6\% &10.6\% & 18.7\% & 12.5\%\\         
\lmatwo(0.8)&&&&&&\\  
\hspace{0.6in}last   &\hb 7.0\% &\hb 7.8\% & 21.4\% &13.6\% & 23.7\% & 16.2\%\\   
\hspace{0.6in}avg     &\hb 4.9\% &\hb 4.4\% & 11.5\% &\hb 8.0\% & 14.0\% &\hb 9.7\%\\ 
\lmatwo(0.5)&&&&&&\\  
\hspace{0.6in}last   &\hb 4.9\% &\hb 9.0\% & 12.7\% &11.5\% & 15.0\% & 13.6\%\\ 
\hspace{0.6in}avg    &\hb 2.5\% &\hb 4.9\% &\hs 5.4\% &\hb 7.1\% &\hb 7.1\% &\hb 8.2\%\\    
\hline
\lmasix(1.0)&&&&&&\\       
\hspace{0.6in}last   &\hb 9.6\% & 12.0\%  & 28.4\% &18.8\% & 28.6\% & 22.5\%\\ 
\hspace{0.6in}avg    &\hb 8.5\% &\hb 9.4\% & 17.4\% &14.7\% & 20.0\% & 17.1\%\\ 
\lmasix(0.8)&&&&&&\\       
\hspace{0.6in}last   &\hb 1.5\% & 12.9\% & 17.0\% &16.0\% & 19.5\% & 17.5\%\\
\hspace{0.6in}avg    &\hb 1.2\% &\hb 8.7\%  &\hs 8.8\% &12.2\% & 11.0\% & 14.5\%\\
\lmasix(0.5)&&&&&&\\
\hspace{0.6in}last   &\hb 0.5\% & 18.7\%  &\hs 5.6\% &20.7\% &\hb 7.8\% & 22.0\%\\
\hspace{0.6in}avg    &\hb 0.3\% & 15.9\% &\hs 2.2\% &18.6\% &\hb 3.1\% & 20.2\%\\ 
\hline
\lmaten(1.0)&&&&&&\\       
\hspace{0.6in}last   & 10.7\% & 13.9\%   & 26.1\% &21.9\% & 30.2\% & 24.9\%\\
\hspace{0.6in}avg    &\hb 8.3\% & 14.1\% & 16.9\% &18.0\% & 18.8\% & 19.8\%\\ 
\lmaten(0.8)&&&&&&\\       
\hspace{0.6in}last   &\hb 1.2\% & 15.3\% & 16.1\% &19.3\% & 18.3\% & 21.0\%\\
\hspace{0.6in}avg    &\hb 0.9\% & 13.8\% &\hs 7.4\% &16.9\% &\hb 9.3\% & 19.0\%\\
\lmaten(0.5)&&&&&&\\  
\hspace{0.6in}last   &\hb 0.8\% & 25.7\% &\hs 4.6\% &27.3\% &\hb 6.8\% & 28.6\%\\
\hspace{0.6in}avg    &\hb 0.5\% & 25.0\% &\hs 1.3\% &26.2\% &\hb 1.9\% & 26.9\%\\
\end{tabular}
\end{center}
\vspace{0.2in}
Table 6: Results of experiments on artificially generated datasets. 
Recall that $\epsilon$ denotes the amount of labelling noise, while
$s$ is the number of nonzero components of target vector $\u$.
%\end{minipage}
\end{figure}



In all our experiments the average hypothesis vector ``avg'' is 
substantially more accurate than ``last''. This tends to be even more
evident on the noisy datasets ``$\epsilon = 0.15$''.
Again, such a big performance difference after just one training epoch is 
hardly explained via a margin analysis.
Figure \ref{f:4} contains a somewhat disappointing attempt to relate
the performance gap between ``last'' and ``avg'' to different margin
properties. 
A better theoretical explaination for the resistance of an ``avg''-like
classifier to labelling noise is provided by \cite{s-pluw-99}.






\begin{figure} 
\centerline{%
\begin{tabular}{c@{\hspace{0.2pc}}c}
\includegraphics[width=3.0in]{s3_eps00.eps} &
\includegraphics[width=3.0in]{s300_eps00.eps} \\
{\small Noise rate $\epsilon = 0.0$, sparsity $s = 3$. }
& {\small Noise rate $\epsilon = 0.0$, sparsity $s = 300$.} \\ \ \\
\includegraphics[width=3.0in]{s3_eps010.eps} &
\includegraphics[width=3.0in]{s300_eps010.eps} \\
{\small Noise rate $\epsilon = 0.1$, sparsity $s = 3$.} 
& {\small Noise rate $\epsilon = 0.1$, sparsity $s = 300$.} \\ 
\end{tabular}}
\caption{Margin distributions yielded by \lma$(\alpha)$ ``avg'' with 
$p = 2, 6$ and $\alpha = 1.0, 0.5$, run for one epoch on 4 of the
16 datasets generated.} \label{f:3}
\end{figure}



\begin{figure} 
\centerline{%
\begin{tabular}{c@{\hspace{2.3pc}}c}
\includegraphics[width=2.4in]{last_vs_voted_s3_e00_p2_a050.eps} &
\includegraphics[width=2.4in]{last_vs_voted_s3_e00_p6_a1.eps} \\
{\small Sparsity $s = 3$, noise rate $\epsilon = 0.0$.} 
& {\small Sparsity $s = 3$, noise rate $\epsilon = 0.0$.} \\[0.08in]
\includegraphics[width=2.4in]{last_vs_voted_s300_e00_p2_a050.eps} &
\includegraphics[width=2.4in]{last_vs_voted_s300_e00_p6_a050.eps} \\
{\small Sparsity $s = 300$, noise rate $\epsilon = 0.0$.} 
& {\small Sparsity $s = 300$, noise rate $\epsilon = 0.0$.} \\[0.08in]
%
\includegraphics[width=2.4in]{last_vs_voted_s3_e010_p6_a1.eps} &
\includegraphics[width=2.4in]{last_vs_voted_s3_e010_p6_a050.eps} \\
{\small Sparsity $s = 3$, noise rate $\epsilon = 0.1$. }
& {\small Sparsity $s = 3$, noise rate $\epsilon = 0.1$.} \\[0.08in] 
\includegraphics[width=2.4in]{last_vs_voted_s300_e010_p2_a050.eps} &
\includegraphics[width=2.4in]{last_vs_voted_s300_e010_p6_a050.eps} \\
{\small Sparsity $s = 300$, noise rate $\epsilon = 0.1$. }
& {\small Sparsity $s = 300$, noise rate $\epsilon = 0.1$.}
\end{tabular}}
\caption{Margin distributions yielded by \lma$(\alpha)$ with 
$p = 2, 6$ and $\alpha = 1.0, 0.5$, run for one epoch on 4 of the
16 datasets generated. The plots compare ``last'' and ``avg''
variants of \lma.}
\label{f:4}
\end{figure}


\subsection{Discussion and summary}\label{ss:disc}
Our empirical results show how accuracy and running time 
(as well as sparsity)
can be traded-off against each other in a transparent way.
In all our experiments the ``avg'' hypothesis 
significantly outperforms the ``last'' hypothesis, though
the accuracy levels of our algorithm are in general
slightly inferior to those achieved by SVM.
On the other hand, our algorithm is quite faster and easier to
implement than previous implementations of SVM, such as those
given by Platt (1998), Joachims (1998) and Friess et al. (1998).

One of the most relevant features of \lmatwo\ 
is that its approximate solution relies on fewer support vectors than 
SVM's solution.
% This turns out to be a simple consequence of the
% fact that our algorithm just {\em approximates} the ...
% margin-centered approximation criterion our algorithm is based on. 
As a matter of fact, it often happens that \lmatwo\ yields a 
good test error after just
one training epoch. This obviously makes our algorithm 
attractive for an on-line learning framework 
(notice that the $N$ binary classifiers in 
the one-versus-rest scheme can be trained ``in parallel'').
Still, a significant accuracy improvement can be observed when we run 
Perceptron-like algorithms for more than one epoch. For \lmatwo($\alpha$)
with $\alpha < 1$ this improvement might be explained by 
observing that sweeping through the training set more than once 
tends to increase the margin over the training examples
(this is also shown somewhat by the margin distributions plotted in 
Figure \ref{f:5}).
However, we do not have an explaination for this phenomenon when we
run \lmatwo(1.0) or the simple Perceptron algorithm.


%Unlike Perceptron, \lmatwo($\alpha$) with $\alpha < 1$, forces a strictly
%positive margin on the training set. 

\iffalse
********************************************************************
Compared to aggressive ROMMA \cite{ll-romma-99}, 
\lmatwo\ compute classifiers which seem to be both more accurate 
and sparser (see Section \ref{ss:mnist}).
%\footnote{
%Aggressive ROMMA seems to make a large number of corrections. 
The analysis shows that in the separable case the 
two algorithms have an identical convergence speed 
(both algorithms $\alpha$-approximate the maximal margin
hyperplane after $O\left(\frac{1}{\alpha^2\,(\gamma^*)^2}\right)$ corrections).
As a matter of fact, two little differences between the two algorithms do exist,
which have significant practical implications:
\begin{enumerate} 
\item Unlike ROMMA, \lmatwo\ requires the accuracy parameter $\alpha$ 
be fixed ahead of time; if $\alpha$ is not close to zero, this makes 
\lma\ be less aggressive than
ROMMA in updating weights;\footnote{
It should actually be possible to modify aggressive ROMMA
to make it work up to an approximation parameter $\alpha$ 
fixed in advance.}
\item \lma's noise control parameter is plugged into the constant defining
the learning rate (parameter $C$ in \lma($\alpha$; $B$, $C$) ). 
On the other hand,
ROMMA's noise control mechanism is borrowed from a by now standard technique
suggested by Friess et al. \cite{fcc-98} (see also \cite{ks-95,k-lmp-98,ksbm-99}).
This method consists in employing the modified kernel function 
${\widehat K}(\x,\y) =  K(\x,\y) + \lambda\, \delta(\x,\y)$,
where $\lambda$ is a predefined positive constant and 
$\delta(\x,\y)$ is the Kronecker $\delta$ function 
($\delta(\x,\y)$ equals 1 if $\x = \y$ and is 0 otherwise).
This can be shown to be equivalent to using a quadratic 
slack penalty in the underlying SVM's cost function, 
with $\lambda$ being the trade-off parameter. 

It is not clear to us how to make use of this method when running \lma\ for
one epoch only.


\end{enumerate}

******************************************************************
\fi


Finally, the experimental results in section 4.4 seem to be significantly 
captured by our theoretical analysis (Theorem \ref{t:main}).
On sparse target datasets, the performance gap between \lmatwo($\alpha$)\ and 
\lma($\alpha$)\ with $p > 2$ is big. 
This gap tends to be magnified as $\alpha$ gets smaller.
There is again a trade-off between accuracy achieved
on the test set and number of corrections made during the training phase.
This trade-off is ruled by the interplay between the value of $p$ 
in \lma\ and the sparsity of the underlying target vector.








\section{Conclusions and open problems}\label{s:conc}
We have introduced a new incremental learning algorithm, called \lma,
which approximates the maximal $p$-norm margin hyperplane 
for a set of linearly separable data. 
\lma\ avoids quadratic or higher-order programming methods. 
Unlike previous approaches (Cortes and Vapnik, 1995; Friess et al., 1998; 
Joachims, 1998 and Platt, 1998), 
our algorithm works directly with (an approximation to)
the primal maximal margin problem, instead of its dual.
Via this approach, we are able to avoid computationally intensive
mathematical programming methods. 
\lma\ is more similar to algorithms such as ROMMA (Li and Long, 1999)
and the one analyzed by Kowalczyk (1999) and Keerthi et al. (1999). 
However, unlike those
algorithms, \lma\ remains computationally efficient when measuring the
margin through a generic norm $p$.
The theoretical properties of \lma\ have been given in Theorem \ref{t:main}.
We have proven upper bounds on the number of corrections in both the separable
and the non-separable cases.
\lmatwo\ is a Perceptron-like algorithm. All its operations essentially
involve only dot products. Hence one can replace those dot products 
by kernel dot products. 
We have tested \lmatwo\ with kernel functions on three 
OCR benchmarks and have shown that our
algorithm has very interesting accuracy levels after
just one training epoch. Compared to SVM's, \lmatwo's
approximate solution is a bit less accurate, but it is also significantly
sparser. \lmatwo's training time is substantially shorter than SVM's.
This makes our algorithm attractive for learning very large
datasets.
We have also tested \lma\ with $p \geq 2$ on artificial datasets.
Setting $p > 2$ is useful when learning sparse
target vectors (this is not rare in text processing tasks).
In such problems the practical superiority of \lma\ with $p > 2$ over \lmatwo\ 
is largely predicted by our theoretical analysis. 
%See also the argument made by Kivinen and Warmuth \cite{kw-avegulp-97}

There are many directions in which this work could be extended. In the following
we briefly discuss four of them.
\begin{enumerate}
\item 
It is not clear to us whether
the convergence analysis provided by Theorem \ref{t:main}, part 1, is
optimal. In fact, in the case when the margin $\gamma^*$ is known to the algorithm 
we can prove a bound on $|\M|$ which scales with $\alpha$ as
$\sfrac{1}{\alpha}\,\ln\sfrac{1}{\alpha}$ (instead of
$\sfrac{1}{\alpha^2}$ occurring in both the bound of Theorem \ref{t:main} and 
in the analysis by Kowalczyk, 1999, and Li and Long, 1999). 
We should stress that when $\gamma^*$ is known a direct on-line
analysis based on Bregman divergences (such as the one performed by 
Auer et al., 2001) still gives dependence $\sfrac{1}{\alpha^2}$.
This is actually the reason why in the proof of Theorem \ref{t:main} 
we do not use a Bregman divergence measure of progress.

\item 
We would like to see if a variant of \lma\ exists which 
computes after a finite number of steps an $\alpha$-approximation to the 
fixed (non-zero) threshold maximal margin hyperplane. This is an important
question, as a relevant practical drawback of our algorithm is its inability to
handle SVM's bias term. 


\item 
Running Perceptron-like algorithms, such as Freund and Schapire's voted
Perceptron algorithm, Li and Long's ROMMA, and our \lma, for more than
one epoch tends to increase performance. This phenomenon 
does not seem to fully depend on the margin properties of the algorithms.
We would like to gain a deep theoretical understanding of 
this experimental evidence.

\item 
The way we handle multiclass classification problems is to reduce
to a set of binary problems. As a matter of fact, 
natural multiclass versions Perceptron-like algorithms do exist
(e.g., Duda and Hart, 1973, Chap. 5).
As in the one-versus-rest scheme, these algorithms associate one weight 
vector classifier with each class and predict according to the maximum output 
of these classifiers. Again, margin is defined as the difference between 
the output of the classifier associated with the correct label and the
output of any other classifier. However, only two weight vectors 
are updated within a correction. These are just the two weight vectors 
involved in computing the above margin. 
These multiclass algorithms can be motivated (Gentile and Warmuth, 2001)
within Kivinen and Warmuth's multidimensional regression framework 
(Kivinen and Warmuth, 1998).
The on-line analysis of such algorithms is a fairly straightforward 
extension of the analysis for the binary case.
We are not aware of any thorough experimental investigation of these 
multiclass on-line algorithms. It would be interesting to see
how they do compare in practice to the other multiclass methods 
available in the literature.
\end{enumerate}

%compare to ROMMA (similar theory, better performance, and also 
%romma makes more corrections) 
%and to perceptron (not margin sensitive)


\acks{
We would like to thank to Nicol\`o Cesa-Bianchi, Nigel Duffy, Dave Helmbold, Adam Kowalczyk, 
Yi Li, Nick Littlestone and Dale Schuurmans for valuable conversations and
email exchange. We would also like to thank the anonymous reviewers for 
their comments which helped us to improve the presentation of this paper.
The author has been partially supported by
%MURST, Cofin99 (Project: Stochastic Processes with Spatial Structure) and by 
ESPRIT Working Group EP 27150,
Neural and Computational Learning II (NeuroCOLT II). }

\vspace{0.3in}

\bibliographystyle{natbib}
\bibliography{jmlr_ver}

\iffalse

\begin{thebibliography}{}
 
\bibitem[ABR64]{abr-tfpf-64}
M. A. Aizerman, E. M. Braverman and  L. I. Rozonoer.
Theoretical foundations of the potential function method in pattern
recognition learning.
{\em Automation and Remote Control}, 25: 821--837, 1964.
 
\bibitem[ASS00]{ass-unify-00}
E. L. Allwein, R. E. Schapire and Y. Singer.
Reducing multiclass to binary: a unifying approach for margin
classifiers.
{\em Journal of Machine Learning Research}, 1: 113--141, 2000.

\bibitem[Ang88]{A88}
D. Angluin. Queries and concept learning. 
{\em Machine Learning},  2(4): 319--342, 1988.

\bibitem[AB99]{ab-book-99}
M. Anthony and P. Bartlett. 
{\em Neural Network Learning: Theoretical Foundations}, CMU, 1999.

\bibitem[ACBG00]{acg-ascolla-00}
P. Auer, N. Cesa Bianchi and C. Gentile.
Adaptive and self-confident on-line learning algorithms.
{\em Journal of Computer and System Sciences}, forthcoming.
Preliminary version in 
{\em Proceedings of 13th Annu. Conf. on Comput. Learning Theory},
pages 107--117, Palo Alto, CA, 2000.

%\bibitem[BAK90]{bak-90}
%J. Anlauf and M. Biehl,
%\newblock The adatron: an adaptive perceptron algorithm.
%\newblock {\em Europhysics Letters}, 10, 7:687--692, 1989.

\bibitem[BKC98]{uci}
C. Blake, E. Keogh and  C. Merz.
UCI repository of machine learning databases. Dept. of Information
and Computer Sciences, University of California, Irvine.\\
http://www.ics.uci.edu/$\sim$mlearn/MLRepository.html, 1998.

\bibitem[Blo62]{b-pmbf-62}
H. D. Block.
The perceptron: A model for brain functioning.
{\em Reviews of Modern Physics}, 34: 123--135, 1962.

%\bibitem[Bre67]{b-trhfcpc-67}
%L.M. Bregman.
%\newblock The relaxation method of finding the common point of convex sets and
%  its application to the solution of problems in convex programming.
%\newblock {\em USSR Computational Mathematics and Physics}, 7:200--217, 1967.

%\bibitem[CBFH{$^+$}97]{cfhhsw-huea-97}
%Cesa-Bianchi, N., Freund, Y., Haussler, D., Helmbold, D. P., 
%Schapire, R. E., \& Warmuth, M. K. (1997).
%How to use expert advice.
%{\em Journal of the ACM}, 44:3, 427--485.

%\bibitem[CL81]{b-cl-airamic-81}
%Y.~Censor and A.~Lent.
%\newblock An iterative row-action method for interval convex programming.
%\newblock {\em Journal of Optimization Theory and Applications},
%  34(3):321--353, July 1981.

\bibitem[CV95]{cv-svn-95}
C. Cortes and V. Vapnik.
Support-vector networks.
{\em Machine Learning}, 20(3): 273--297, 1995.

\bibitem[CST00]{cst-book-00}
N. Cristianini and J. Shawe-Taylor.
{\em An introduction to support vector machines and other kernel-based 
learning methods}, Cambridge University Press, 2000.

%\bibitem[Csi91]{c-wlsme-91}
%I. Csiszar.
%\newblock Why least squares and maximum entropy? An axiomatic approach for
%linear inverse problems.
%\newblock {\em The annals of Statistics},
%  19(4):2032--2066, 1991.

\bibitem[DKR97]{dkr-emnlp-97}
I. Dagan, Y. Karov and D. Roth.
Mistake-Driven Learning in Text Categorization. 
In {\em Proceedings of 2nd Conference on Empirical Methods in Natural Language Processing}, 
pages 55--63, Association for Computational Linguistics, 
Somerset, New Jersey, 1997.

\bibitem[DB95]{db-ecoc-95}
T. G. Dietterich and G. Bakiri.
Solving multiclass learning problems via error-correcting output codes.
{\em Journal of Artificial Intelligence Research}, 2: 263--286, 1995.

\bibitem[DH73]{dh-pcsa-73}
R.~O. Duda and P.~E. Hart.
\newblock {\em Pattern Classification and Scene Analysis},
\newblock Wiley, New York, 1973.

\bibitem[FS99]{fs-lmcupa-99}
Y. Freund and R. E. Schapire.
Large margin classification using the perceptron algorithm.
{\em Journal of Machine Learning}, 37(3): 277--296, 1999.

\bibitem[FCC98]{fcc-98}
T.-T. Friess, N. Cristianini and  C. Campbell.
The kernel adatron algorithm: 
a fast and simple learning procedure for support vector machines. 
In {\em Proceedings of 15th International Conference in Machine Learning}, 
pages 188--196, Morgan Kaufmann, San Mateo, CA, 1998.

\bibitem[GL99]{gl-99}
C. Gentile and N. Littlestone.
The robustness of the $p$-norm algorithms.
In {\em Proc. 12th Annu. Conf. on Comput. Learning Theory},
pages 1--11, ACM, 1999.

\bibitem[GW01]{gw-lhlam-98}
C. Gentile and M. K. Warmuth.
Linear Hinge Loss and Average margin.
Unpublished manuscript. Preliminary version in
{\em Proc. Advances in Neural Information Processing Systems 11},
pages 225--231, MIT Press, Cambridge, MA, 1999.

\bibitem[G00]{g-alma-00}
C. Gentile.
A new approximate maximal margin classification algorithm.
In {\em T. K. Leen, T. G. Dietterich, and V. Tresp editors, 
Advances in Neural Information Processing Systems 13},
pages 500--506, MIT Press, Cambridge, MA, 2001.


% \bibitem[GHW00]{ghw-fmts-00}
% T. Graepel, R. Herbrich, and R. C. Williamson. 
% From Margin to Sparsity. 
% In {\em 13th NIPS}, to appear.

\bibitem[GR96]{gr-spell-96}
A. R. Golding and D. Roth. 
Applying Winnow to Context-Sensitive Spelling Correction , 
In {\em Proceedings of 13th International Conference in Machine Learning}, pages 182--190, 
Morgan Kaufmann, San Mateo, CA, 1996.

\bibitem[GLS97]{gls-gcrfldu-97}
A. J. Grove, N. Littlestone and D. Schuurmans.
General convergence results for linear discriminant updates.
{\em  Journal of Machine Learning}, 43(3): 173--210. 

\bibitem[HW95]{hw-wl-95}
D. P. Helmbold and M. K. Warmuth.
On weak learning.
{\em Journal of Computer and System Sciences}, 50(3), 551--573, 1995.

\bibitem[J98]{j-svml-98}
T. Joachims.
Making large-scale support vector machines learning practical.
In {\em B. Scholkopf, C. Burges and A. Smola (eds.):
Advances in kernel methods: support vector machines}. 
MIT Press, Cambridge, MA, 2000.

\bibitem[KS$^+$99]{ksbm-99}
S. S. Keerthi, S. K. Shevade,  C. Bhattacharyya and K.R.K. Murthy.
A fast iterative nearest point algorithm for support vector machine classifier
design. Technical Report ISL-99-03, Indian Institute of Science, 1999.

\bibitem[KW97]{kw-avegulp-97}
J. Kivinen and M. K. Warmuth.
Additive versus exponentiated gradient updates for linear prediction.
{\em Information and Computation}, 132(1): 1--64, 1997.

\bibitem[KW98]{kw-rlbmrp-97}
J. Kivinen and M. K. Warmuth.
Relative loss bounds for multidimensional regression problems.
{\em Journal of Machine Learning}, forthcoming. 
Preliminary version in 
{\em Proc. Advances in Neural Information Processing Systems 10}, 
pages 287--293, MIT Press, Cambridge, MA, 1998.

\bibitem[KWA97]{kwa-paw-98}
J. Kivinen, M. K. Warmuth and P. Auer.
\newblock The perceptron algorithm vs. winnow: linear vs. logarithmic mistake
  bounds when few input variables are relevant.
\newblock {\em Artificial Intelligence}, 97, 325--343, 1997.

%\bibitem[KS95]{ks-95}
%Klasner, N., \& Simon, H. U. (1995).
%From Noise-Free to Noise-Tolerant and from On-line to Batch Learning.
%In {\em Proc. 8th Annu. Conf. on Comput. Learning Theory} (pp. 250--257). 
%ACM.

\bibitem[K98]{k-lmp-98}
A. Kowalczyk.
Maximal margin perceptron.
In {\em Smola, Bartlett, Scholkopf, and Schuurmans editors, 
Advances in large margin classifiers},
MIT Press, Cambridge, MA, 1999.

\bibitem[LCBD$^+$89]{lc-bahzcr-89}
Y. Le Cun, B. Boser, J. S. Denker, D. Henderson, R. E. Howard, W. Hubbard,
and L. J. Jackel.
Backpropagation applied to handwritten zip code recognition.
{\em Neural Computation}, 1: 541--551, 1989.

\bibitem[LCJB$^+$95]{lc-mnist-95}
Y. Le Cun, L. J. Jackel, L. Bottou, A. Brunot, C. Cortes,  
J. S. Denker, H. Drucker, I. Guyon, U. Muller, S. Sackinger,  
P. Simard and  V. Vapnik.
Comparison of learning algorithms for handwritten digit recognition.
In {\em Proceedings of ICANN 1995}, pages 53--60, 1995.

\bibitem[Li00]{l-thesis-00}
Y. Li.
{\em From support vector machines to large margin classifiers}.
Ph.D. thesis, School of Computing, National University of Singapore, 2000.

\bibitem[LL99]{ll-romma-99}
Y. Li and P. Long.
The relaxed online maximum margin algorithm.
{\em Journal of Machine Learning}. To appear.
Preliminary version 
in {\em S. A. Solla, T. K. Leen and K. R. Muller editors, Advances in Neural
Information Processing Systems 12}, pages 498--504,
MIT Press, Cambridge, MA, 2000.

\bibitem[L88]{l-liaanla-88}
N. Littlestone.
Learning quickly when irrelevant attributes abound: 
{A} new linear-threshold algorithm.
{\em Machine Learning}, 2: 285--318, 1988.

%\bibitem[Lit89]{l-obl-89}
%N.~Littlestone.
%\newblock From on-line to batch learning.
%\newblock In {\em 2nd COLT}, 269--284, 1989.

%\bibitem[Lit89]{l-mbllla-89}
%N.~Littlestone.
%\newblock {\em Mistake Bounds and Logarithmic Linear-threshold Learning
%  Algorithms}.
%\newblock PhD thesis, Technical Report UCSC-CRL-89-11, University of California
%  Santa Cruz, 1989.

%\bibitem[Lit91]{l-rnaaelt-91}
%N.~Littlestone.
%\newblock Redundant noisy attributes, attribute errors, and linear threshold
%  learning using {W}innow.
%\newblock In {\em Proc. 4th Annu. Workshop on Comput. Learning Theory}, pages
%  147--156, San Mateo, CA, 1991. Morgan Kaufmann.

\bibitem[LW94]{lw-wma-94}
N. Littlestone and  M.~K. Warmuth.
\newblock The weighted majority algorithm.
\newblock {\em Information and Computation}, 108(2): 212--261, 1994.

%\bibitem[LM97]{lm-aarw-97}
%N. Littlestone and C. Mesterharm,
%An Apobayesian Relative of Winnow,
%{\em Advances in Neural Information Processing Systems}, Morgan Kaufmann, 1997.

%\bibitem[LM99]{LMinprogress}
%N. Littlestone and C. Mesterharm,
%A Simulation Study of Winnow and Related Learning Algorithms
%(unpublished manuscript), 1999.

\bibitem[M97]{m-mpdm-97}
O. Mangasarian.
Mathematical programming in data mining.
{\em Data Mining and Knowledge Discovery}, 42(1): 183--201, 1997.

\bibitem[M68]{m-msmps-68}
O. Mangasarian.
Multi-surface method of pattern separation.
{\em IEEE Trans. on Information Theory}, 14, 801--807, 1968.

\bibitem[NNS93]{nns-93}
P. Nachbar, J. A. Nossek and  J. Strobl.
The generalized adatron algorithm.
In {\em Proceedings of 1993 IEEE ISCAS}, pages 2152--2155, 1993.

\bibitem[Nov62]{n-cpp-62}
A.~B.~J. Novikov.
On convergence proofs on perceptrons.
In {\em Proc. of the Symposium on the Mathematical Theory of
Automata, vol. XII}, pages 615--622, 1962.

\bibitem[OFG97]{ofg-97}
E. Osuna, R. Freund and F. Girosi.
An improved training algorithm for support vector machines.
In {\em Proceedings of IEEE NNSP'97}, 1997.

\bibitem[Pl98]{p-98}
J. C. Platt.
Fast training of support vector machines using sequential
minimal optimization.
In {\em Sch\"olkopf, Burges and Smola editors, 
Advances in kernel methods: support vector machines}, 
MIT Press, Cambridge, MA, 1998.

\bibitem[PCST99]{pcst-dags-99}
J. C. Platt, N. Cristianini and J. Shawe-Taylor.
Large margin DAGs for multiclass classification.
In {\em S. A. Solla, T. K. Leen and K. R. Muller editors, Advances in Neural
Information Processing Systems 12}, pages  547--553,
MIT Press, Cambridge, MA, 1999.

%\bibitem[Roc70]{r-ca-70}
%R. Rockafellar. {\em Convex Analysis}, Princeton University press, 1970.

\bibitem[R62]{r-pn-62}
F. Rosenblatt.
{\em Principles of neurodynamics: Perceptrons and the theory of
brain mechanisms}, Spartan Books, Washington, D.C., 1962.

\bibitem[SF$^+$98]{sfbs-bm-98}
R. E. Schapire, Y. Freund, P. Bartlett and W. S. Lee.
Boosting the margin: A new explanation for the effectiveness of voting methods. 
{\em The Annals of Statistics}, 26(5): 1651--1686, 1998.

\bibitem[SMB$^+$99]{smbkmrs-ivf-99}
B. Sch\"olkopf, S. Mika, C.J.C. Burges, P. Knirsch, 
K. M\"uller, G. R\"atsch and A. Smola.
Input space vs. feature space in kernel-based methods.
{\em IEEE Trans. on Neural Network}, 10(5): 1000--1017, 1999.

\bibitem[SSB$^+$97]{ssbgnpv-svmcomp-97}
B. Sch\"olkopf, K. Sung,  C.J.C. Burges,
F. Girosi, P. Niyogi, T. Poggio and V. Vapnik.
Comparing support vector machines with gaussian kernels to
radial basis function classifiers.
{\em IEEE Trans. on Signal Processing}, 45: 2758--2765, 1997.

%\bibitem[SBS98]{sbs-adv-98}
%Sch\"olkopf, B.,  Burges, C. \&  Smola, A. (Eds.) (1998). 
%{\em Advances in kernel methods: support vector machines}.
%Cambridge, MA: MIT Press.

\bibitem[SB00]{sb-bnn-00}
H. Schwenk and Y. Bengio. 
Boosting neural networks. 
{\em Neural Computation}, 12(8), 1869--1887, 2000.

\bibitem[Se99]{s-pluw-99}
R. A. Servedio. 
On PAC Learning Using Winnow, Perceptron, and a Perceptron-like Algorithm. 
In {\em Proc. 12th Annu. Conf. on Comput. Learning Theory}, pages 296--307, 
ACM, 1999.

\bibitem[ST$^+$98]{stbwa-srmoddh-98}
J. Shawe-Taylor, P.  Bartlett, R. Williamson and M. Anthony.
Structural Risk Minimization over Data-Dependent Hierarchies.
{\em IEEE Trans. on Information Theory}, 44(5): 1926--1940, 1998.

\bibitem[SLCD93]{sld-tr-93}
P. Simard, Y. LeCun and J. Denker. 
Efficient pattern recognition using a new transformation distance. 
In {\em S. Hanson, J. Cowan, and L. Giles, editors, Advances in 
Neural Information Processing Systems, volume 5},
Morgan Kaufmann, 1993.

%\bibitem[S$^+$98]{sbss-adv-99}
%Smola, A., Bartlett, P., Scholkopf, B., \& Schuurmans, D. (Eds.) (1999).
%{\em Advances in large margin classifiers}.
%Cambridge, MA: MIT Press.

\bibitem[V98]{v-book-98}
V. Vapnik.
{\em Statistical learning theory}. J. Wiley \& Sons, New York, 1998.

%\bibitem[WH60]{wh-asc-60}
%B. Widrow and M. E. Hoff, Adaptive switching circuits.
%In {\em 1960 IRE WESCON Conv. Record, Part 4}, pages 96--104, 1960.

\end{thebibliography}

\fi


\end{document}








