\documentclass[twoside,11pt]{article}

% Any additional packages needed should be included after jmlr2e.
% Note that jmlr2e.sty includes epsfig, amssymb, natbib and graphicx,
% and defines many common macros, such as 'proof' and 'example'.
%
% It also sets the bibliographystyle to plainnat; for more information on
% natbib citation styles, see the natbib documentation, a copy of which
% is archived at http://www.jmlr.org/format/natbib.pdf

\usepackage{jmlr2e}
\input{prel1}

%\documentstyle[jmlr2e,11pt]{article}
%\documentstyle[epsfig,times,jmlr,twoside,11pt,theapa]{article}
% \documentstyle[epsfig,amstex]{jmlr2e}

\iffalse
	$Header: /CS/phd/kobics/tex/MCSVM/RCS/paper.tex,v 1.34 2001/12/22 21:02:25 singer Exp $
\fi

\jmlrheading{2}{2001}{265-292}{03/01}{12/01}{Koby Crammer and Yoram Singer}

\ShortHeadings{Multiclass Kernel-based Vector Machines}{Crammer and Singer}

\firstpageno{265}

\begin{document}

\title{On the Algorithmic Implementation of \\
Multiclass Kernel-based Vector Machines}

\author{
\name \hspace{-0.05in}Koby Crammer
\email kobics@cs.huji.ac.il\\
% \addr School of Computer Science \& Engineering\\
% Hebrew University, Jerusalem \ 91904, \ Israel
% \AND
\name Yoram Singer
\email singer@cs.huji.ac.il \\
\addr School of Computer Science \& Engineering\\
Hebrew University, Jerusalem \ 91904, \ Israel
}

\editor{Nello Cristianini, John Shawe-Taylor and Bob Williamson}

\maketitle

\begin{abstract}%
In this paper we describe the algorithmic implementation of multiclass
kernel-based vector machines. Our starting point is a generalized
notion of the margin to multiclass problems. Using this notion we
cast multiclass categorization problems as a constrained optimization
problem with a quadratic objective function. Unlike most of previous
approaches which typically decompose a multiclass problem into multiple
independent binary classification tasks, our notion of margin yields
a direct method for training multiclass predictors. By using the
dual of the optimization problem we are able to incorporate kernels
with a compact set of constraints and decompose the dual problem into
multiple optimization problems of reduced size. We describe an efficient
fixed-point algorithm for solving the reduced optimization problems and
prove its convergence. We then discuss technical details that yield
significant running time improvements for large datasets. Finally,
we describe various experiments with our approach comparing it to
previously studied kernel-based methods.  Our experiments indicate that
for multiclass problems we attain state-of-the-art accuracy.
\end{abstract}

\begin{keywords}
Multiclass problems, SVM, Kernel Machines
\end{keywords}



\section{Introduction}
Supervised machine learning tasks often boil down to the problem of
assigning labels to instances where the labels are drawn from a finite
set of elements. This task is referred to as multiclass learning.
Numerous specialized algorithms have been devised for multiclass
problems by building upon classification learning
algorithms for binary problems, i.e., problems in which the set of
possible labels is of size two. Notable examples for multiclass
learning algorithms are the multiclass extensions for decision tree
learning~\citep{BreimanFrOlSt84,Quinlan93} and various specialized
versions of boosting
such as AdaBoost.M2 and AdaBoost.MH~\citep{FreundSc97,SchapireSi99}.
However, the dominating approach for solving multiclass problems 
using support vector machines has been based on reducing a single
multiclass problems into multiple binary problems. For instance,
a common method is to build a set of binary classifiers
where each classifier distinguishes between one of the labels
to the rest. This approach is a special case of using output codes
for solving multiclass problems~\citep{DietterichBa95}.
However, while multiclass learning using output codes provides
a simple and
powerful framework it cannot capture correlations between the different
classes since it breaks a multiclass problem into multiple {\em independent}
binary problems.

In this paper we develop and discuss in detail a direct approach
for learning multiclass support vector machines (SVM). SVMs have
gained an enormous popularity in statistics, learning
theory, and engineering
(see for instance~\citealp{Vapnik98,ScholkopfBuSm98,CristianiniSh00},
and the many references therein). With a few exceptions most support
vector learning algorithms have been designed for
binary (two class) problems.  A few attempts
have been made to generalize SVM to multiclass 
problems~\citep{WestonWa99,Vapnik98}. These attempts
to extend the binary case are acheived by adding 
constraints for every class
and thus the size of the quadratic optimization is proportional
to the number categories in the classification problems. The result
is often a homogeneous quadratic problem which is hard to solve and
difficult to store.

The starting point of our approach is a simple generalization of separating
hyperplanes and, analogously, a generalized notion of margins for multiclass
problems.  This notion of a margin has been employed in previous
research~\citep{AllweinScSi00} but not in the context of SVM. Using the
definition of a margin for multiclass problems we describe in
Section~\ref{main:sec} a {\em compact} quadratic optimization problem.
We then discuss its dual problem and the form of the resulting 
multiclass predictor. In Section~\ref{decomp:sec} we give a decomposition
of the dual problem into multiple small optimization problems. This
decomposition yields a memory and time efficient representation of
multiclass problems. We proceed and describe an iterative solution for
the set of the reduced optimization problems. We first discuss in
Section~\ref{choose_and_stop} the means of choosing which reduced problem
to solve on each round of the algorithm. We then discuss in
Section~\ref{small_problem} an efficient fixed-point algorithm for finding
an approximate solution for the reduced problem that was chosen. We analyze
the algorithm and derive a bound on its rate of convergence to the
optimal solution. The baseline algorithm is based on a main loop which
is composed of an example selection for optimization followed by an
invocation of the fixed-point algorithm with the example that was chosen.
This baseline algorithm can be used with small datasets but to make it practical for
large ones, several technical improvements had to be sought. We therefore devote
Section~\ref{impdet:sec} to a description of the different technical
improvements we have taken in order to make our approach applicable
to large datasets. We also discuss the running time and accuracy results
achieved in experiments that underscore the technical improvements.
In addition, we report in Section~\ref{exper:sec} the results achieved in
evaluation experiments, comparing them to previous work. Finally, we give
conclusions in Section~\ref{conc:sec}.

\paragraph{Related work} Naturally, our work builds on previous research
and advances in learning using support vector machines. The space is clearly
too limited to mention all the relevant work, and thus we refer the reader
to the books and collections mentioned above. As we have already
mentioned, the idea of casting multiclass problems as a single constrained
optimization with a quadratic objective function was proposed by
Vapnik~\citeyearpar{Vapnik98}, Weston and Watkins~\citeyearpar{WestonWa99},
Bredensteiner and Bennet~\citeyearpar{BredensteinerBe99},
and Guermeur et. al~\citeyearpar{GuermeurElPa00}.
However, the size of the resulting optimization problems devised in the above
papers is typically large and complex.
The idea of breaking a large constrained optimization
problem into small problems, where each of which employs a subset of the
constraints was first explored in the context of support vector machines
by Boser~{\em et~al.}~\citeyearpar{BoserGuVa92}. These ideas were further
developed
by several researchers (see~\citealp{Joachims98} for an overview). However,
the roots of this line of research go back to the seminal work of Lev
Bregman~\citeyearpar{Bregman67} which was further developed by Yair Censor
and colleagues (see \citealp{CensorZe97} for an excellent overview). These
ideas distilled in Platt's method, called SMO, for sequential minimal
optimization. SMO works with reduced problems that are derived from a pair
of examples while our approach employs a
single example for each reduced optimization
problem. The result is a simple optimization problem which can be
solved analytically in binary classification problems (see~\citealp{Platt98}) 
and leads to an efficient numerical algorithm (that is guaranteed to converge) in multiclass
settings. Furthermore, although not explored in this paper, it deems possible
that the single-example reduction can be used in parallel applications.
Many of the technical improvements we discuss in this paper have been
proposed in previous work.
In particular ideas such as using a working set and caching have been
described by Burges~\citeyearpar{Burges98}, Platt~\citeyearpar{Platt98},
Joachims~\citeyearpar{Joachims98}, and others. Finally, we would like to note
that this work is part of a general line of research on multiclass learning
we have been involved with. \citet{AllweinScSi00} described and analyzed a general approach
for multiclass problems using error correcting output
codes~\citep{DietterichBa95}. Building on that
work, we investigated the problem of designing good
output codes for multiclass problems~\citep{CrammerSi00}. Although the model of learning using
output codes differs from the framework studied in this paper, some of the
techniques presented in this paper build upon results from an earlier paper~\citep{CrammerSi00}. Finally, some of the ideas presented in this paper
can also be used to build multiclass predictors in {\em online} settings
using the mistake bound model as the means of analysis.
Our current research on multiclass problems concentrates on 
analogous online approaches~\citep{CrammerSi01}. 

\section{Preliminaries}
Let $S = \sample$ be a set of $m$ training examples. We assume that
each example $\vx_i$ is drawn from a domain $\X \subseteq \Re^n$ and
that each label $y_i$ is an integer from the set $\Y=\{1 \comdots k\}$.
A (multiclass) classifier is a function $H: \ispace \rightarrow \Y$ that
maps an instance $\vx$ to an element $y$ of $\Y$. In this paper we focus
on a framework that uses classifiers of the form
$$\HM(\vx) = \arg\max_{r=1}^k \{  \mr \cdot \vx \} ~ ,$$
where $\M$ is a matrix of size $k \times n$ over
$\Re$ and $\mr$ is the $r$th row of $\M$.
We interchangeably call the value of the inner-product of the $r$th row
of $\M$ with the instance $\vx$ the {\em confidence} and the {\em similarity
score} for the $r$ class. Therefore, according to our definition above,
the predicted label is the index of the row attaining the highest
similarity score with $\vx$. This setting is a generalization of linear
binary classifiers. Using the notation introduced above, linear binary
classifiers predict that the label of an instance $\vx$ is $1$ if
$\vw \cdot \vx > 0$ and $2$ otherwise ($\vw \cdot \vx \leq 0$).
Such a classifier can be implemented using a matrix of size $2\times n$
where $\rowM{1}=\vw$ and $\rowM{2}=-\vw$. Note, however, that this
representation is less efficient as it occupies twice the memory
needed. Our model becomes parsimonious when $k\geq 3$ in which
we maintain $k$ prototypes $\rowM{1}, \rowM{2} \comdots \rowM{k}$
and set the label of a new input instance by choosing the index of the most
similar row of $\M$. 

\begin{figure}[t]
\centerline{\epsfig{figure=PS/clip016.eps,width=0.9\textwidth}}
\caption{Illustration of the margin bound employed by the optimization problem.}
\label{bound:fig}
\end{figure}

Given a classifier $\HM(\vx)$ (parametrized by a matrix $\M$) and an example
$(\vx,y)$, we say that $\HM(\vx)$ misclassifies an example $\vx$
if $\HM(\vx) \neq y$.
Let $[\![ \pi ]\!]$ be $1$ if the predicate $\pi$ holds and $0$ otherwise.  
Thus, the empirical error for a multiclass problem is given by
\begin{equation} 
\costalla = \frac{1}{m} \sum_{i=1}^{m} [\![ \HM(x_i) \neq y_i ]\!] ~ .
\label{dloss:eqn}
\end{equation} 
Our goal is to find a matrix $\M$ that attains a small empirical error
on the sample $S$ and also generalizes well. Direct approaches that attempt
to minimize the empirical error are computationally
expensive (see for instance \citealp{HoffgenSi92,CrammerSi00}). 
%
Building on Vapnik's work
on support vector machines~\citep{Vapnik98}, we describe in the next section
our paradigm for finding a good matrix $\M$ by replacing the discrete
empirical error minimization problem with a quadratic optimization problem.
As we see later, recasting the problem as a minimization problem also enables
us to replace inner-products of the form $\bar{a} \cdot \bar{b}$ with kernel-based
inner-products of the form $K(\bar{a},\bar{b}) = \bar{\phi}(\bar{a}) \cdot \bar{\phi}(\bar{b})$.



\section{Constructing multiclass kernel-based predictors} \label{main:sec}

To construct multiclass predictors we replace the misclassification
error of an example, $\left([\![ \HM(x) \neq y ]\!]\right)$, with the
following piecewise linear bound,
\[
\max_r \{\rowM{r} \cdot \vx + 1 - \delta_{y,r} \} -
         \rowM{y}\cdot\vx~,\]
where $\delta_{p,q}$ is equal $1$ if $p=q$ and $0$ otherwise. 
The above bound is zero if the confidence value for the correct label 
is larger by at least one than the confidences assigned to the
rest of the labels. Otherwise, we suffer a loss which is linearly proportional
to the difference between the confidence of the correct label and the
maximum among the confidences of the other labels. A graphical
illustration of the above is given in Figure~\ref{bound:fig}. The circles
in the figure denote different labels and the correct label is plotted
in dark grey while the rest of the labels are plotted in light gray.
The height of each label designates its confidence. Three
settings are plotted in the figure. The left plot corresponds to the case
when the margin is larger than one, and therefore the bound
$\max_r \{\rowM{r} \cdot \vx + 1 - \delta_{y,r} \} - \rowM{y}\cdot\vx~$
equals zero, and hence the example is correctly classified. The
middle figure shows a case where the example is correctly classified
but with a small margin and we suffer some loss. The right plot depicts
the loss of a misclassified example.

Summing over all the examples in $S$ we get an upper bound on the
empirical loss,
\begin{equation}
\costalla \leq
\frac{1}{m}
\sum_{i=1}^m 
\left[ 
\max_r \{\mr \cdot \vx_i + 1 - \delta_{y_i,r} \} - \rowM{y_i} \cdot  \vx_i
\right] ~.
\label{bound}
\end{equation}

We say that a sample $S$ is {\em linearly separable by a multiclass machine} 
if there exists a matrix $\M$ such that the above loss is equal to zero for
all the examples in $S$, that is,
\begin{equation}
\forall i ~~\max_r \{ \mr \cdot\vx_i + 1 - \delta_{y_i,r}\} -  
 \rowM{{y_i}} \cdot\vx_i = 0 ~.   \label{all_sample_correct_conditions}
\end{equation}
Therefore, a matrix $\M$ that satisfies
Eq.~(\ref{all_sample_correct_conditions}) would also satisfy the constraints, 
\begin{equation}
\forall i,r ~~
 \rowM{{y_i}} \cdot\vx_i + \delta_{y_i,r} -  \mr \cdot\vx_i \geq 1 ~~ .
\label{margin_equations} 
\end{equation}
Define the $l_2$-norm of a matrix $\M$ to be the $l_2$-norm of the vector
represented by the concatenation of $\M$'s rows,
$ \Vert M \Vert_{2}^2 = \Vert (\rowM{1},\ldots,\rowM{k})\Vert_{2}^2 =
 	\sum_{i,j} M_{i,j}^2 ~ .  $
Note that if the constraints given by Eq.~(\ref{margin_equations}) are
satisfied, we can make the differences between $\rowM{{y_i}} \cdot\vx_i$
and $\mr \cdot\vx_i$ arbitrarily large. Furthermore, previous work on
the generalization properties of large margin DAGs~\citep{PlattCrSh00} for
multiclass problems showed that the generalization properties depend
on the $l_2$-norm of $\M$ \citep[see also][]{CrammerSi00}. We therefore
would like to seek a matrix $\M$ of a small norm that satisfies
Eq.~(\ref{margin_equations}). When the sample $S$ is linearly separable
by a multiclass machine, we seek a matrix $\M$ of the smallest norm that
satisfies Eq.~(\ref{margin_equations}). The result is the following
optimization problem,
\begin{eqnarray}
& {\displaystyle \min_{M}} &
	\half \Vert M \Vert_{2}^2  \label{optimization_problem_p_basic} \\
& \textup{subject to :} & \forall i,r \;\;\;
 \rowM{{y_i}} \cdot\vx_i  + \delta_{y_i,r} -  \mr\cdot \vx_i \geq 1 ~~ .
 \nonumber
\end{eqnarray}
Note that $m$ of the constraints for $r=y_i$
are automatically satisfied since,
\[ 
\rowM{{y_i}} \cdot\vx_i  + \delta_{y_i,y_i} - \rowM{{y_i}} \cdot \vx_i= 1 ~.
\]
This property is an artifact of the separable case. In the general case
the sample $S$ might not be linearly separable by a multiclass machine.
We therefore add slack variables $\xi_i \geq 0$ and modify
Eq.~(\ref{all_sample_correct_conditions}) to be,
\begin{equation}
\forall i \;\;\;
\max_r \{  \mr \cdot \vx_i+ 1 - \delta_{y_i,r} \} - 
 \rowM{{y_i}} \cdot \vx_i = \xi_i ~.
\label{constraints_on_xi} 
\end{equation}
We now replace the optimization problem
defined by Eq.~(\ref{optimization_problem_p_basic}) with
the following primal optimization problem,
\begin{eqnarray}
& \displaystyle
	\min_{M, \xi} &
	\half \beta \Vert M \Vert_{2}^2 + \sum_{i=1}^m \xi_i
	\label{primal_program_l2} \\
& \textup{subject to :} & \forall i,r \;\;\; 
 \rowM{{y_i}}\cdot\vx_i  + \delta_{y_i,r} -
 	\mr \cdot\vx_i \geq 1  - \xi_i ~~ .
	\nonumber 
\end{eqnarray} 
where $\beta > 0$ is a regularization constant and for $r=y_i$ the
inequality constraints become $\xi_i \geq 0$. This is an optimization
problem with ``soft'' constraints. We would like to note in passing that
it is possible to cast an analogous optimization problem with ``hard''
constraints as in \citep{Vapnik98}.

To solve the optimization problem we use the Karush-Kuhn-Tucker theorem~(see for
instance \citealp{Vapnik98,CristianiniSh00}). We add a dual set of variables,
one for each constraint and get the Lagrangian of the optimization problem,
\begin{eqnarray}
\mcal{L}(M, \xi, \eta) & = & 
\half \beta \sum_r \Vert \mr \Vert_{2}^2 +
\sum_{i=1}^m \xi_i \label{Lagrangian_l2} \\
& & + 
\sum_{i,r} \eta_{i,r} 
\left[
  \mr\cdot\vx_i - \rowM{{y_i}}  \cdot \vx_i
- \delta_{y_i,r}  + 1 - \xi_i
\right]
\nonumber \\
\textup{subject to :} && \forall i,r \;\;\; \eta_{i,r} \geq 0  ~~ . \nonumber
\end{eqnarray}
We now seek a saddle point of the Lagrangian, which would be the minimum 
for the primal variables $\{M, \xi\}$ and the maximum for the dual variables
$\eta$. To find the minimum over the primal variables we require,
\begin{eqnarray}
{\partial \over \partial \xi_i} \mcal{L} = 1 - \sum_{r} \eta_{i,r} = 0 
~~~~~
\Rightarrow 
~~~~~
\sum_{r} \eta_{i,r} = 1 ~~.\label{sum_eta_is_1}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
\end{eqnarray}
%
Similarly, for $\mr$ we require, 
\begin{eqnarray*}
\frac{\partial}{\partial \vdmatrix{r}} \mcal{L}  &=& 
\sum_i \eta_{i,r}\vx_i - \sum_{i, y_i=r} 
\underbrace{
\left(
\sum_{q} \eta_{i,q}
\right)
}_{=1}\vx_i  + \beta \vdmatrix{r} \\
  &=& 	
\sum_i \eta_{i,r}\vx_i - \sum_{i}
\delta_{y_ir} \vx_i  + \beta \vdmatrix{r} = 0 ~~ ,
\end{eqnarray*}
which results in the following form
\begin{equation}
\vdmatrix{r} = \beta^{-1} 
\left[ 
\sum_i(\delta_{y_i,r} - \eta_{i,r}) \vx_i
\right] ~. \label{value_of_mr}
\end{equation}
Eq.~(\ref{value_of_mr}) implies that the solution of the 
optimization problem given by Eq.~(\ref{optimization_problem_p_basic}) 
is a matrix $\M$ whose rows are linear combinations of the 
instances $\vx_1 \ldots \vx_m$. 
Note that from Eq.~(\ref{value_of_mr}) we get that the contribution of an 
instance $\vx_i$ to $\mr$ is $\delta_{y_i,r} - \eta_{i,r}$. 
We say that an example $\vx_i$ is a {\em support pattern} if there is a row $r$
for which this coefficient is not zero.
For each row $\mr$ of the matrix $\M$ we can partition the 
patterns with nonzero coefficients into two subsets by rewriting
Eq.~(\ref{value_of_mr}) as follows,
\[
\vdmatrix{r} = \beta^{-1} 
\left[ 
\sum_{i:y_i=r}    (1 - \eta_{i,r}) \vx_i +
\sum_{i:y_i\neq r}(-\eta_{i,r}) \vx_i
\right] ~.
\]
The first sum is over all patterns that belong to the $r$th class. Hence,
an example $\vx_i$ labeled $y_i=r$ is a support pattern only if
$\eta_{i,r} = \eta_{i,y_i} < 1$. The second sum is over the rest of the
patterns whose labels are different from $r$. In this case, an example
$\vx_i$ is a support pattern only if $\eta_{i,r} > 0$.
%
Put another way, since for each pattern $\vx_i$ the set
$\{\eta_{i,1},\eta_{i,2}\comdots\eta_{i,k}\}$ satisfies the constraints
$\eta_{i,1}\comdots\eta_{i,k} \geq 0$ and $\sum_{r} \eta_{i,r} =1$, 
each set can be viewed as a probability distribution over the labels
$\{1 \ldots k\}$.  Under this probabilistic interpretation an example
$\vx_i$ is a support pattern if and only if its corresponding distribution
is {\em not} concentrated on the correct label $y_i$. Therefore, the
classifier is constructed using patterns whose labels are uncertain;
the rest of the input patterns are ignored. 

Next, we develop the Lagrangian using only the dual variables 
by substituting Eqs.~(\ref{sum_eta_is_1}) and (\ref{value_of_mr}) into
Eq.~(\ref{Lagrangian_l2}).  Since the derivation is rather technical
we defer the complete derivation to App.~\ref{calculations}.
We obtain the following objective function of the dual program,
\begin{eqnarray}
\mcal{Q}(\eta) & = & 
- \half \beta^{-1} \sum_{i,j} (\vx_i \cdot \vx_j) 
\left[\sum_r (\delta_{y_i,r} - \eta_{i,r})(\delta_{y_j,r} - \eta_{j,r})\right] 
- \sum_{i,r} \eta_{i,r} \delta_{y_i,r} ~.\nonumber
\end{eqnarray}
Let $\ve_i$ be the vector whose components are all zero except for the 
$i$th component which is equal to one, and let $\ve$ be the vector whose 
components are all one. Using this notation we can rewrite the dual
program in the following vector form,
\begin{eqnarray}
& {\displaystyle \max_{\eta}} & \mcal{Q}(\eta) 
= -\half \beta^{-1} \sum_{i,j} 
\left(
\vx_i \cdot \vx_j
\right)
\left[
(\ve_{y_i} - \veta_{i}) \cdot  (\ve_{y_j} - \veta_{j})
\right] 
- \sum_{i} \veta_{i} \cdot  \ve_{y_i} \label{dual_program_l2}\\
&\textup{subject to :} & \forall i ~:~~
	\veta_{i} \geq 0 \quad\textup{and}\quad
	\veta_{i} \cdot \ve = 1   ~~ . \nonumber
\end{eqnarray}

It is easy to verify that $\mcal{Q}(\eta)$ is concave in $\eta$.
Since the set of constraints is convex, there is a unique maximum value
of $\mcal{Q}(\eta)$.
To simplify the problem we now perform the following change of variables.
Let $\vtau_i =  \ve_{y_i} - \veta_{i}$ be the difference between 
the point distribution $\ve_{y_i}$ concentrating on the correct label and 
the distribution $\veta_i$ obtained by the optimization problem. 
Then Eq.~(\ref{value_of_mr}) that describes the form of $\M$ becomes, 
\begin{equation} 
\vdmatrix{r} = \beta^{-1} \sum_i \tau_{i,r}  \vx_i ~.\label{value_of_mr_tau}
\end{equation}
Since we search for the value of the variables which maximize the objective
function $\mcal{Q}$ (and not the optimum value of $\mcal{Q}$ itself), we can
omit any additive and positive multiplicative constants and write the dual
problem given by Eq.~(\ref{dual_program_l2}) as,
\begin{eqnarray}
& {\displaystyle \max_{\tau}} & \mcal{Q}(\tau) 
= -\half \sum_{i,j} (\vx_i \cdot \vx_j)(\vtau_{i} \cdot  \vtau_{j}) +
\beta \sum_{i} \vtau_{i} \cdot \ve_{y_i} 	\label{dual_program_l2_tau}  \\
&\textup{subject to :} & \forall i \;\;\; \vtau_{i} \leq \ve_{y_i} 
\quad\textup{and}\quad\vtau_{i} \cdot \ve = 0 ~~ .
\nonumber
\end{eqnarray}
Finally, we rewrite the classifier $H(\vx)$ in terms of the variable $\tau$,
\begin{eqnarray}
H(\vx) 
& = & 
{\arg \max_{r=1}^k} 
\left\{ 
\mr \cdot \vx 
\right\}  =  
{\arg \max_{r=1}^k} 
\left\{ 
\sum_i \tau_{i,r} (\vx_i \cdot \vx)
\right\} ~. \label{dual_problem_algorithm}
\end{eqnarray}

As in Support Vector Machines \citep{CortesVa95}, 
the dual program and the resulting classifier 
depend {\em only} on inner products of the form $(\vx_i \cdot \vx)$. 
Therefore, we can perform inner-product calculations in some high dimensional inner-product 
space $\mcal{Z}$ by replacing the inner-products in Eq.~(\ref{dual_program_l2_tau}) 
and in Eq.~(\ref{dual_problem_algorithm}) with a kernel function $K(\cdot,\cdot)$ 
that satisfies Mercer's conditions \citep{Vapnik98}.
The general dual program using kernel functions is therefore,
\begin{eqnarray}
\label{dual_program_l2_tau_kip} 
& {\displaystyle \max_{\tau}} & \mcal{Q}(\tau) 
= -\half \sum_{i,j} K\left(\vx_i, \vx_j\right) ~(\vtau_{i} \cdot  \vtau_{j}) +
\beta\sum_{i} \vtau_{i} \cdot  \ve_{y_i}  \\
&\textup{subject to :} & \forall i \;\;\;
	\vtau_{i} \leq \ve_{y_i} \quad\textup{and}\quad\vtau_{i} \cdot \ve = 0 ~~,
        \nonumber
\end{eqnarray}
and the classification rule $H(\vx)$ becomes,
\begin{equation}
H(\vx) =  
{\arg \max_{r=1}^k} \left\{ \sum_i \tau_{i,r}  
K\left(\vx,\vx_i\right) \right\} ~.
\label{dual_problem_algorithm_kip}
\end{equation}
Therefore, constructing a multiclass predictor using kernel-based
inner-products is as simple as using standard inner-products.

Note the classifier of Eq.~(\ref{dual_problem_algorithm_kip}) does not
contain a bias parameter $b_r$ for $r=1 \dots k$. Augmenting these terms
will add $m$ more equality constraints to the dual optimization
problem, increasing the complexity of the optimization problem.
However, one can always use inner-products of the form
$K(\bar{a}, \bar{b}) + 1$ which is equivalent of using bias parameters,
and adding $\half \beta \sum_r b_r^2$ to the objective function. 

Also note 
that in the special case of $k=2$ Eq.~(\ref{primal_program_l2})
reduces to the primal program of SVM by setting $\vw = \rowM{1} -
\rowM{2}$ and $C= \beta^{-1}$. As mentioned above, Weston and
Watkins~\citeyearpar{WestonWa99} also developed a multiclass version for SVM.
Their approach compared the confidence $\rowM{y} \cdot\vx$ of the correct
label to the confidences of {\em all} the other labels $\mr\cdot\vx$
and therefore used $m(k-1)$ slack variables in the primal problem.
In contrast, in our framework the confidence of the correct label is
compared to the highest similarity-score among the rest of the labels
and uses only $m$ slack variables in the primal program. As we describe
in the sequel our compact formalization leads to a memory and time
efficient algorithm for the above optimization problem.


\begin{figure}[t]
%\figline\vspace{-0.1in}
{\pseudocodefont
{\bf Input} $\{(\vx_1, y_1) \comdots (\vx_m, y_m)\}$.\\
{\bf Initialize} $\vtau_1 =\vzero \comdots \vtau_m =\vzero$.\\
{\bf Loop}: 
\begin{enumerate}
\nolineskips
\item
Choose an example $p$.
\item 
Calculate the constants for the reduced problem:
\begin{itemize}
\item $A_p=K(\vx_p,\vx_p)$ 
\item $\bar{B}_p= \sum_{i \neq p} K(\vx_i,\vx_p) \vtau_{i}-\beta \ve_{y_p}   $ 
\end{itemize}
\item 
Set $\vtau_p$ to be the solution of the reduced problem :
\begin{eqnarray*}
& {\displaystyle \min_{\tau_p}} & \mcal{Q}(\vtau_p)
= \half A_p (\vtau_p \cdot \vtau_p) + \bar{B_p} \cdot \vtau_p \\
& \textup{subject to :} & \vtau_p \leq \ve_{y_p} \quad\textup{and}\quad \vtau_p \cdot \ve = 0
\end{eqnarray*}
\end{enumerate}
{\bf Output :} ${\displaystyle
	H(\vx) = {\arg \max_{r=1}^k}
	\left\{ \sum_i \tau_{i,r} K\left(\vx,\vx_i\right) \right\}}$.\\
}
\figline\vspace{-0.1in}
\caption{Skeleton of the algorithm for learning multiclass support vector
machine.}
\label{algorithm_1}
\end{figure}

\section{Decomposing the optimization problem} \label{decomp:sec}

The dual quadratic program given by Eq.~(\ref{dual_program_l2_tau_kip}) can be
solved using standard quadratic programming (QP) techniques. However, since
it employs $mk$ variables, converting the dual program given by 
Eq.~(\ref{dual_program_l2_tau_kip}) into a standard QP form yields
a representation that employs a matrix of size $mk \times mk$,
which leads to a very large scale problem in general.
Clearly, storing a matrix of that size is intractable for large problems.
We now introduce a simple, memory efficient algorithm for 
solving the quadratic optimization 
problem given by Eq.~(\ref{dual_program_l2_tau_kip}) by decomposing it into
small problems.

The core idea of our algorithm is based on separating the constraints
of Eq.~(\ref{dual_program_l2_tau_kip})
into $m$ disjoint sets,
$
\{\vtau_{i} | \vtau_{i} \leq \ve_{y_i} ,\,	\vtau_{i} \cdot \ve = 0 \}_{i=1}^{m} ~~ .$
The algorithm we propose works in rounds. 
On each round the algorithm chooses a pattern $p$ and improves the value of 
the objective function by updating the variables $\vtau_p$ 
under the set of constraints, $\vtau_{p} \leq \ve_{y_p}$ and
$\vtau_{p} \cdot \ve = 0$.

Let us fix an example index $p$ and write the objective function only
in terms of the variables $\vtau_p$.
For brevity we use $K_{i,j}$ to denote $K\left(\vx_i, \vx_j\right)$.
We now isolate the contribution of $\vtau_p$ in $\mcal{Q}$.
\begin{eqnarray}
\mcal{Q}_p(\vtau_p) 
& {\buildrel \rm def \over =} &
-\half \sum_{i,j} K_{i,j} (\vtau_{i} \cdot  \vtau_{j}) +
\beta \sum_{i} \vtau_{i} \cdot  \ve_{y_i} \nonumber \\
& = & -\half K_{p,p} (\vtau_{p} \cdot \vtau_{p}) - 
\sum_{i \neq p} K_{i,p} (\vtau_{p} \cdot \vtau_{i}) \nonumber \\
&&-\half \sum_{i \neq p,j \neq p} K_{i,j} (\vtau_{i} \cdot \vtau_{j}) +
\beta \vtau_{p} \cdot \ve_{y_p} + \beta \sum_{i \neq p} \vtau_{i} \cdot  \ve_{y_i} \nonumber \\
& = & -\half K_{p,p} (\vtau_{p} \cdot \vtau_{p}) -
\vtau_{p} \cdot \left[-\beta\ve_{y_p} + \sum_{i \neq p} K_{i,p} \vtau_{i}\right] \nonumber \\
&&+ \left[- \half \sum_{i\neq p,j\neq p} K_{i,j} (\vtau_{i} \cdot \vtau_{j}) +
 \beta\sum_{i \neq p} \vtau_{i} \cdot  \ve_{y_i}\right] ~.
\end{eqnarray}
Let us now define the following variables,
\begin{eqnarray}
A_p       & = & K_{p,p} > 0 \label{dual_problem_A_p}\\
\bar{B}_p & = & -\beta \ve_{y_p} + \sum_{i \neq p} K_{i,p} \vtau_{i}
	\label{dual_problem_B_p}\\
C_p       & = & - \half \sum_{i,j \neq p}
	K_{i,j} (\vtau_{i} \cdot \vtau_{j}) +
	\beta \sum_{i \neq p} \vtau_{i} \cdot  \ve_{y_i} \nonumber%\label{dual_problem_C_p}
	~~ .
\end{eqnarray}
Using the variables defined above the objective function becomes,
\[
\mcal{Q}_p(\vtau_p) = 
-\half A_p (\vtau_{p} \cdot \vtau_{p}) - \bar{B}_p \cdot \vtau_{p} + C_p ~. 
%\label{dual_program_l2_tau_p}
\]
%\marginpar{Scholkopf margin Thm. : $C^{-1}=$margin.}

For brevity, let us now omit all constants that do not affect the solution.
Each reduced optimization problem has $k$ variables and $k+1$ constraints, 
\begin{eqnarray}
& {\displaystyle \min_{\tau}} & \mcal{Q}(\vtau)
= \half A_p (\vtau_{p} \cdot \vtau_{p}) + \bar{B}_p \cdot \vtau_{p}
\label{dual_program_l2_tau_one_example}\\
& \textup{subject to :} & \vtau_p \leq \ve_{y_p} \textup{ and } \,
	\vtau_p \cdot \ve = 0~.
        \nonumber
\end{eqnarray}

The skeleton of the algorithm is given in Figure~\ref{algorithm_1}.
The algorithm is initialized with $\vtau_i=\vzero$ for $i = 1 \dots m$
which, as we
discuss later, leads to a simple initialization of internal variables
the algorithm employs for efficient implementation.  To complete the
details of the algorithm we need to discuss the following issues.  First,
we need a stopping criterion for the loop. A simple method is to run
the algorithm for a fixed number of rounds. A better approach which we
discuss in the sequel is to continue iterating as long as the algorithm
does not meet a predefined accuracy condition.  Second, we need a scheme
for choosing the pattern $p$ on each round which then induces the reduced
optimization problem given in Eq.~(\ref{dual_program_l2_tau_one_example}).
Two commonly used methods are to scan the patterns sequentially or to choose
a pattern uniformly at random. In this paper we describe a scheme for
choosing an example $p$ in a greedy manner. This scheme appears to
perform better empirically than other naive schemes. We address these
two issues in Section~\ref{choose_and_stop}.
 
The third issue we need to address is how to solve efficiently  
the reduced problem given by Eq.~(\ref{dual_program_l2_tau_one_example}).
Since this problem constitutes the core and the inner-loop of the algorithm
we develop an efficient method for solving the reduced quadratic optimization 
problem. This method is more efficient than using the standard QP
techniques, especially when it suffices to find an approximation to the
optimal solution. Our specialized solution enables us to solve problems
with a large number of classes $k$ when a straightforward approach could
not be applicable. This method is described in Section~\ref{small_problem}.

\section{Example selection for optimization}
\label{choose_and_stop}

To remind the reader, we need to solve Eq.~(\ref{dual_program_l2_tau_kip}),
\begin{eqnarray*}
& {\displaystyle \min_{\tau}} & \mcal{Q}(\tau) 
	= \half \sum_{i,j} K_{i,j} ~(\vtau_{i} \cdot  \vtau_{j}) -
	\beta \sum_{i} \vtau_{i} \cdot  \ve_{y_i} \\
&\textup{subject to :} & \forall i \;\;\;
	\vtau_{i} \leq \ve_{y_i}\quad \textup{and}\quad \vtau_{i} \cdot \ve = 0 ~,
\end{eqnarray*}
where as before $K_{i,j} = K\left(\vx_i, \vx_j\right)$.
We use the Karush-Kuhn-Tucker theorem \citep[see][]{CristianiniSh00} to find
the necessary conditions for a point $\tau$ 
to be an optimum of Eq.~(\ref{dual_program_l2_tau_kip}).
The Lagrangian of the problem is,
\begin{eqnarray}
\label{lagrangian_for_reduced}
\mcal{L}(\tau,u,v)
& = & 
\half \sum_{i,j} K_{i,j} \sum_r \tau_{i,r} \tau_{j,r} 
-\beta\sum_{i,r} \tau_{i,r} \delta_{y_i,r} 
\\
&&+\sum_{i,r} u_{i,r} (\tau_{i,r} - \delta_{y_i,r}) - \sum_{i} v_i \sum_r \tau_{i,r} \nonumber\\
\textup{subject to :} && \forall i,r \;\;\;
	u_{i,r} \geq 0  ~~.
        \nonumber
\end{eqnarray}
The first condition is,
\begin{eqnarray}
{\partial \over \partial \tau_{i,r}} \mcal{L} 
&=& 
\sum_j K_{i,j} \tau_{j,r} -
	\beta\delta_{y_i,r} + u_{i,r} - v_i = 0 \label{derative_of_l_tau_ir}~.
\end{eqnarray}
Let us now define the following auxiliary set of variables,
\begin{eqnarray}
F_{i,r} &=& \sum_j K_{i,j} \tau_{j,r} - \beta\delta_{y_i,r} ~.
\label{matrix_f_ir}
\end{eqnarray}
For each instance $\vx_i$, the value of $F_{i,r}$ designates the confidence
in assigning the label $r$ to $\vx_i$. A value of $\beta$ is subtracted from
the correct label confidence in order to obtain a margin of at least $\beta$.
\iffalse
for the special case of dot product kernel
$k(\bar{u}, \bar{v}) = \bar{u} \cdot \bar{v}$
we obtain that
\[ 
F_{i,r} = \mr \cdot \vx_i - \beta\delta_{y_i,r} ~.
\]
\fi
Note that from Eq.~(\ref{dual_problem_B_p}) we get,
\begin{equation}
F_{p,r} = B_{p,r} + k_{p,p} \tau_{p,r}~.
\label{ftob:eqn}
\end{equation}
We will make use of this relation between the variables $F$ and $B$ in the
next section in which we discuss an efficient solution to the quadratic
problem.

Taking the derivative with respect to the dual variables of
the Lagrangian given by Eq.~(\ref{lagrangian_for_reduced}) and
using the definition of $F_{i,r}$ from Eq.~(\ref{matrix_f_ir})
and KKT conditions
we get the following set of equality constraints on a feasible
solution for the quadratic optimization problem,
\begin{eqnarray}
\forall i,r \;\;
	& \quad & F_{i,r} + u_{i,r} = v_i ~~ ,
		\label{derivative_constraint}\\
\forall i,r \;\;
	& \quad & u_{i,r}  (\tau_{i,r} - \delta_{y_i,r}) = 0 ~~ ,
		\label{multiplication_constraint}\\
\forall i,r \;\;
	& \quad & u_{i,r} \geq 0 ~~ .
		\label{u_constraint}
\end{eqnarray}
We now further simplify the equations above. We do so by considering two
cases. The first case is when $\tau_{i,r} = \delta_{y_i,r}$. In this case
Eq.~(\ref{multiplication_constraint}) holds automatically. By combining 
Eq.~(\ref{u_constraint}) and Eq.~(\ref{derivative_constraint}) we get that,
\begin{equation}
	F_{i,r} \leq v_i ~~.
	\label{first_case}
\end{equation}
In the second case $\tau_{i,r} < \delta_{y_i,r}$. In order for
Eq.~(\ref{multiplication_constraint}) to hold we must have $u_{i,r}= 0$.
Thus, using Eq.~(\ref{derivative_constraint}) we get that,
\[
	F_{i,r} = v_i ~.
\]
We now replace the single equality constraint with the following two
inequalities,
\begin{eqnarray}
F_{i,r} \geq v_i ~ \mbox{ and } ~ F_{i,r} \leq v_i ~~ . \label{second_case}
\end{eqnarray}
To remind the reader, the constraints on $\vtau$ from the optimization problem
given by Eq.~(\ref{dual_program_l2_tau_kip}) imply that for all $i$,
$\vtau_{i} \leq \ve_{y_i}$ and $\vtau_{i} \cdot \ve = 0$.
Therefore, if these constraints are satisfied there must exist at least
one label $r$ for which $\tau_{i,r} < \delta_{y_i,r}$. We thus get that
$v_i = \max_r F_{i,r}$. Note also that if $\vtau_i=0$ then
$F_{i,y_i} = v_i = \max_r F_{i,r}$ and $F_{i,y_i}$ is the unique maximum.
We now combine the set of constraints from
Eqs.~(\ref{first_case})~and~(\ref{second_case}) into a single inequality,
\iffalse
Note that for all $i$ there is at least one class $r$ such that
the second case is true.  This is because the constraints of
Eq.~(\ref{dual_program_l2_tau_kip}) are $\forall i ~~~ \vtau_{i} \leq
\ve_{y_i}\quad\textup{and}\quad\vtau_{i} \cdot \ve = 0$.  Therefore,
$v_i = \max_r F_{i,r}$ which is equal to the maximal confidence.
Also note then when $\vtau_i=0$ then $F_{i,y_i} = v_i = \max_r
F_{i,r}$, and $F_{i,y_i}$ is the only maximal value.  We summarize
Eqs.~(\ref{first_case})~and~(\ref{second_case}) with,
\fi
\begin{equation}
	\max_r F_{i,r} \leq
		v_i \leq \min_{r~:~\tau_{i,r} < \delta_{y_i,r}} F_{i,r} ~~ .
\end{equation}
Finally, dropping $v_i$ we obtain, 
\begin{equation}
	\max_r F_{i,r} \leq
		\min_{r~:~\tau_{i,r} < \delta_{y_i,r}} F_{i,r} ~.
\end{equation}
We now define,
\begin{equation}
	\psi_i =
		\max_r F_{i,r} - \min_{r~:~\tau_{i,r} < \delta_{y_i,r}} F_{i,r} ~~. 
\label{psi_i}
\end{equation}
Since $\max_r F_{i,r} \geq \min_{r~:~\tau_{i,r} < \delta_{y_i,r}} F_{i,r}$
then the necessary and sufficient condition for a feasible vector $\vtau_i$
to be an optimum for Eq.~(\ref{dual_program_l2_tau_kip}) is that,
$\psi_i = 0$. In the actual numerical implementation it is sufficient to
find $\vtau_i$ such that $\psi_i\leq\epsilon$ where $\epsilon$ is a
predefined accuracy parameter. We therefore keep performing the main
loop of Figure~\ref{algorithm_1} so long as there are examples $(\vx_i,y_i)$
whose values $\psi_i$ are greater than $\epsilon$.

The variables $\psi_i$ also
serve as our means for choosing an example for an update. In our
implementation we try to keep the memory requirements as small as
possible and thus manipulate a single example on each loop. We choose
the example index $p$ for which $\psi_p$ is maximal. We then find the
vector $\vtau_p$ which is the (approximate) solution of the reduced
optimization problem given by Eq.~(\ref{dual_program_l2_tau_kip}). Due to
the change in $\vtau_p$ we need to update $F_{i,r}$ and $\psi_i$ for all
$i$ and $r$. The pseudo-code describing this process is deferred to the
next section in which we describe a simple and efficient algorithm for
finding an approximate solution for the optimization problem of
Eq.~(\ref{dual_program_l2_tau_kip}).
Lin~\citeyearpar{Lin01} showed that this scheme does converge to the solution in
a finite number of steps.
Finally, we would like to note that some of the underlying ideas described
in this section have been also explored by
Keerthi~and~Gilbert~\citeyearpar{KeerthiGi00}.


\section{Solving the reduced optimization problem}
\label{small_problem}

The core of our algorithm relies on an efficient method for solving the
reduced optimization given by Eq.~(\ref{dual_program_l2_tau_kip}) or
the equivalent problem as defined by
Eq.~(\ref{dual_program_l2_tau_one_example}). In this section we describe
an efficient fixed-point algorithm that finds an approximate solution
to Eq.~(\ref{dual_program_l2_tau_one_example}). We would like to note
that an exact solution can also be derived. In \citep{CrammerSi00} we
described a closely related algorithm for solving a similar quadratic
optimization problem in the context of output coding. A simple modification
of the algorithm can be used here. However, the algorithm needs to sort
$k$ values on each iteration and thus might be slow when $k$ is large.
Furthermore, as we discuss in the next section, we found empirically
that the quality of the solution is quite insensitive to how well we
fulfill the Karush-Kuhn-Tucker condition by bounding $\psi_i$. Therefore, it is enough
to find a vector $\vtau_p$ that decreases significantly the value of
$\mcal{Q}(\vtau)$ but is not necessarily the optimal solution.

We start by rewriting $\mcal{Q}(\vtau)$ from
Eq.~(\ref{dual_program_l2_tau_one_example}) 
using a completion to quadratic form and dropping the pattern index $p$,
\begin{eqnarray*}
\mcal{Q}(\vtau) 
& = & -\half A (\vtau \cdot \vtau) - \bar{B} \cdot \vtau \\
& = & -\half A [ (\vtau + {\bar{B} \over A}) \cdot (\vtau + {\bar{B} \over A})]
+ {\bar{B} \cdot \bar{B} \over 2A}~.
\end{eqnarray*}
We now perform the following change of variables,
\begin{equation}
\vnu = \vtau + {\bar{B} \over A} 
\qquad \quad
\vD = {\bar{B} \over A} +\ve_{y}~.
\label{value_of_vec_nu_and_vec_D}
\end{equation}
At this point, we omit additive constants and the multiplicative factor $A$
since they do not affect the value of the optimal solution. Using the above
variable the optimization problem from
Eq.~(\ref{dual_program_l2_tau_one_example}) now becomes,
\begin{eqnarray}
\label{dual_program_nu} 
& {\displaystyle \min_{\nu}} & \mcal{Q}(\vnu)
	= \Vert \vnu \Vert ^{2} \\
& \textup{subject to :} &
	\vnu \leq \vD \textup{ and } \vnu \cdot \ve = \vD \cdot \ve - 1 ~~ .
        \nonumber
\end{eqnarray}
We would like to note that since
\[
F_{i,r} = B_{i,r} + A_i \tau_{i,r} ~~ ,
\]
we can compute $\psi_i$ from $\bar{B}_i$ and thus need to store either
$B_{i,r}$ or $F_{i,r}$.

Let us denote by $\theta$ and $\alpha_i$ the variables of the dual problem
of Eq.~(\ref{dual_program_nu}). Then, the Karush-Kuhn-Tucker conditions imply that,
\begin{equation}
\forall r \;\;\;
\nu_r \leq D_r \; \mbox{ ; } \;
\alpha_r (\nu_r - D_r) = 0 \; \mbox{ ; } \;
\nu_r + \alpha_r - \theta = 0 ~~ .
\end{equation}
Note that since $\alpha_r\geq 0$ the above conditions imply that 
$\nu_r\leq \theta$ for all $r$. Combining this inequality with the
constraint that $\nu_r\leq D_r$ we get that the solution satisfies
\begin{equation}
	\nu_r \leq \min\{\theta, D_r\}~~.
	\label{nu_r:eqn}
\end{equation}
If $\alpha_r=0$ we get that $\nu_r=\theta$ and if $\alpha_r>0$ we
must have that $\nu_r = \D_r$. Thus, Eq.~(\ref{nu_r:eqn}) holds with
equality, namely, the solution is of the form,
$\nu_r = \min\{\theta, D_r\}$.
Now, since $\vnu \cdot \ve = \vD \cdot \ve - 1$ we get that
$\theta$ satisfies the following constraint,
\begin{equation}
	\sum_{r=1}^k \min_r \{\theta, D_r\}  = \sum_{r=1}^k D_r - 1~~.
	\label{equality_constraint_nu}
\end{equation}
The above equation uniquely defines $\theta$ since 
the sum $\sum_{r=1}^k \min_r \{\theta, D_r\}$ is a strictly monotone
and continuous function in $\theta$. For $\theta=\max_r D_r$ we have that
$\sum_{r=1}^k \min_r \{\theta, D_r\} > \sum_{r=1}^k D_r - 1$ while
$\sum_{r=1}^k \min_r \{\theta, D_r\}\rightarrow-\infty$ as
$\theta\rightarrow -\infty$. Therefore, there always exists 
a unique value $\theta^\star$
that satisfies Eq.~(\ref{equality_constraint_nu}). The following theorem
shows that $\theta^\star$ is indeed the optimal solution of the quadratic
optimization problem.

\begin{theorem}
Let $\nu^*_r = \min\{\theta^\star, D_r\}$ where $\theta^\star$ is the
solution of $\sum_{r=1}^k \min_r \{\theta, D_r\}  = \sum_{r=1}^k D_r - 1$.
Then, for every point $\vnu$ we have that 
$\Vert \vnu \Vert^2 > \Vert \vnu^* \Vert ^2$.
\end{theorem}
\begin{proof}
Assume by contradiction that there is another feasible point
$\vnu = \vnu^* + \bar{\Delta}$ which minimizes the objective function.
Since $\vnu \neq \vnu^*$ we know that $\bar{\Delta}\neq 0$.
Both $\vnu$ and $\vnu^*$ satisfy the equality constraint of
Eq.~(\ref{dual_program_nu}), thus $\sum_r \Delta_r = 0$.
Also, both points satisfy the inequality constraint of
Eq.~(\ref{dual_program_nu}) thus $\Delta_r \leq 0$ when $\nu^*_r = D_r$.
%KOBYC
%before
Combining the last two equations with the assumption that $\bar{\Delta}\neq 0$ 
we get that $\Delta_s>0$ for some $s$ with $\nu^*_s = \theta$.
Using again the equality $\sum_r \Delta_r = 0$ we have that there exists
an index $u$ with $\Delta_u<0$. 
%after
\iffalse
Combining the last two equations with the assumption that $\bar{\Delta}\neq 0$ 
we get that $\Delta_u<0$ for some index $u$.
Using again the equality $\sum_r \Delta_r = 0$ we have that there exists
an index $s$ with $\Delta_s>0$ and $\nu^*_s = \theta$.
\fi
%
Let us denote by
$\epsilon = \min\{\vert \Delta_s \vert, \vert \Delta_u \vert\}$.
We now define a new feasible point $\vnu'$ as follows.
Let $\nu'_s = \nu_s - \epsilon$, $\nu'_u = \nu_u + \epsilon$, and  
$\nu'_r = \nu_r$ otherwise. We now show that the norm of $\vnu'$ is smaller
than the norm of $\vnu$.  Since $\vnu$ and $\vnu'$ differ only in their
$s$ and $u$ coordinates, we have that,
\[
	\Vert \vnu' \Vert^2 - \Vert \vnu \Vert^2 = 
		(\nu'_s)^2 + (\nu'_u)^2 - (\nu_s)^2 - (\nu_u)^2 ~~.
\]
Writing the values of $\vnu'$ in terms of $\vnu$ and $\epsilon$ we get,
\[
	\Vert \vnu' \Vert^2 - \Vert \vnu \Vert^2 = 
		2 \epsilon \left(\epsilon - \nu_s + \nu_u\right) ~~ .
\]
From our construction of $\vnu'$ we have that either
$\nu_u - \epsilon = \theta > \nu_s$ or $\nu_u - \epsilon > \theta \geq \nu_s$
and therefore we get $\nu_u - \epsilon > \nu_s$. This implies that
\[
	\Vert \vnu' \Vert^2 - \Vert \vnu \Vert^2 < 0 ~~,
\]
which is clearly a contradiction.
%\QED
\end{proof}

We now use the above characterization of the solution to derive a simple
fixed-point algorithm that finds $\theta^\star$. We use the simple identity
$\min\{\theta, D_r\} + \max\{\theta, D_r\} = \theta + D_r$ and replace
the minimum function with the above sum in Eq.~(\ref{equality_constraint_nu})
and get,
\[
	\sum_{r=1}^k
	\left[ \theta + D_r - \max\{\theta, D_r\}\right] = \sum_{r=1}^k D_r -1 ~,
\] 
which amounts to,
\begin{equation}
\theta^* = \frac{1}{k}\left[\sum_{r=1}^k \max\{\theta^*, D_r\}\right] - \frac{1}{k} ~.
\label{fixed_point_of_theta}
\end{equation}
Let us define,
\begin{equation}
F(\theta) =
	\frac{1}{k}\left[\sum_{r=1}^k \max\{\theta, D_r\}\right] - \frac{1}{k}~.
\label{function_f}
\end{equation}
Then, the optimal value $\theta^*$ satisfies 
\begin{equation}
\theta^* = F(\theta^*) ~.
\label{fixed_point_of_theta_F}
\end{equation}
Eq.~(\ref{fixed_point_of_theta_F}) can be used for the following
iterative algorithm. The algorithm starts with an initial value of
$\theta$ and then computes the next value using Eq.~(\ref{function_f}).
It continues iterating by substituting each new value of $\theta$ in
$F(\cdot)$ and producing a series of values for $\theta$. The algorithm
halts when a required accuracy is met, that is, when two successive
values of $\theta$ are close enough. A pseudo-code of the algorithm
is given in Figure~\ref{algorithm_for_theta}. The input to the algorithm
is the vector $\vD$, an initial suggestion for $\theta$, and a required
accuracy $\epsilon$. We next show that if $\theta_1 \leq \max_r D_r$ 
then the algorithm does converge to the correct value of $\theta^*$.

\begin{figure}[t]
{\pseudocodefont
\underline{\bf\tt FixedPointAlgorithm($\vD,\theta,\epsilon$)}\\
{\bf Input} $\vD$,~$\theta_1$,~$\epsilon$. \\
{\bf Initialize} $l=0$. \\
{\bf Repeat}
\begin{itemize}
\nolineskips
\item $l \leftarrow l+1$.
\item $\theta_{l+1} \leftarrow  \frac{1}{k}\left[\sum_{r=1}^k \max\{\theta_l, D_r\}\right] - \frac{1}{k}$.
\end{itemize}
{\bf Until} $~~~{\displaystyle  \left\vert\frac{\theta_{l} - \theta_{l+1}}{\theta_{l}}\right\vert \leq \epsilon} $.\\
{\bf Assign}
	for $r=1 \comdots k$: $\nu_{r} = \min\{\theta_{l+1}, D_r\}$\\
{\bf Return:} $\vtau=\vnu-\frac{\vB}{A}$.\\
}
\figline\vspace{-0.1in}
\caption{The fixed-point algorithm for solving the reduced quadratic program.}
\label{algorithm_for_theta}
\end{figure}

\begin{theorem}
Let $\theta^*$ be the fixed point of Eq.~(\ref{fixed_point_of_theta_F}) $\left(\theta^* = F(\theta^*)\right)$.
Assume that $\theta_1\leq\max_r D_r$ and let $\theta_{l+1} = F(\theta_{l})$.
Then for $l\geq 1$ 
\[
\frac{\vert \theta_{l+1} - \theta^* \vert}{\vert \theta_{l} -
	\theta^* \vert} \leq 1-\frac{1}{k} ~,
\]
where $k$ is the number of classes.
\end{theorem}
\begin{proof}
Assume without loss of generality that $\max_r D_r = D_1 \geq D_2 \geq \dots \geq D_k \geq D_{k+1} \eqdef -\infty$.
Also assume that $\theta^* \in (D_{s+1}, D_s)$ and $\theta_{l} \in (D_{u+1}, D_u)$ where $u,s \in \{1,2\comdots k\}$. 
Thus,
\begin{eqnarray}
\theta_{l+1}
&=& 
F(\theta_{l}) \nonumber \\
&=&
\frac{1}{k}\left[\sum_{r=1}^k \max\{\theta_l, D_r\}\right] - \frac{1}{k} \nonumber \\
&=&
\frac{1}{k}\left(\sum_{r=u+1}^k \theta_l\right) + \frac{1}{k}\left(\sum_{r=1}^u D_r\right) - \frac{1}{k}\nonumber \\
&=&
\left(1-\frac{u}{k}\right)\theta_l + \frac{1}{k}\left(\sum_{r=1}^u D_r - 1\right) ~.
\label{theta_tag}
\end{eqnarray}
Note that if $\theta_l \leq \max_r D_r$ then $\theta_{l+1} \leq \max_r D_r$.
Similarly,
\begin{equation}
\begin{array}{c}
{\displaystyle \theta^*
= 
F(\theta^*)
=
\left(1-\frac{s}{k}\right)\theta^* + \frac{1}{k}\left(\sum_{r=1}^s D_r - 1\right)}\\
\displaystyle{\quad \Rightarrow 
\theta^* = \frac{1}{s}\left(\sum_{r=1}^s D_r -1\right)}~.
\end{array}
\label{theta_star}
\end{equation}
We now need to consider three cases depending on the relative order of
$s$ and $u$.
The first case is when $u=s$. In this case we get that,
\begin{eqnarray*}
\frac{\vert \theta_{l+1} - \theta^* \vert}{\vert \theta_{l} - \theta^* \vert} 
&=&
\frac{\vert \left(1-\frac{s}{k}\right)\theta_{l} + \frac{1}{k}\left(\sum_{r=1}^s D_r - 1\right)  - \theta^* \vert}
{\vert \theta_{l} - \theta^* \vert} \\
&=&
\frac{\vert \left(1-\frac{s}{k}\right)\theta_{l} + \frac{s}{k} \theta^* - \theta^* \vert}{\vert \theta_{l} - \theta^* \vert} \\
&=&
1-\frac{s}{k} \leq 1-\frac{1}{k} ~.
\end{eqnarray*}
where the second equality follows from Eq.~(\ref{theta_star}).
The second case is where $u>s$.
In this case we get that
%,\[
%D_{u+1} \leq \theta_l \leq D_{u} \leq D_{s+1} \leq \theta^* \leq D_{s} \leq D_1 ~.
%\]
%In addition, 
for all $r=s+1 \comdots u$ :
% we get,
\begin{equation}
\theta_l \leq D_r \leq \theta^* ~.
\label{theta_star_upper_bounds_D_r}
\end{equation}
Using Eq.~(\ref{theta_tag}) and Eq.~(\ref{theta_star}) we get,
\begin{eqnarray*}
\theta_{l+1}
&=&
\left(1-\frac{u}{k}\right)\theta_l + \frac{1}{k}\left(\sum_{r=1}^u D_r - 1\right) \\
&=&
\left(1-\frac{u}{k}\right)\theta_l + \frac{s}{k} \frac{1}{s}\left(\sum_{r=1}^s D_r-1\right)
+ \frac{1}{k}\left(\sum_{r=s+1}^u D_r\right) \\
&=&
\left(1-\frac{u}{k}\right)\theta_l + \frac{s}{k} \theta^*
+ \frac{1}{k}\left(\sum_{r=s+1}^u D_r \right) ~.
\end{eqnarray*}
Applying Eq.~(\ref{theta_star_upper_bounds_D_r}) we obtain,
\begin{eqnarray*}
\theta_{l+1}
&\leq&
\left(1-\frac{u}{k}\right)\theta_l + \frac{s}{k} \theta^*
+ \frac{1}{k}(u-s)\theta^*\\
&=&
\left(1-\frac{u}{k}\right)\theta_l +  \frac{u}{k}\theta^* ~.
\end{eqnarray*}
Since $\theta_{l+1}$ is bounded by a convex combination of $\theta_l$
and $\theta^*$, and $\theta^*$ is larger than $\theta_l$, then
$\theta^* \geq \theta_{l+1}$. We therefore finally get that,
\begin{eqnarray*}
\frac{\vert \theta_{l+1} - \theta^* \vert}{\vert \theta_{l} - \theta^* \vert} 
&=&
\frac{\theta^* - \theta_{l+1}}{\theta^* - \theta_{l}} \\
&\leq&
\frac{\theta^*- \left(1-\frac{u}{k}\right)\theta_{l} - \frac{u}{k} \theta^*}
{\theta^* - \theta_{l}} \\
&=&
1-\frac{u}{k} \leq 1-\frac{1}{k} ~.
\end{eqnarray*}
The last case, where $u<s$, is derived analogously to the second case, interchanging the roles of $u$
and $s$.
%\QED\\
\end{proof}
From the proof we see that the best convergence rate is obtained for large values of $u$.
Thus, a good feasible initialization for $\theta_1$ can be $\min_r D_r$.
In this case 
\[
\theta_{2}=F(\theta_1)=\frac{1}{k}\left(\sum_{r=1}^k D_r \right) - \frac{1}{k} ~.
\]
This gives a simple initialization of the algorithm which ensures that
the initial rate of convergence will be fast.

\begin{figure}[t]
%\figline\vspace{-0.1in}
{\pseudocodefont
{\bf Input} $\{(\vx_1, y_1) \comdots (\vx_m, y_m)\}$.\\
{\bf Initialize} for~$i=1,\dots,m$:
\begin{itemize}
\nolineskips
\item $\vtau_i =\vzero$
\item $F_{i,r} = -\beta \delta_{r,y_i}$ ($r = 1 \dots k$)
\item $A_i=K(\vx_i,\vx_i)$
\end{itemize}
{\bf Repeat}: 
\begin{itemize}
\item
Calculate for~$i=1\dots m$: \,
	$\displaystyle
		\psi_i=\max_r F_{i,r} - \min_{r~:~\tau_{i,r} < \delta_{y_i,r}} F_{i,r}$
\item Set: \, $p=\arg\max\{\psi_i\}$
\item 
Set for~$r=1 \dots k: \;$
	$ D_{r} = {F_{p,r} \over A_p} - \tau_{p,r} + \delta_{r,y_p} $
	\ and \
	$\theta =\frac{1}{k}\left(\sum_{r=1}^k D_r \right) - \frac{1}{k} $
	% $$B_{p,r}= F_{p,r} - A_p \tau_{p,r}$$
\item Call: \,
	$\vtau^\star_p=${\tt FixedPointAlgorithm($\vD,\theta,\epsilon/2$)}.
	~~~ (See Figure~\ref{algorithm_for_theta})
\iffalse
\item Let $\vtau^\star_p$ be the approximate solution of the problem:
\begin{eqnarray*}
& {\displaystyle \min_{\tau_p}} & \mcal{Q}(\vtau_p)
= \half A_p (\vtau_p \cdot \vtau_p) + \bar{B_p} \cdot \vtau_p \\
& \textup{subject to :} & \vtau_p \leq \ve_{y_p} \quad\textup{and}\quad \vtau_p \cdot \ve = 0
\end{eqnarray*}
\fi
\item Set: ~ $\Delta \vtau_p = \vtau_p^\star - \vtau_p$
\item Update for~$i=1 \dots m$ and $r=1 \dots k$:
			$F_{i,r} \leftarrow F_{i,r} + \Delta\tau_{p,r}\,K\left(\vx_p,\vx_i\right)$
\item Update: $\vtau_p \leftarrow \vtau_p^\star$
\end{itemize}
{\bf Until } $\psi_p < \epsilon \beta$\\
{\bf Output :} ${\displaystyle   H(\vx) = {\arg \max_r} \left\{ \sum_i \tau_{i,r} K\left(\vx,\vx_i\right) \right\}}$.\\
}
\figline\vspace{-0.1in}
%KOBYC
%changed Baseline to Basic
\caption{Basic algorithm for learning a multiclass, kernel-based,
support vector machine using KKT conditions for example selection.}
\label{algorithm_2}
\end{figure}
% \paragraph{Putting it all together:}
We are now ready to describe the complete implementation of the
algorithm for learning multiclass kernel machine. The algorithm gets a
required accuracy parameter and the value of $\beta$. It is initialized
with $\vtau_i=0$ for all indices $1\leq i\leq m$. This value yields a
simple initialization of the variables $F_{i,r}$. On each iteration we
compute from $F_{i,r}$ the value $\psi_i$ for each example and choose
the example index $p$ for which $\psi_p$ is the largest. We then call
the fixed-point algorithm which in turn finds an approximate solution
to the reduced quadratic optimization problem for the example indexed
$p$. The fixed-point algorithm returns a set of new values for $\vtau_p$
which triggers the update of $F_{i,r}$.  This process is repeated until
the value $\psi_i$ is smaller than $\epsilon$ for all $1\leq i \leq m$.
The pseudo-code of the algorithm is given in Figure~\ref{algorithm_2}.

\section{Implementation details} \label{impdet:sec}

\begin{figure}[t]
\centerline{
	\psfig{figure=PS/eps_vs_time.ps,width=0.45\textwidth}
	\hspace{1cm}
	\psfig{figure=PS/eps_vs_error.ps,width=0.45\textwidth}}
\caption{The run time (left) and test error (right) as a function
of required accuracy $\epsilon$.}
\label{epsilon_effect:fig}
\end{figure}

We have discussed so far the underlying principal and algorithmic issues
that arise in the design of multiclass kernel-based vector machines.
However, to make the learning algorithm practical for large datasets 
we had to make several technical improvements to the baseline implementation.
While these improvements do not change the underlying design principals
they lead to a significant improvement in running time.
We therefore devote this section to a description of the implementation
details. To compare the performance of the different versions presented
in this section we used the MNIST OCR dataset\footnote{Available at
{\tt http://www.research.att.com/$\,\tilde{\,}$yann/exdb/mnist/index.html}}.
The MNIST dataset contains $60,000$ training examples and $10,000$ test
examples and thus can underscore significant implementation improvements.
Before diving into the technical details we would like to note that many
of the techniques are by no means new and have been used in prior
implementation of two-class support vector machines (see for instance
\citealp{Platt98,Joachims98,CollobertBe01}). However, a few of our
implementation improvements build on the specific algorithmic design
of multiclass kernel machines.

Our starting point and base-line implementation is the algorithm described 
in Figure~\ref{algorithm_1} combined with the fixed-point algorithm for solving
the reduced quadratic optimization problem. In the base-line implementation
we simply cycle through the examples for a fixed number of iterations,
solving the reduced optimization problem, ignoring the KKT conditions for the
examples. This scheme is very simple to implement and use. However, it
spends unnecessary time in the optimization of patterns which are correctly
classified with a large confidence. We use this scheme to illustrate
the importance of the efficient example selection described in the previous
section. We now describe the different steps we took, starting from the
version described in Section~\ref{choose_and_stop}.

\paragraph{Using KKT for example selection} This is the algorithm described
in Figure~\ref{algorithm_2}. For each example $i$ and label $r$ we compute
$F_{i,r}$. These variables are used to compute $\psi_i$ as described in
Section~\ref{choose_and_stop}. On each round we choose the example $p$ for
which $\psi_p$ is the largest and iterate the process until the value of
$\psi_i$ is smaller for a predefined accuracy denoted by $\epsilon$. It turns
out that the choice of $\epsilon$ is not crucial and a large range of
values yield good results. The larger $\epsilon$ is the sooner we terminate
the main loop of the algorithm. Therefore, we would like to set $\epsilon$
to a large value as long as the generalization performance is not effected.
In Figure~\ref{epsilon_effect:fig} we show the running time and the test error as
a function of $\epsilon$. The results show that a moderate value of
$\epsilon$ of $0.1$ already yields good generalization. The increase in
running time when using smaller values for $\epsilon$ is between
$\%20$ to $\%30$.
Thus, the algorithm is rather robust to the actual choice of the
accuracy parameter $\epsilon$ so long as it is not set to a value which is
evidently too large.
\iffalse
We fixed the required accuracy to $0.001$ in comparing
the running time of the different versions we describe in the sequel.\YS
\fi

\begin{figure}[t]
\centerline{
	\psfig{figure=PS/cooling.ps,width=0.7\textwidth}
	%\psfig{figure=PS/q_vs_iter.ps,width=0.4\textwidth}
	%\hspace{1cm}
	% \psfig{figure=PS/q_vs_iter_cool.ps,width=0.4\textwidth}}
	%\psfig{figure=PS/q_vs_iter.ps,width=0.4\textwidth}
	}
\caption{The value of the objective function $\mcal{Q}$ as a function
of the number of iteration for a fixed and variable scheduling of the
accuracy parameter $\epsilon$.}
\label{epsilon_cooling:fig}
\end{figure}


\paragraph{Maintaining an active set} The standard implementation described
above scans the entire training set and computes $\psi_i$ for each example
$\vx_i$ in the set. However, if only a few support patterns constitute the
multiclass machine then the vector $\vtau$ is the zero vector for  many
example. We thus partition the set of examples into two sets. The first,
denoted by $A$ and called the active set, is composed of the set of examples 
that contribute to the solution, that is, $A = \{i | \vtau_i\neq\vzero\}$.
The second set is simply its complement, $A^c = \{i | \vtau_i=\vzero\}$.
During the course of the main loop we first search for an example to update 
from the set $A$. Only if such an example does not exist, which can happen
iff $\forall i\in A, \psi_i < \epsilon$, we scan the set $A^c$ for an example
$p$ with $\psi_p>\epsilon$. If such an example exists we remove it from
$A^c$, add it to $A$, and call the fixed-point algorithm with that example.
This procedure spends most of it time adjusting the weights of examples
that constitute the active set and adds a new example only when the active
set is exhausted. A natural implication of this procedure is that the support
patterns can come only from the active set.

\begin{figure}[t]
\centerline{\psfig{figure=PS/run_time.ps,width=0.70\textwidth}}
\caption{Comparison of the run-time on the MNIST dataset of the
different versions as a function of the training-set size.
Version 1 is the baseline implementation.
Version 2 uses KKT conditions for selecting an example to update.
Version 3 adds the usage of an active set and cooling of $\epsilon$.
Version 4 adds caching of inner-products.
Finally, version 5 uses data structures for representing and using
	sparse inputs.}
\label{run_time:fig}
\end{figure}

\paragraph{Cooling of the accuracy parameter} 
The employment of an active set yields significant reduction in running
time. However, the scheme also forces the algorithm to keep updating the
vectors $\vtau_i$ for $i\in A$ as long there is even a single
example $i$ for which $\psi_i>\epsilon$. This may result in minuscule
changes and a slow decrease in $\mcal{Q}$ once most examples in $A$ have
been updated. In Figure~\ref{epsilon_cooling:fig} we plot in bold line
the value of $\mcal{Q}$ as a function of the number of iterations
when $\epsilon$ is kept fixed. The line
has a staircase-like shape. Careful examination of the iterations
in which there was a significant drop in $\mcal{Q}$ revealed that
these are the iterations
on which new examples were added to the active set. After each addition
of a new example numerous iterations are spent in adjust the weights
$\vtau_i$. To accelerate the process, especially on early iterations
during which we mostly add new examples to the active set, we use a variable
accuracy parameter, rather than a fixed accuracy. On early iterations the
accuracy value is set to a high value so that the algorithm will mostly
add new examples to the active set and spend only a small time on adjusting
the weights of the support patterns. As the number of iterations increases
we decrease $\epsilon$ and spend more time on adjusting the weights of
support patterns. The result is a smoother and more rapid decrease in
$\mcal{Q}$ which leads to faster convergence of the algorithm. We refer
to this process of gradually decreasing $\epsilon$ as {\em cooling}.
We tested the following cooling schemes (for $t=0,1,\ldots$):
(a) exponential: $\epsilon(t) = \epsilon_0 \exp(-t)$;
(b) linear: $\epsilon(t) = \epsilon_0/(t+1)$
(c) logarithmic: $ \epsilon(t) = \epsilon_0/\log_{10}(t+10)$. The initial
accuracy $\epsilon_0$ was set to $0.999$.
We found that all of these cooling schemes improve the rate of decrease
in $\mcal{Q}$, especially the logarithmic scheme for which $\epsilon(t)$
is relatively large for a long period and than decreases moderately.
The dashed line in Figure~\ref{epsilon_cooling:fig} designate the value
of $\mcal{Q}$ as a function of the number of iterations using a
logarithmic cooling scheme for $\epsilon$. In the particular setting of
the figure, cooling reduces the number of iterations, and thus the running
time, by an order of magnitude.
 
\paragraph{Caching} Previous implementations of algorithms for support
vector machines employ a cache for saving expensive kernel-based
inner-products (see for instance~\citealp{Platt98,Joachims98,CollobertBe01}).
Indeed, one of the most expensive steps in the algorithm is the evaluation
of the kernel. Our scheme for maintaining a cache is as follows. For small
datasets we store in the cache all the kernel evaluations between each
example in the active set and all the examples in the training set. For
large problems with many support patterns (and thus a large active set)
we use a least-recently-used (LRU) scheme as a caching strategy. In this scheme,
when the cache is full we replace least used inner-products of an example
with the inner-products of a new example. LRU caching is also used in
$\mbox{SVM}^{light}$~\citep{Joachims98}.

\paragraph{Data-structures for sparse input instances}
Platt~\citeyearpar{Platt98}
and others have observed that when many components of
the input vectors are zero, a significant saving of space and time can be
achieved using data-structures for sparse vectors and computing only the
products of the non-zero components in kernel evaluations. Our
implementation uses linked list for sparse vectors. Experimental evaluation
we performed indicate that it is enough to have $20\%$ sparseness of the
input instances to achieve a speedup in time and reduction in memory over
a non-sparse implementation.

To conclude this section we give in Figure~\ref{run_time:fig}a comparison
of the run-time of the various technical improvements we outlined above.
Each version that we plot include all of the previous improvements. The
running-time of the version that includes all the algorithmic and
implementation improvements is two orders of magnitude faster than the
baseline implementation. It took the fastest version
3 hours and 25
minutes to train a {\em multiclass} kernel-machine on the MNIST dataset
using a Pentium III computer running at 600MHz with 2Gb of physical
memory.
(The fastest version included two more technical improvements which are
not discussed here but will be documented in the code that we will shortly
make available.)
%
In comparison, Platt~\citeyearpar{Platt98} reports a training time of 8
hours and 10 minutes for learning a single classifier that discriminates
one digit from the rest. Therefore, it takes over 80 hours to train a set
of 10 classifiers that constitute a multiclass predictor. Platt's results
were obtained using a Pentium II computer running at 266MHz. While there
are many different factors that influence the running time, the running
time results above give an indication of the power of our approach for
multiclass problems. We believe that the advantage of our direct approach
to multiclass problems will become even more evident in problems with a
large number of classes such as Kanji character recognition. We would
also like to note that the improved running time is not achieved at the
expense of deteriorated classification accuracy. In the next section
we describe experiments that show that our approach is comparable to,
and sometimes better than, the standard support vector technology.


\iffalse
\[
\begin{array}{rl}
\mbox{Exponential:} & \epsilon(t) = \epsilon_0\exp(-t) ; \\
\mbox{Linear:} & \epsilon(t) = \epsilon_0/t ; \\
\mbox{Logarithmic:} & \epsilon(t) = \epsilon_0/\log(t) ~~ .
\end{array}
\]
\begin{tabbing}
\hspace{3cm} \=
Exponential decrease: \= \hspace{0.5cm} \=
	$\epsilon(t) = \epsilon_0 \exp(-t)$; \\
\> Linear decrease: \> \> $\epsilon(t) = \epsilon_0 / t$; \\
\> Logarithmic decrease: \> \> $\epsilon(t) = \epsilon_0 / \log(t)$.
\end{tabbing}


\begin{enumerate}
\item Active set.
\item Cooling.
\item Chucking.
\item Caching.
\item Smart initialization.
\item Update KKT values on going.
\end{enumerate}
\fi

\iffalse
\begin{table}
\begin{center}
\begin{tabular}{lc} \hline
Name     & Test error(\%) \\ \hline
MNIST    & 1.42 \\
USPS     & 4.38 \\
Letter   & 1.95 \\ 
Shuttle  & 0.12 \\ \hline
\end{tabular}
\end{center}
\vspace{-0.5cm}
\caption{Summary of the results obtained in experiments with large datasets.}
\label{large_set_exper_result:table}
\end{table}
\fi

\begin{table}
\label{properties}
\begin{center}
\begin{tabular}{lcccc} \hline
Name & No. of & No. of & No. of & No. of \\
     & Training Examples & Test Examples & Classes & Attributes \\ \hline
satimage & 4435 & 2000        & 6   & 36 \\
shuttle  & 5000 & 9000        & 7   & 9  \\
mnist    & 5000 & 10000       &  10 & 784\\
isolet   & 6238 & 1559        & 26  & 617  \\
letter   & 5000 & 4000        &  26 & 16 \\
vowel    & 528  & 462         & 11  & 10 \\
glass    & 214  & 5-fold cval & 7   & 9 \\   \hline
% soybean  & 307  & 376         & 19  & 35 \\ \hline
\end{tabular}
\end{center}
\vspace{-0.5cm}
\caption{Description of the small databases used in the experiments.}
\label{small_set_exper:table}
\end{table}

\section{Experiments} \label{exper:sec}

In this section we report the results of experiments we
conducted in order to evaluate our implementation of the multiclass support
vector machine described in this paper. We selected seven datasets,
six from UCI
repository~\footnote{Available at http://www.ics.uci.edu/$\tilde{~}$mlearn/MLRepository.html}
and the {\tt MNIST} dataset.  All but one of the datasets ({\tt glass}),
contain a separate test set.  For the remaining dataset we used five fold
cross validation to compute the error rate.  A description of the datasets
we used is given in Table~1.  Note that on {\tt MNIST}, ~{\tt Letter}
and {\tt Shuttle} we used only a subset of the training set. On each data
set we ran two algorithms. The first algorithm uses the multiclass SVM of
this paper in a binary mode by training one classifier for each class.
Each such classifier is trained to distinguish between the class to the
rest of the classes.
To classify a new instance we compute the output of each of the binary
classifiers and predict the label which attains the highest confidence
value. The second algorithm we compared is the multiclass SVM described
in this paper.  
We used our multiclass SVM as the basis for the two algorithms in order
to have a common framework for comparison.  
In both algorithms we used Gaussian kernels. To determine
the value of $\beta$ and $\sigma$ we used cross validation on the training
set. We used 5-fold cross validation for the large datasets and 10-fold
cross validation for the small datasets. In all the experiments we set
the value of $\epsilon$ to $0.001$.

\begin{figure}[t]
\centerline{\psfig{figure=PS/test_error_graph.ps,width=0.7\textwidth}}
\caption{Comparison of a multiclass SVM build using the one-against-rest
approach with the multiclass support vector machines studied in this paper.}
\label{decrease_in_error_rate:fig}
\end{figure}

\begin{figure}[tp]
\centerline{		
	\psfig{figure=PS/mnist-digits.1.ps,width=0.5\textwidth}
	\hspace{.25cm}	
        \psfig{figure=PS/mnist-digits.2.ps,width=0.5\textwidth}
}
\centerline{		
	\psfig{figure=PS/mnist-digits.3.ps,width=0.5\textwidth}
	\hspace{.25cm}	
        \psfig{figure=PS/mnist-digits.4.ps,width=0.5\textwidth}
}
\centerline{		
	\psfig{figure=PS/mnist-digits.5.ps,width=0.5\textwidth}
	\hspace{.25cm}	
        \psfig{figure=PS/mnist-digits.6.ps,width=0.5\textwidth}
}
\iffalse
\centerline{		
	\psfig{figure=PS/mnist-digits.7.ps,width=0.5\textwidth}
	\hspace{.25cm}	
        \psfig{figure=PS/mnist-digits.8.ps,width=0.5\textwidth}
}
\fi
\caption{Examples of images which are support patterns with a large
norm of $\vtau$ (MNIST dataset).}
\label{demo:fig}
\end{figure}

\iffalse
\begin{figure}[tp]
\centerline{		
	\psfig{figure=PS/usps-digits.1.ps,width=0.5\textwidth}
	\hspace{.25cm}	
        \psfig{figure=PS/usps-digits.2.ps,width=0.5\textwidth}
}
\centerline{		
	\psfig{figure=PS/usps-digits.3.ps,width=0.5\textwidth}
	\hspace{.25cm}	
        \psfig{figure=PS/usps-digits.4.ps,width=0.5\textwidth}
}
\centerline{		
	\psfig{figure=PS/usps-digits.5.ps,width=0.5\textwidth}
	\hspace{.25cm}	
        \psfig{figure=PS/usps-digits.6.ps,width=0.5\textwidth}
}
\centerline{		
	\psfig{figure=PS/usps-digits.7.ps,width=0.5\textwidth}
	\hspace{.25cm}	
        \psfig{figure=PS/usps-digits.8.ps,width=0.5\textwidth}
}
\caption{Examples of images which are support patterns with a large
norm of $\vtau$ (USPS dataset).}
\label{demo_usps:fig}
\end{figure}
\fi

A summary of the results is depicted in
Figure~\ref{decrease_in_error_rate:fig}.  Each bar in the figure is
proportional to the difference in the test error between the two
algorithms. Positive value means that the algorithm proposed in this
paper achieved a lower error rate than the strawman algorithm based on
the `one-vs-rest' approach.
In general the multiclass support vector machine
achieved lower error rate than the `one-vs-rest' method where for the
datasets with a large example per class ratio this improvement is
significant.

We also ran the multiclass SVM on the complete training set
of {\tt MNIST}, ~{\tt Letter} and {\tt Shuttle} and on the {\tt
USPS}~\footnote{Available at ftp.kyb.tuebingen.mpg.de~} dataset.
The results are
summarized in Table~2.  Sch\"olkopf {\em et al.}~\citeyearpar{Scholkopf97}
achieved a test error of $1.4\%$ on the {\tt MNIST} dataset and a test
error of $4.2\%$ on the {\tt USPS}~\citep{ScholkopfSuBuGiNiPoVa97} using
a set of binary SVMs training by the `one-vs-rest' method. In another
work, Dietterich~\citeyearpar{Dietterich00} achieved a
test error of $2.71\%$ on the {\tt Letter} dataset and a test error of
$0.01\%$ on the {\tt Shuttle} dataset by boosting $C4.5$.  Note that
in the first three cases our algorithm is comparable to the other
algorithms, while it is 10 times worse on the {\tt Shuttle} dataset.
This gap can be explained by the fact that most of the instances in the
dataset belong to the same class, which is a phenomena that is easier
for boosting algorithms to cope with. Finally, we would like to note
that using the polynomial kernel of degree $9$ and the normalization
described by~\citet{DeCosteSc01} we achieve an error rate of $1.24\%$
on the test set of {\tt MNIST}.

In Figure~\ref{demo:fig} we give training images taken from the
{\tt MNIST} dataset with high norm of $\vtau$. All the images we plot
attain a margin value of less than $\beta$. For each image we give its
correct label and in parentheses the probability vector $\veta$ using
a sparse representation: each entry of the vector is of the form
{\em label:probability}. If there is a training error on an image we
added a trailing $E$ in the text. 
As discussed in the
previous sections, the vector $\veta$ is a probability vector that is
concentrated on the corrected label if it is {\em not} a support pattern.
Therefore, all the above images are support pattern and the vectors
$\veta$ designate the set of confusable labels -- the labels for which
the corresponding entries in $\veta$ are non-zero. Examination of the
different images reveal that the confusion is indeed correlated with
the actual digit each training example can be confused with.
For instance, the top image on the left hand-side from Figure~\ref{demo:fig}
can be mistakenly classified as $3$, $7$, or $9$. In many cases,
the examples with a large norm of $\vtau$ correspond to mislabeled
examples or corrupted images. Therefore, the norm of $\vtau$ can be
used as an aid for data cleaning.

\begin{table}
\begin{center}
\begin{tabular}{lc} \hline
Name     & Test error(\%) \\ \hline
mnist    & 1.42 \\
USPS     & 4.38 \\
shuttle  & 0.12 \\
letter   & 1.95 \\ \hline
\end{tabular}
\end{center}
\vspace{-0.5cm}
\caption{Summary of experiments with large datasets.}
\label{large_set_exper_result:table}
\end{table}

\section{Summary} \label{conc:sec}
In this paper we presented a novel approach for building multiclass
support vector machines. We described in detail an efficient learning
algorithm and discussed various implementation issues required for making
it practical. Our methods achieve state of the art results with a running
time that is competitive with previous methods, and is one order of
magnitude faster in some problems. An interesting questions that stem
from our research is whether the approach taken in this paper can also
be used for other machine learning tasks with kernels, such as ranking
problems and regression. We leave this for future research. Another
interesting direction that we are currently pursuing is analogous online
algorithms that use multiple prototypes for multiclass problems.

\section*{Acknowledgments}
We would like to thank Elisheva Bonchek for useful comments and carefully
reading the manuscript. Thanks also to Chih-Jen Lin for fruitful email
exchange.

\appendix

\iffalse
\section{Karush-Kuhn-Tucker Theorem}
\label{kkt-theorem}

We use the notation of
Cristianinini and Showe-Taylor~\citeyearpar{CristianiniSh00}.
Given a quadratic optimization problem with variables drawn from
a convex domain $\Omega \subset \Re^n$,
\[
\begin{array}{lll}
\mbox{minimize}   & f({\bf w})           & {\bf w} \in \Omega, \\
\mbox{subject to} & g_i({\bf w}) \leq 0, & i=1 \comdots k,\\
           & h_i({\bf w}) = 0   , & i=1 \comdots m,  
\end{array}
\]
with $f \in C^1$ convex and $g_i,~h_i$ affine, the necessary and sufficient
conditions for a point ${\bf w}^*$ to be the optimum are the existence of
${\bf \alpha}^*,~{\bf \beta}^*$ such that,
\begin{eqnarray*}
\frac{\partial L({\bf w}^*,{\bf \alpha}^*~, {\bf \beta}^*)}{\partial {\bf w}}&=&0, \\
\frac{\partial L({\bf w}^*,{\bf \alpha}^*~, {\bf \beta}^*)}{\partial {\bf \beta}}&=&0, \\
\alpha_i^* g_i({\bf w}^*)&=&0,~i=1\comdots k, \\
g_i({\bf w}^*)&\leq&0,~i=1\comdots k, \\
\alpha_i^* &\geq& 0,~i=1 \comdots k.
\end{eqnarray*}
\fi


\section{Derivation of the dual optimization problem}
\label{calculations}

We develop the Lagrangian using only the dual variables. For simplicity
we use the identity Kernel $K(x_i,x_j)=x_i\cdot x_j$. Substituting
Eq.~(\ref{sum_eta_is_1}) to Eq.~(\ref{Lagrangian_l2}) we obtain,
\begin{eqnarray}
\mcal{Q}(\eta) & = & 
\sum_{i=1}^m \xi_i +
\sum_{i,r} \eta_{i,r} x_i \cdot \mr -
\sum_{i,r} \eta_{i,r} x_i \cdot \rowM{{y_i}} -
\sum_{i} \xi_i \underbrace{\sum_{r} \eta_{i,r}}_{=1} +
\sum_{i,r} \eta_{i,r} b_{i,r} + \half \beta \sum_r \Vert \mr \Vert_{2}^2 \nonumber \\
& = & 
\overbrace{\sum_{i,r} \eta_{i,r} x_i \cdot \mr}^{\eqdef S_1} -
\overbrace{\sum_{i,r} \eta_{i,r} x_i \cdot \rowM{{y_i}}}^{\eqdef S2} +
\overbrace{\half \beta \sum_r \Vert \mr \Vert_{2}^2}^{\eqdef S_3} 
+ \sum_{i,r} \eta_{i,r} b_{i,r} ~. \label{q_stage_1} 
\end{eqnarray}
We substitute $\mr$ in the above equation using Eq.~(\ref{value_of_mr})
and get,
\begin{eqnarray}
S_1 & = & \sum_{i,r} \eta_{i,r} x_i \cdot \mr \nonumber \\
& = & \sum_{i,r} \eta_{i,r} x_i \cdot \beta^{-1} \sum_j x_j(\delta_{y_j,r} - \eta_{j,r}) \nonumber \\
& = & \beta^{-1} \sum_{i,j} x_i \cdot x_j \sum_r \eta_{i,r}
(\delta_{y_j,r} - \eta_{j,r}) \label{star_1} ~~ ,\\
%\end{eqnarray}
%\begin{eqnarray}
S_2 & = & \sum_{i,r} \eta_{i,r} x_i \cdot \rowM{{y_i}} \nonumber \\
& = & \sum_{i,r} \eta_{i,r} x_i \cdot \beta^{-1} \sum_j x_j(\delta_{y_j,y_i} - \eta_{j,y_i}) \nonumber \\ 
& = & \beta^{-1} \sum_{i,j} x_i \cdot x_j \sum_r \eta_{i,r} (\delta_{y_j,y_i} - \eta_{j,y_i})  \nonumber \\
& = & \beta^{-1} \sum_{i,j} x_i \cdot x_j (\delta_{y_j,y_i} - \eta_{j,y_i}) \underbrace{\sum_r \eta_{i,r}}_{=1} \nonumber \\
& = & \beta^{-1} \sum_{i,j} x_i \cdot x_j (\delta_{y_j,y_i} - \eta_{j,y_i}) \nonumber \\
& = & \beta^{-1} \sum_{i,j} x_i \cdot x_j \sum_r \delta_{y_i,r}
(\delta_{y_j,r} - \eta_{j,r}) ~~ , \label{star_2} \\
%\end{eqnarray}
%\begin{eqnarray}
S_3 & = & \half \beta \sum_r \mr \cdot \mr \nonumber \\
& = & \half \beta \sum_{r}[\beta^{-1} \sum_i x_i(\delta_{y_i,r} -
\eta_{i,r})] [ \beta^{-1} \sum_j x_j(\delta_{y_j,r} - \eta_{j,r}) ] \nonumber \\
& = & \half \beta^{-1} \sum_{i,j} x_i \cdot x_j \sum_r (\delta_{y_i,r} - \eta_{i,r})(\delta_{y_j,r} - \eta_{j,r}) ~.\label{star_3}
\end{eqnarray}
Taking the difference $S_1 - S_2$ while using
Eqs.~(\ref{star_1})~and~(\ref{star_2}) we get,
\begin{eqnarray}
S_1 - S_2 
& = & 
\beta^{-1} \sum_{i,j} x_i \cdot x_j \sum_r \eta_{i,r} (\delta_{y_j,r} - \eta_{j,r}) - 
\beta^{-1} \sum_{i,j} x_i \cdot x_j \sum_r \delta_{y_i,r} (\delta_{y_j,r} - \eta_{j,r}) \nonumber \\
& = &
- \beta^{-1} \sum_{i,j} x_i \cdot x_j \sum_r (\delta_{y_i,r} - \eta_{i,r})(\delta_{y_j,r} - \eta_{j,r}) ~. \label{star_12}
\end{eqnarray}
Finally, plugging the values for $S_1$,$S_2$ and $S_3$ from 
Eqs.~(\ref{star_3})~and~(\ref{star_12}) into Eq.~(\ref{q_stage_1}) we get,
\begin{eqnarray}
\mcal{Q}(\eta) & = & 
- \beta^{-1} \sum_{i,j} x_i \cdot x_j \sum_r (\delta_{y_i,r} - \eta_{i,r})(\delta_{y_j,r} - \eta_{j,r}) \nonumber \\
& & +
 \half \beta^{-1} \sum_{i,j} x_i \cdot x_j \sum_r (\delta_{y_i,r} - \eta_{i,r})(\delta_{y_j,r} - \eta_{j,r}) + \sum_{i,r} \eta_{i,r} b_{i,r} \nonumber \\
& = &
- \half \beta^{-1} \sum_{i,j} x_i \cdot x_j \sum_r (\delta_{y_i,r} - \eta_{i,r})(\delta_{y_j,r} - \eta_{j,r}) + \sum_{i,r} \eta_{i,r} b_{i,r} ~. \nonumber
\end{eqnarray}


%\bibliographystyle{plainnat}
\bibliography{bib}

\end{document}
