\documentclass[twoside,11pt]{article}

\bibliographystyle{abbrvnat}
\usepackage{./jmlr2e}
\bibpunct{(}{)}{;}{a}{,}{;}

%\usepackage{graphics}
\usepackage{amsmath}
%\usepackage{latexsym}

\usepackage{url}


%\usepackage{theapa}
%\usepackage{Format/icml2001}
%\usepackage{Format/mlapa}

%\newcommand{\rcubed}{\textsf{{\small R$^3$}}}
%\newcommand{\ripper}{\textsf{{\small Ripper}}}
%\newcommand{\slipper}{\textsf{{\small Slipper}}}
%\newcommand{\cfive}{\textsf{{\small C5}}}
%\newcommand{\cfiveboost}{\textsf{{\small C5-boost}}}

\newcommand{\learner}[1]{\textsf{#1}}
\newcommand{\learnersmall}[1]{\textsf{\footnotesize #1}}

\newcommand{\rcubed}{\learner{R$^3$}}
\newcommand{\ripper}{\learner{Ripper}}
\newcommand{\slipper}{\learner{Slipper}}
\newcommand{\cfive}{\learner{C5.0}}
\newcommand{\cfiveboost}{\learner{C5.0-boost}}
\newcommand{\cfiverr}{\learner{C5.0-rr}}
\newcommand{\cfiveboostrr}{\learner{C5.0-boost-rr}}
\newcommand{\cart}{\learner{CART}}

\newcommand{\rcubedsmall}{\learnersmall{R$^3$}}
\newcommand{\rippersmall}{\learnersmall{Ripper}}
\newcommand{\slippersmall}{\learnersmall{Slipper}}
\newcommand{\cfivesmall}{\learnersmall{C5.0}}
\newcommand{\cfiveboostsmall}{\learnersmall{C5.0-boost}}

% \newtheorem{theorem}{Theorem}[section]
% \newtheorem{proposition}[theorem]{Proposition}
% \newtheorem{corollary}[theorem]{Corollary}
% \newtheorem{lemma}[theorem]{Lemma}
% \newtheorem{definition}[theorem]{Definition}

%\newenvironment{proof}{\paragraph{{\rm \textit{Proof:}}}}{\hfill
%  $\Box$ \bigskip}

% JMLR stuff
\jmlrheading{2}{2002}{721-747}{08/01}{03/02}{F\"urnkranz}
\ShortHeadings{Round Robin Classification}{F\"urnkranz}
\firstpageno{721}

%% keine Hurenkinder und Schusterjungen!
\clubpenalty = 10000
\widowpenalty = 10000 
%% auch nicht nach Formel
\displaywidowpenalty = 10000

\renewcommand{\floatpagefraction}{0.6}
\renewcommand{\dblfloatpagefraction}{1.0}
\renewcommand{\textfraction}{0.0}
%\renewcommand{\baselinestretch}{2}


\begin{document}

\title{Round Robin Classification}
\author{\name Johannes F\"urnkranz \email juffi@oefai.at\\
\addr Austrian Research Institute for Artificial Intelligence \\
Schottengasse 3, A-1010 Wien, Austria}

\editor{Yoram Singer}

\maketitle

\begin{abstract}%
  In this paper, we discuss round robin classification (aka pairwise
  classification), a technique for handling multi-class problems with
  binary classifiers by learning one classifier for each pair of
  classes. We present an empirical evaluation of the method,
  implemented as a wrapper around the \ripper\ rule learning
  algorithm, on 20 multi-class datasets from the UCI database
  repository.  Our results show that the technique is very likely to
  improve \ripper's classification accuracy without having a high risk
  of decreasing it.
%
  More importantly, we give a general theoretical analysis of the
  complexity of the approach and show that its run-time complexity is
  below that of the commonly used one-against-all technique. These
  theoretical results are not restricted to rule learning but are also
  of interest to other communities where pairwise classification has
  recently received some attention.
%
  Furthermore, we investigate its properties as a general ensemble
  technique and show that round robin classification with \cfive\ may
  improve \cfive's performance on multi-class problems. However, this
  improvement does not reach the performance increase of boosting, and
  a combination of boosting and round robin classification does not
  produce any gain over conventional boosting.
%
  Finally, we show that the performance of round robin classification
  can be further improved by a straight-forward integration with
  bagging.
\end{abstract}

\begin{keywords}
pairwise classification, inductive rule learning, multi-class problems, class binarization, ensemble techniques
\end{keywords}


\section{Introduction}


Although real-world problems often have multiple classes, many
learning algorithms are inherently binary, i.e., they are only able to
discriminate between two classes. The reasons for this may be
constraints imposed by the hypothesis language (e.g., linear
discriminants or support vector machines), the learning architecture
(e.g., neural networks with single output nodes), or the learning
framework (e.g., many rule learning algorithms are tailored towards
concept learning, i.e., the problem of learning a concept description
from positive and negative examples).
%
There are two principal approaches for applying such algorithms to
multi-class problems: one approach is to generalize the algorithm---as
has, e.g., been done for support vector machines
\citep{PiecewiseLinearSVM,MultiClass-SVM} or boosting
\citep{AdaBoost}---the other is to
% In order to use such algorithms on multi-class problems, we have to
employ class binarization techniques (Section~\ref{sec:binarization}),
which reduce the multi-class problem into a series of binary problems.
% We will review such techniques in Section~\ref{sec:binarization}. 

One of the most common class binarization approaches is to learn
separate concept descriptions for each individual class, i.e., to form
a series of problems, one for each class, where all examples of this
class are regarded as positive examples, while all other examples are
regarded as negative.
%
Pairwise classification is an alternative class binarization
technique, which has lately received some attention in the neural
networks and support vector machines communities
(Section~\ref{sec:related}). Its basic idea is to reduce a multi-class
problem to multiple two-class problems by learning one classifier for
each pair of classes, using only training examples for these two
classes and ignoring all others (Section~\ref{sec:round-robin}).

In this paper, we will, on the one hand, evaluate the
technique---which we name \emph{round robin} binarization---for
inductive rule learning algorithms and show that it is significantly
more accurate than the ordered and unordered (one-against-all)
techniques that are currently used for rule learning algorithms like
\ripper\ (Section~\ref{sec:accuracy}). On the other hand, we provide a
detailed analysis of the complexity of the approach and show that the
$c(c-1)/2$ learning problems of a single round robin can be
learned more efficiently than the $c$ learning problems of the
conventional one-against-all technique (Section~\ref{sec:efficiency}).
This analysis is independent of the base learning algorithm, but we
show that the advantage increases with more expensive learners. The
theoretical results are confirmed by our experimental evaluation. In
Section~\ref{sec:ensembles}, we will investigate the properties of
round-robin classification as a general ensemble technique with mixed
results: while the technique is able to reduce \cfive's error on
average, the gains do not approach those of boosting. On the other
hand, a straightforward combination of round robin learning with
bagging does lead to additional gains for \ripper. Finally, we will
discuss a few other relevant properties of round-robin classification,
including its suitability for parallel implementations, possible
remedies for its inefficiency at classification time, and its
comprehensibility (Section~\ref{sec:issues}).


\section{Class Binarization}
\label{sec:binarization}

Many machine learning algorithms are inherently designed for binary
(two-class) decision problems. Prominent examples are 
% neural networks with single output nodes, 
perceptrons, support vector machines, the original \learner{AdaBoost}
algorithm, and separate-and-conquer rule learning. In addition, all
regression algorithms can, in principle, be used for binary decision
problems, but not for multi-class problems (unless, maybe, if the
class values can be ordered). On the other hand, real-world problems
often have multiple classes. Fortunately, there exist several simple
techniques for turning multi-class problems into a set of binary
problems. We will call such techniques class binarization techniques.

\begin{definition}[class binarization, decoding, base learner]
  A \emph{class binarization} is a mapping of a multi-class learning
  problem to several two-class learning problems in a way that allows
  a sensible \emph{decoding} of the prediction, i.e., it allows the
  derivation of a prediction for the multi-class problem from the
  predictions of the set of two-class classifiers. The learning
  algorithm used for solving the two-class problems is called the
  \emph{base learner}.
\end{definition}

The most popular class binarization technique is the unordered or
one-against-all class binarization, where one takes each class in turn
and learns binary concepts that discriminate this class from all other
classes. It has been independently proposed for rule learning
\citep{CN2-improvements}, neural networks \citep{ModularNN}, and support
vector machines \citep{SupportVectorNetworks}.

\newpage
\begin{definition}[unordered/one-against-all class binarization]
  The \emph{unordered class binarization} transforms a $c$-class
  problem into $c$ two-class problems. These are constructed by using
  the examples of class $i$ as the positive examples and the examples
  of classes $j$ ($j=1 \dots c,\, j\neq i$) as the negative examples.
\end{definition}

The name ``unordered'' originates from \citet{CN2-improvements}, who
proposed this approach as an alternative to the decision-list learning
approach that was originally used in CN2 \citep{CN2,DecisionLists}. As
our main concern is rule learning, we will primarily stick to the
terminology used there, but will also occasionally refer to it as
\emph{one-against-all}, which seems to be predominant in other fields.
% This terminology also helps to clarify the difference be ordered class
% binarization, which has its origins in separate-and-conquer rule
% learning.

\begin{definition}[ordered class binarization]
  The \emph{ordered class binarization} transforms a $c$-class problem
  into $c-1$ binary problems. These are constructed by using the
  examples of class $i$ $(i = 1 \dots c-1)$ as the positive examples
  and the examples of classes $j > i$ as the negative examples.
\end{definition}


\begin{figure}[t]
\begin{center}
\begin{minipage}{0.49\textwidth}
\begin{center}
  \resizebox{50mm}{!}{\includegraphics{multi-class.eps}}

(a) \textit{Multi-class learning}\newline one classifier that separates all
  classes.
\end{center}
\end{minipage}
\end{center}

\bigskip

%\begin{center}
\begin{minipage}{0.45\textwidth}
\begin{center}
 \resizebox{50mm}{!}{\includegraphics{unordered.eps}}

\bigskip
(b) \textit{Unordered learning}\newline $c$ classifiers, each separates one class
  from all other classes. Here: \textsf{+} against all
  other classes.
\end{center}
\end{minipage}
%\end{center}
%\begin{center}
\hfill
\begin{minipage}{0.48\textwidth}
\begin{center}
 \resizebox{49mm}{!}{\includegraphics{roundrobin.eps}}

\bigskip
\bigskip
\bigskip
(c) \textit{Round robin learning}\newline $c(c-1)/2$ classifiers, 
  one for each pair of classes. Here: $+$ against $\sim$.
\end{center}
\end{minipage}
%\end{center}

\caption[]{Unordered and round robin binarization for a 6-class problem.}
\label{fig:roundrobin}
\end{figure}

Note that ordered class binarization imposes an order on the induced
classifiers, which has to be adhered to at classification time: the
classifier learned for discriminating class 1 from classes $2 \dots c$
has to be called first. If this classifier classifies the example as
belonging to class 1, no other classifier is called; if not, the
example is passed on to the next classifier. Unordered class
binarization, on the other hand, has to call each of its constituent
binary classifiers and requires some external criterion for combining
the individual predictions into a final prediction. Typical decoding
rules vote the predictions of the individual classifiers, possibly by
taking into account the confidences of the predictions (cf.\ Section~\ref{sec:issues}). 


\section{Round Robin Classification}
\label{sec:round-robin}

In this section, we will discuss a more complex class binarization
procedure, the pairwise classifier. The basic idea is quite simple,
namely to learn one classifier for each pair of classes. In analogy to
round robin tournaments in sports and games, in which each participant
is paired with each other participant, we call this procedure
\emph{round robin binarization}.\footnote{%
  The etymology of this phrase is interesting and unclear. According
  to \emph{The Shorter Oxford English Dictionary} (1973), the phrase
  has at one time or another referred to the fish \emph{decapterus
    punctatus}, to the Holy Sacrament (in a blasphemous way), and to
  written complaints that were signed in a circular way to disguise
  the order in which the subscribers have signed the petition.  A
  posting on the usenet group \url{uk.culture.language.english}
  (thanks to Foster Provost for this information) claims that its
  current use in games and sports goes back to these circular
  signatures, and that the phrase originates from the French word
  \emph{ruban}, ribbon.  Finally---on a Web-site that is dedicated to
  analyzing lyrics from the \emph{Grateful Dead}---Psychology
  Professor Tom Malloy (Rhode Island College) points out that the use
  of round robin in the song \emph{The Wheel} might also refer to a
  research design used in biology and psychology where pairwise
  reactions of a group of subjects to each other's behaviors (e.g.,
  smiling) are measured.}
% http://arts.ucsc.edu/GDead/AGDL/wheel.html
% http://www.rain.org/~mkummel/stumpers/09feb01a.html
% http://members.tripod.com/philomanshomepage/word5.htm

\begin{definition}[round robin/pairwise binarization]
  The \emph{round robin} or \emph{pairwise} class binarization
  transforms a $c$-class problem into $c(c-1)/2$ two-class problems
  $<\!\!i,j\!\!>$, one for each set of classes $\{i,j\}, i= 1 \dots
  c-1$, $j = i+1 \dots c$. The binary classifier for problem
  $<\!\!i,j\!\!>$ is trained with examples of classes $i$ and $j$,
  whereas examples of classes $k \neq i,j$ are ignored for this
  problem.
\end{definition}

\enlargethispage*{12pt} Round robin binarization is illustrated in
Figure~\ref{fig:roundrobin}. For the 6-class problem shown in
Figure~\ref{fig:roundrobin}(a), the round robin procedure learns 15
classifiers, one for each pair of classes.
Figure~\ref{fig:roundrobin}(c) shows the classifier $<\!\!+,\sim>$. In
comparison, Figure~\ref{fig:roundrobin}(b) shows one of the
classifiers for the unordered class binarization, namely the one that
pairs class $+$ against all other classes. It is obvious that in the
round robin case, the base classifier uses fewer examples and thus has
more freedom for fitting a decision boundary between the two classes.
In fact, in our example, all binary classification problems of the
round robin binarization could be solved with a simple linear
discriminant, while neither the multi-class problem nor its unordered
binarization have a linear solution. The phenomenon that pairwise
decision boundaries can be considerably simpler than those originating
from unordered binarization has also been observed in real-world
domains. For example, \citet{StepwiseNN-DigitRecognition} observed
that the classes of a digit recognition task were pairwise linearly
separable, while the corresponding one-against-all task was not
amenable to single-layer networks. The results of
\citet{MultiClass-SVM-Comparison}, who obtained a larger advantage of
round robin binarization over unordered binarization for support
vector machines with a linear kernel than for support vector machines
with a non-linear kernel on several benchmark problems, could also be
explained by simpler decision boundaries in the round robin case.

A crucial point, of course, is how to decode the predictions of the
pairwise classifiers to a final prediction. We implemented a simple
voting technique: when classifying a new example, each of the learned
base classifiers determines to which of its two classes the example is
more likely to belong to. The winner is assigned a point, and in the
end, the algorithm will predict the class that has accumulated the
most points. We break ties by preferring the class that is more
frequent in the training set (or flipping a coin if the frequencies
are equal). 
%
Note that some examples will be forced to be classified erroneously by
some of the binary base classifiers because each classifier must label
all examples as belonging to one of the two classes it was trained on.
Consider the classifier shown in Figure~\ref{fig:roundrobin}(c): it
will arbitrarily assign all examples of class \textsf{o} to either $+$
or $\sim$ (depending on which side of the decision boundary they are).
In principle, such ``unqualified'' votes may lead to an incorrect
final classification. However, the votes of the five classifiers that
contain examples of class \textsf{o} should be able to overrule the
votes of the other ten classifiers, which pick one of their two
constituent classes for each \textsf{o} example.  If the class values
are independent, it is unlikely that all classifiers would unanimously
vote for a wrong class. However, the likelihood of such a situation
could increase if there is some similarity between the correct class
and some other class value (e.g., in problems with a hierarchical
class structure).
%\footnote{This observation is due to one of the
%  anonymous reviewers.}
In any case, if the five \textsf{o}
classifiers unanimously vote for \textsf{o}, no other class can
accumulate five votes (because they lost their direct match against
\textsf{o}). Nevertheless, this simple voting procedure is certainly
suboptimal. We will discuss alternative decoding techniques 
in Section~\ref{sec:issues}. 

In the above definition, we assume that the problem of discriminating
class $i$ from class $j$ is identical to the problem of discriminating
class $j$ from class $i$. This is the case if the base learner is
\emph{class-symmetric}. Rule learning algorithms, however, need not be
class-symmetric. Many of them choose one of the two classes as the
default class, and learn only rules to cover the other class. In such
a case, $<\!\!i,j\!\!>$ and $<\!\!j,i\!\!>$ may be two different
classification problems, if $j$ is used as the default class in the
former, and $i$ in the latter. A straightforward approach for
addressing this problem is to play a so-called double round robin, in
which separate classifiers are learned for both problems,
$<\!\!i,j\!\!>$ and $<\!\!j,i\!\!>$.\footnote{Another approach to
  tackle this problem could be to ensure that each class is used as
  the default class in approximately half of the classification
  problems.}


\begin{definition}[double round robin]
  The \emph{double round robin} class binarization transforms a
  $c$-class problem into $c(c-1)$ two-class problems $<\!\!i,j\!\!>$,
  one for each pair of classes $(i,j), i,j = 1 \dots c$, $j \neq i$.
  The examples of class $i$ are used as the positive examples and the
  examples of class $j$ as the negative examples.
\end{definition}

\enlargethispage*{24pt}

In the following section, we will evaluate a double round robin as an
alternative binarization strategy for the rule learner \ripper. 

\begin{table}[t]
\begin{small}
  \caption[]{\emph{Datasets used.} The first two columns show the training and 
    test set sizes (as specified in the description of the datasets),
    the next three columns show the number of symbolic and numeric
    attributes as well as the number of classes. The last column shows 
    the default error, i.e., the error one would get by always
    predicting the majority class.}
  \label{tab:datasets}
\begin{center}
\begin{tabular}{||l|rr|rrr|r||} 
\hline\hline
name             & train & test & sym & num & classes & def. error \\
\hline
abalone          & 3133 &  1044 &  1 &  7 & 29 & 83.5 \\ 
car              & 1728 &   --- &  6 &  0 &  4 & 30.0 \\
covertype    & 15,120 & 565,892 & 44 & 10 &  7 & 51.1 \\ 
glass            &  214 &   --- &  0 &  9 &  7 & 64.5 \\
image            & 2310 &   --- &  0 & 19 &  7 & 85.7 \\
letter         & 16,000 &  4000 &  0 & 16 & 26 & 95.9 \\
lr spectrometer  &  531 &   --- &  1 &101 & 48 & 89.6 \\
optical          & 5620 &   --- &  0 & 64 & 10 & 89.8 \\
page-blocks      & 5473 &   --- &  0 & 10 & 5  & 10.2 \\
sat              & 4435 &  2000 &  0 & 36 & 6  & 76.2 \\
shuttle        & 43,500 &14,500 &  0 &  9 & 7  & 21.4 \\
solar flares (c) & 1389 &   --- & 10 &  0 &  8 & 15.7 \\
solar flares (m) & 1389 &   --- & 10 &  0 &  6 &  4.9 \\
soybean          &  683 &   --- & 35 &  0 & 19 & 94.1 \\
thyroid (hyper)  & 3772 &   --- & 21 &  6 &  5 &  2.7 \\
thyroid (hypo)   & 3772 &   --- & 21 &  6 &  5 &  7.7 \\
thyroid (repl.)  & 3772 &   --- & 21 &  6 &  4 &  3.3 \\
vehicle          &  846 &   --- &  0 & 18 &  4 & 74.2 \\
vowel            &  528 &   462 &  0 & 10 & 11 & 90.9 \\
yeast            & 1484 &   --- &  0 &  8 & 10 & 68.8 \\
\hline\hline
\end{tabular}
\end{center}
\end{small}
\end{table}


\section{Accuracy}
\label{sec:accuracy}

In this section, we will briefly present an experimental evaluation of
round robin binarization in a rule learning context. We chose \ripper\ 
\citep{Ripper} as the base classifier, which---in our view---is the
most advanced algorithm of the family of separate-and-conquer (or
covering) rule learning algorithms \citep{jf:MLJ,jf:AI-Review}. 

The unordered and ordered binarization procedures were used as
implemented within \ripper. Ordered binarization is \ripper's default
mode, and unordered binarization can be invoked using the option
\texttt{\mbox{-a unordered}}. The round robin binarization was
implemented as a wrapper around \ripper, which provided it with the
appropriate training sets. The wrapper was implemented in
\texttt{perl} and had to communicate with \ripper\ by writing the
training sets to and reading \ripper's results from the disk. This
implementation is referred to as \rcubed\ (round robin ripper). 

\enlargethispage*{12pt} In principle, \ripper\ is a class-symmetric
learner. It will treat the larger class of a two-class problem as the
default class and learn rules for the smaller class. Although this
procedure is class-symmetric (problem $<\!\!i,j\!\!>$ is converted to
$<\!\!j,i\!\!>$ if $|c_i| > |c_j|$), we felt that it would not be
fair.  For example, the largest class in the multi-class problem would
be used as the default class in all round robin problems. This may be
an unfair advantage (or disadvantage) to this class.\footnote{The
  situation can be compared to a chess player that has to play all of
  his games with the same color or a football team that is allowed to
  play all (or none) of its games in the home stadium.  This can make a
  decisive difference and may invalidate the final result of the
  tournament.} For this reason, \rcubed\ implements a double round
robin binarization which calls \ripper\ with the option
\texttt{\mbox{-a given}} on each binary problem $<\!\!i,j\!\!>$ ($i,j= 1
\dots c, i\neq j$). This
instructs \ripper\ to use the classes in the order specified by the
user (i.e., the order in which they appear in the names file). Hence,
$<\!\!i,j\!\!>$ and $<\!\!j,i\!\!>$ are two different problems, which
ensures that each class is the default class in exactly half of its
binary classification problems. Note, that this procedure is basically
identical to the one that is employed by \ripper\ if it is used in
unordered mode on a two-class problem except that \ripper\ would
tie-break immediately between the theories learned for $<\!\!i,j\!\!>$
and $<\!\!j,i\!\!>$, while we first collect all votes from all
$c(c-1)$ binary problems.


\begin{table}[t]
\begin{small}
\caption[]{\emph{Error rates}:
  The first two column pairs show the results of \ripper\ (in
  unordered and in default, ordered mode). For both, we show the
  absolute error rate, and the improvement rate of \rcubed\ over the
  algorithm.  \rcubed's error rates are shown in the 5th column. The
  last column shows whether the difference between \rcubed\ and
  default \ripper\ (ordered) is significant (++ if $p > 0.99$, + if $p
  > 0.95$, McNemar test).  The first part of the table shows the
  results for cross-validation (17 sets). To allow a comparison with
  previous works, the last 6 lines show hold-out estimates for the six
  datasets with a designated test set (3 datasets occur in both
  blocks). The line in the middle shows the geometric averages of the
  improvement ratios.}
\label{tab:results}
\begin{center}
\begin{tabular}{||l|rc|rc|rc||}
\hline\hline
& \multicolumn{4}{c|}{\ripper} &  \multicolumn{1}{c}{\rcubed} & McNemar\\
dataset  & 
\multicolumn{1}{c}{unord.} & ratio & \multicolumn{1}{c}{ordered} & ratio & 
& test \\
\hline
\hline                                                                     
abalone              &81.64 & \emph{0.911}&  81.18 & \textit{0.916}& 74.34  & ++\\
car                  & 5.79 & \emph{0.390}&  12.15 & \textit{0.186}&  2.26  & ++\\
glass                & 35.51& \emph{0.724}&  34.58 & \textit{0.743}& 25.70  & ++\\
image                & 4.15 & \emph{0.823}&   4.29 & \textit{0.808}&  3.46  & + \\
lr spectrometer      & 64.22& \emph{0.827}&  61.39 & \textit{0.865}& 53.11  & ++\\
optical              & 7.79 & \emph{0.479}&   9.48 & \textit{0.394}&  3.74  & ++\\
page-blocks          &  2.85& \emph{0.968}&   3.38 & \textit{0.816}&  2.76  & ++\\
sat                  &13.18 & \emph{0.785}&  13.04 & \textit{0.794}& 10.35  & ++\\
solar flares (c)     & 15.91& \emph{0.991}&  15.91 & \textit{0.991}& 15.77  & = \\
solar flares (m)     &  4.90& \emph{1.029}&   5.47 & \textit{0.921}&  5.04  & = \\
soybean              &  8.79& \emph{0.717}&   8.79 & \textit{0.717}&  6.30  & ++\\
thyroid (hyper)      & 1.25 & \emph{0.893}&   1.49 & \textit{0.749}&  1.11  & + \\
thyroid (hypo)       & 0.64 & \emph{0.833}&   0.56 & \textit{0.955}&  0.53  & = \\
thyroid (repl.)      & 1.17 & \emph{0.863}&   0.98 & \textit{1.026}&  1.01  & = \\
vehicle              &28.25 & \emph{1.029}&  30.38 & \textit{0.957}& 29.08  & = \\
vowel                &30.50 & \emph{0.633}&  27.07 & \textit{0.690}& 18.69  & ++\\
yeast                &44.00 & \emph{0.949}&  42.39 & \textit{0.986}& 41.78  & = \\
\hline
average              &      & \emph{0.787}&        & \textit{0.747}& & \\
\hline\hline
abalone              & 81.03& \emph{0.901}& 82.18 & \textit{0.888}& 72.99  & ++\\ 
covertype            & 35.37& \emph{0.939}& 38.50 & \textit{0.862}& 33.20  & ++\\
letter               & 15.22& \emph{0.516}& 15.75 & \textit{0.498}&  7.85  & ++\\
sat                  & 14.25& \emph{0.782}& 17.05 & \textit{0.654}& 11.15  & ++\\
shuttle              & 0.03 & \emph{0.667}&  0.06 & \textit{0.375}&  0.02  & = \\
vowel                & 64.94& \emph{0.823}& 53.25 & \textit{1.004}& 53.46  & = \\
% {\small 21.80} & {\small 21.90} & {\small 18.52} & 
% \textit{0.747} & \\
\hline\hline
\end{tabular}
\end{center}
\end{small}
\end{table}

% \enlargethispage*{12pt}
Table~\ref{tab:datasets} shows the 20 datasets we used in this study.
They were chosen arbitrarily among datasets with $\geq 4$ classes
available at the UCI repository \citep{UCI}.\footnote{The restriction
  to 4 or more classes was made because on 3-class problems, we would
  expect frequent 3-way ties, which are not yet handled very cleverly.
  The issue of ties is discussed further below in the paper
  (Section~\ref{sec:issues}).} The implementation of the algorithm was
developed independently and not tuned on these datasets. On the six
sets with a dedicated test set, we report the error rate on these test
sets. On the other 14 sets, we estimated the error rate using paired,
stratified 10-fold cross-validations. For \textsl{abalone},
\textsl{sat} and \textsl{vowel} we performed both a test set
evaluation and a cross-validation.\footnote{As we can see in
  Table~\ref{tab:results}, there are no qualitative differences
  between hold-out and cross-validation for \textsl{abalone} and
  \textsl{sat}. For \textsl{vowel} the performance of all algorithms
  was significantly better in the case of cross-validation, which may
  indicate that the 528 examples in the original training set are
  insufficient for learning a good concept.}

Table~\ref{tab:results} shows the accuracies of \ripper\ (unordered
and ordered) and \rcubed\ on the selected datasets.  On half of the 20
experiments (not counting the cross-validated trials of the three sets
the re-appear at the bottom), \rcubed\ is significantly better ($p >
0.99$ on a McNemar test\footnote{The McNemar test \citep{McNemar-Test}
  tests for significant differences of proportions in paired sample
  designs. It is appropriate for comparing classifiers because it does
  not assume independent samples and has a comparably low Type I error
  \citep{ComparativeStudies,5x2CV}.}) than \ripper's default mode
(ordered binarization). There were only two experiments
(\textsl{thyroid (repl.)} and the test-set version of \textsl{vowel}),
where \rcubed\ is worse than \ripper, both differences being
insignificant. On the 17 cross-validated problems, \rcubed\ reduces
the average error to 75\% of the error of \ripper's error.\footnote{As
  these are relative performance measures, we use a geometric average
  so that $x$ and $1/x$ average to 1.} The comparison to
unordered \ripper\ is similar (the significance levels for this case
are not shown).


We can safely conclude that round robin binarization may result in
significant improvements over ordered or unordered binarization
without having a high risk of decreasing performance.


\section{Efficiency}
\label{sec:efficiency}

At first sight, it appears to be a questionable idea to replace $c$
binary learning tasks (unordered binarization) with $c(c-1)/2$ binary
learning tasks (round robin binarization) because the quadratic
complexity seems to be prohibitive for tasks with more than a few
classes.
% not true, they will cite that it is ``certainly more efficient''
% In fact, this argument is frequently held against pairwise
% classification in the literature (e.g., \citealt{MultiClass-SVM}).
This section will illustrate that this is not the case.


\subsection{Theoretical Considerations}
\label{sec:theoretical}

In this section, we will see that although (single) round robin
classification turns a single $c$-class learning problem into
$c(c-1)/2$ two-class problems, the total training effort is only
linear in the number of classes and smaller than the effort needed for
an unordered binarization.  The analysis is independent of the type of
base learning algorithm used, although we will show that the advantage
increases with the computational complexity of the algorithm. Some of
the ideas have already been sketched in a short paragraph by
\citet{PolychotomousClassification-TR}, but we go into considerably
more detail and, in particular, focus on the comparison to
conventional class binarization techniques. 

In the following, we assume a base learner with a time complexity
function $f(n)$, i.e., the time needed for learning a classifier from
a training set with $n$ examples is $f(n)$. Note that we
% will have to
interpret this as an \emph{exact} function, and not as an
asymptotically tight bound as in $\Theta(f(n))$ because we are not
interested in asymptotic behavior, but in the exact complexity at a
given training set size $n$.\footnote{This is a very strong assumption
  that will in general not hold up in practice. Strictly speaking, the
  time needed for training a classifier does not only depend on the
  sample size, but also on the domain and the ``difficulty'' of the
  sample for the learner. Even in the same domain, samples of equal
  sizes may yield theories of different complexities, and more complex
  theories will in general require longer training times. One way to
  incorporate this aspect could be to interpret our complexity
  functions as average run-times over all possible subsets of this
  size (in a given domain). However, we believe that making the
  simplifying assumption of an exact complexity function does not
  invalidate our claims, which are also backed up by empirical results
  (Section~\ref{sec:empirical-efficiency}).} We will consider
functions of the form $f(n) = \lambda n^p$ $(p \geq 1, \lambda > 0)$
and denote such a function with $f_p$. We use $b|f$ to denote a class
binarization with algorithm $b$, where an base learner with complexity
$f(n)$ is applied to each binary problem. Unless mentioned otherwise,
all results refer to single round robin binarizations of problems with
more than two classes ($c > 2$).

\enlargethispage{6pt}

\begin{definition}[class penalty]
%  If 
% the base learner has complexity $f(n)$ on a binary
%  learning problem of size $n$, and the 
  Let $g_{b|f}(c,n)$ be the total complexity for using a learner
  with complexity $f(n)$ on a problem with $c > 2$ classes that has
  been class-binarized using binarization algorithm $b$. We then
  define the \emph{class penalty} function $\pi_{b|f}$ as
  \[
  \pi_{b|f}(c,n) = \frac{g_{b|f}(c,n)}{f(n)}
  \]
\end{definition}

Intuitively, the class penalty $\pi_{b|f}(c,n)$ measures the
performance of an algorithm on a class binarized $c$-class problem
relative to its performance on a single two-class problem the same
size $n$. In the following, it will turn out that in some cases the
class penalty function is independent of $n$ or $f$. In such
cases, we will abbreviate the notation as $\pi_b(c)$.


%First, we look at the class penalty $\pi_u$ of unordered class
%binarization: 

\begin{lemma}[class penalty for unordered class binarization]
\ \\
  \(\pi_u(c) = c\)
  \label{lemma:pi_u}
\end{lemma}

\begin{proof}
  There are $c$ learning tasks, each using the entire training set of
  $n$ examples. Hence the total complexity $g_{u|f}(c,n) = c f(n)$, and
  $\pi_u(c) = c$.
\end{proof}
%
%\begin{lemma}[total number of examples for single round robin]
%\ \\
%  The sum of examples in all training sets of a single round robin
%  class binarization is $(c-1)n$.
%  \label{lemma:n_examples}
%\end{lemma}

%\begin{proof}
%  Each example of the original training set will occur exactly once in
%  each of the $c-1$ binary tasks where its class is paired against one
%  of the other $c-1$ classes. As there are $n$ examples in the
%  original training set, the total number of examples is $(c-1)n$.
%\end{proof}

%Note that this number is less than the total number of training
%examples in the unordered class binarization (i.e., $cn$).

\vspace*{-12pt}

\begin{lemma}[class penalty for single round robin, linear base
  algorithm]
\ \\
For a base learner with a linear run-time complexity $f_1(n) = \lambda
n$: $\pi_{r|f_1}(c) = c - 1$.
 \label{lemma:linear}
\end{lemma}

\begin{proof}
  Each example of the original training set will occur exactly once in
  each of the $c-1$ binary tasks where its class is paired against one
  of the other $c-1$ classes. As there are $n$ examples in the
  original training set, the total number of examples is $(c-1)n$.
%
%  Lemma~\ref{lemma:n_examples} shows that the learner has to process a 
%  total of $(c-1)n$ examples. 

  As $f$ is linear, the sum of the complexities on all individual
  training sets is equal to the complexity of the algorithm on the sum 
  of all training set sizes, i.e., $\sum f_1(n_i) = f_1(\sum n_i)$. Hence,
% , we can directly look at the total number of examples:
\[g_{r|f_1}(c,n) = f_1((c-1)n) =
  \lambda (c-1)n = (c-1)(\lambda n) = (c-1)f_1(n)\]
Therefore $\pi_{r|f_1}(c) = c-1$.
\end{proof}

This analysis ignores a possible constant overhead of the
algorithm, which potentially affects $c(c-1)/2$ function calls in
the round robin case, while it only affects $c$ function calls in the
unordered case. However, some significant overhead costs, like reading
in the training examples (and similar initialization steps) need, of
course, only be performed once if the round robin procedure is
performed in memory (which was not the case in the implementation
which we used for the experimental results reported in the next
section). If there is an overhead $\mu$ to be considered (i.e.,
$f(n) = \lambda n + \mu$), the total costs will be increased by
$\mu c(c-1)/2$. For very large values of $c$, these
quadratic costs may outweigh the savings, but under reasonable
assumptions (e.g., $c^2 < n$) these additive costs should not matter,
in particular---as we shall see in the following---not in the case of
super-linear base algorithms.

\begin{lemma}[class penalty for single round robin, super-linear
  base algorithm]
\ \\
For $p > 1$: 
  $\pi_{r|f_p}(c,n) < 
  \left\{ \begin{array}{cl} c-1 & \textrm{if $c$ is even}\\
                       c & \textrm{if $c$ is odd} \end{array}
   \right.$
\label{lemma:pi_r}
\end{lemma}

\begin{proof} 
%by induction. Without loss of generality, let us assume
%  $\lambda = 1$ ($\lambda$ can be pulled out of the sums in the subsequent
%  transformations)

%\medskip
%\noindent
%  $c = 4$: We have 4 classes with $n_i > 0$ examples each, $\sum_{i=1}^{4}
%  n_i = n$. The total effort is:
%\begin{eqnarray*}
%& & (n_1 + n_2)^o + (n_1 + n_3)^o + (n_1 + n_4)^o + (n_2 + n_3)^o + 
%(n_2 + n_4)^o + (n_3 + n_4)^o \\
%&=& [ (n_1 + n_2)^o + (n_3 + n_4)^o ] +  [ (n_1 + n_3)^o + (n_2 + n_4)^o ]
%+ [ (n_1 + n_4)^o + (n_2 + n_3)^o ] \\
%&<& 3 (n_1 + n_2 + n_3 + n_4)^o < 3 n^o = 3 f(n)
%\end{eqnarray*}
%The two inequalities hold because $o > 1$ and $n_i > 0$. Hence
%$\pi_r(4) < 3$.

%\medskip
%\noindent
%$c > 4$: We have $c$ classes with $n_i$ examples, $\sum_{i=1}^{c} n_i
%= n$. Let us assume that $\pi_r(c-1) < c-2$ and remove all examples of 
%class $n_c$ (without loss of generality). Then the total effort for the
%remaining $n - n_c$ examples is $< (c-2) f(n)$.

%\noindent
%The additional effort required for adding class $n_c$ is to pair it
%with each other class.

\noindent                        
  Assume we have $c$ classes, class $i$ has $n_i$ examples, $\sum_{i=1}^{c} n_i = n$.

\medskip
\noindent
$c$ is even:\\
In this case, we can arrange the learning tasks in the form of $c-1$
rounds. Each round consists of $c/2$ disjoint pairings, i.e.,
each class occurs exactly once in each round, and it has a different
partner in each round. Such a tournament schedule is always
possible.\footnote{A simple algorithm for constructing such a
  tournament is to fix one player, say the one with index $c$ (assume
  $c$ is even, add a dummy player if it is not). Then construct a
  tournament schedule for the first round by pairing player $i$ with
  player $(c+1)-i$, $i = 1 \dots c/2$.  Subsequent rounds are
  played with the same pairings, but before each round, the first
  $c-1$ entries rotate places, i.e., player $i$ takes over the place
  of $i+1$ ($i = 1\dots c-2$) and $c-1$ moves to 1. It is both, fairly
  trivial to see that this is correct, and quite easy to put into
  practice when organizing a round robin chess tournament for kids
  (let one stick to its seat and all others move around one seat at a
  time).}  Without loss of generality, consider a round where classes
$2i$ are paired against classes $2i-1$.  The complexity of this round
is $\sum_{i=1}^{c/2} f_p(n_{2i} + n_{2i-1})$. As for $p > 1$ and
$a_i > 0$, $i = 1\dots N$ it holds that $\sum_i a_i^p < (\sum_i a_i)^p$, and
because we assumed $c > 2$, we get
  \[ \sum_{i=1}^{\frac{c}{2}} f_p(n_{2i} + n_{2i-1}) <
  f_p(\sum_{i=1}^{\frac{c}{2}} n_{2i} + n_{2i-1}) = f_p(\sum_{i=1}^c n_i) = 
  f_p(n)\]
  Analogously, we can derive the same upper bound for each of the
  $c-1$ rounds. Thus the total complexity of the round robin
  binarization $g_{r|f_p}(c,n) < (c-1) f_p(n)$ and $\pi_{r|f_p}(c,n) < c-1$.

\medskip
\noindent
$c$ is odd:\\
 we add a dummy class with $n_{c+1} = 0$
  examples, and perform a tournament as above. As this tournament has 
  $c$ rounds, $\pi_{r|f_p}(c,n) < c$.

% Der Beweis mit der Idee das
%
%  exampand organize a tournament as above. Note that this
%  tournament has $c$ rounds, but in each round one of the classes is
%  paired against the dummy class. 

%  Without loss of generality, consider a round where classes $2i$ are
%  paired against classes $2i-1$.  The complexity of this round is
%  \[ \sum_{i=1}^{\frac{c+1}{2}} f(n_{2i} + n_{2i-1}) = 
%     \sum_{i=1}^{\frac{c-1}{2}} f(n_{2i} + n_{2i-1}) + f(n_c +
%     n_{c+1}) < 
%     f(\sum_{i=1}^{\frac{c-1}{2}}(n_{2i} + n_{2i-1})) + f(n_c) = 
%     f(n-n_c) + f(n_c) \]

\end{proof}
Note that the upper bounds in Lemma~\ref{lemma:pi_r} are not tight
(see also Theorems~\ref{theorem:class-balanced}
and~\ref{theorem:lim_o} below). In particular, the bound of $c-1$
should also hold for odd numbers but the proof for that case seems to
be considerably more tricky.\footnote{The straightforward proof idea
  of deriving an upper bound for all regular pairings of each round
  (excluding the pairing with the dummy class) does not go through
  because the inequality $\sum_{i=1}^c (n-n_i)^p < (c-1) n^p$ does not
  hold in general.} However, the current version suffices to prove the
following theorem:

\begin{theorem}[efficiency of round-robin and unordered binarization]
\ \\
  For algorithms with at least linear complexity ($p \geq 1$):
  $\pi_{r|f_p}(c,n) < \pi_{u|f_p}(c,n)$, i.e., single round
  robin is more efficient than unordered binarization.
  \label{theorem:relative-efficiency}
\end{theorem}

\begin{proof}
  Follows immediately from Lemmas~\ref{lemma:pi_u},~\ref{lemma:linear}
  and~\ref{lemma:pi_r}.
\end{proof}

As mentioned above, the bounds used for proving the lemmas are
certainly not tight. This becomes obvious, if we look at problems with 
a uniform class distribution.

\begin{theorem}[class penalty for class-balanced problems]
\ \\
  For a class-balanced problem, 
  $\pi_{r|f_p}(c) =  (c-1)(\frac{2}{c})^{p-1}$
\label{theorem:class-balanced}
\end{theorem}

\begin{proof}
In the (single) round robin case, we have $c(c-1)/2$ problems with
$2n/c$ examples each. Hence the total complexity is 
\begin{eqnarray*}
g_{r|f_p}(c,n) & = & \frac{c(c-1)}{2} f_p\left(2\frac{n}{c}\right) = (c-1)\frac{c}{2}
\lambda\left(2\frac{n}{c}\right)^p = 
(c-1)\left(\frac{2}{c}\right)^{p-1}\lambda n^p = \\
& = & (c-1)\left(\frac{2}{c}\right)^{p-1} f_p(n)
\end{eqnarray*}
Therefore $\pi_{r|f_p}(c) =  (c-1)(\frac{2}{c})^{p-1}$. 
\end{proof}

From this result, it is easy to see that $\pi_{r|f_p}(c,n)$ decreases
with increasing complexity order $p$ of the base learner (assuming $c
> 2$). Likewise, for $p > 2$,  $\pi_{r|f_p}(c,n)$ can be made arbitrarily small
by increasing the number of classes $c$. While the latter property is
hard to generalize for arbitrary class distributions---it is not the
case that every problem with more than $c$ classes has a smaller class
penalty than an arbitrary $c$-class problem (and vice versa)---it is
easy to prove the following theorem:

\begin{theorem}[class penalty ratio for increasing order of base
  algorithm]
\ \\
%  Let $\pi(c,p)$ denote the class penalty for an algorithm of
%  complexity $f(n) = \lambda n^p$. Then
For $c > 2$, the class penalty ratio
$\frac{\pi_{r|f_p}(c,n)}{\pi_{u|f_p}(c,n)}$ is monotonically
decreasing with $p$ and
  \[\lim_{p\rightarrow\infty}\frac{\pi_{r|f_p}(c,n)}{\pi_{u|f_p}(c,n)} = 0\]\label{theorem:lim_o}
\end{theorem}
\vspace*{-5mm}
\begin{proof}
The total effort for the single round robin is 
$\sum_{i=1}^{c-1} \sum_{j=i+1}^c f_p(n_i + n_j)$. Hence the class penalty
ratio is:
\[
\frac{\pi_{r|f_p}(c,n)}{\pi_{u|f_p}(c,n)} = 
% \frac{\pi_r(c,p)f(n)}{\pi_u(c,p)f(n)} =
\frac{g_{r|f_p}(c,n)}{g_{u|f_p}(c,n)} = 
\frac{\sum_{i=1}^{c-1} \sum_{j=i+1}^c f_p(n_i + n_j)}{c f_p(n)} = 
\frac{1}{c}\sum_{i=1}^{c-1} \sum_{j=i+1}^c \frac{ f_p(n_i + n_j)}{f_p(n)} 
\]
Consequently,
\[ \lim_{p\rightarrow\infty}\frac{\pi_{r|f_p}(c,n)}{\pi_{u|f_p}(c,n)} = 
\lim_{p\rightarrow\infty}\frac{1}{c}\sum_{i=1}^{c-1} \sum_{j=i+1}^c 
\frac{\lambda (n_i + n_j)^p}{\lambda n^p} = 
\frac{1}{c}\sum_{i=1}^{c-1} \sum_{j=i+1}^c \lim_{p\rightarrow\infty}
\left( \frac{n_i + n_j}{n} \right)^p = 0\]
The last equation holds because $c > 2$ and $0 < n_i + n_j < n\;\; 
(i,j = 1\dots c \, ; \; i \neq j)$, hence 
$\frac{n_i + n_j}{n} < 1$.
For the same reasons $\left( \frac{n_i + n_j}{n} \right)^p > \left( \frac{n_i +
n_j}{n} \right)^{p+\epsilon}$ for all $\epsilon > 0$ and  $\frac{\pi_{r|f_p}(c,n)}{\pi_{u|f_p}(c,n)}$ is monotonically decreasing with $p$.
%
%  In the proof of theorem~\ref{theorem:pi_u} we derived an upper bound 
%  of $(c-1)f(n)$ for the total complexity of the round robin task.
%  Let us denote the error we made by this approximation with
%  $\epsilon(c,o)$. We then have
%  \begin{equation*}
%  \frac{\pi_r(c,o)}{\pi_u(c,o)} = 
%  \frac{(c-1)f(n) - \epsilon(c,o)}{cf(n)} = 
%  \frac{c-1}{c} - \frac{\epsilon(c,o)}{cf(n)} 
%  \end{equation*}
%  The error $\epsilon(c,o)$ is the sum of the errors $\epsilon_i(c,o)$ 
%  that were made at each of the $c-1$ rounds of the tournament
%  described in the proof of Theorem~\ref{theorem:pi_u}. Let us again,
%  without loss of generality, look at the error $\epsilon_i(c,o)$ of a
%  round where successive classes are paired together. Then 
%  \begin{eqnarray*}
%  \lim_{o\rightarrow\infty}\frac{\epsilon_i(c,o)}{f(n)} & = &
%  \lim_{o\rightarrow\infty}\frac{n^o - \sum_{i=1}^{\frac{c}{2}} (n_{2i} + n_{2i-1})^o}{n^o} \\
%  & = & 1 -  \sum_{i=1}^{\frac{c}{2}} \lim_{o\rightarrow\infty} \left(\frac{n_{2i} + n_{2i-1}}{n}\right)^o
%  = 1 
%  \end{eqnarray*}
%  because $n_{2i} + n_{2i-1} < n$. 
%  % 
%  As an analogous derivation works for all $c-1$ values of
%  $\epsilon_i(c,o)$, we have 
%  \[
%  \lim_{o\rightarrow\infty} \frac{\epsilon(c,o)}{f(n)} = \sum_{i=1}^{c-1}\lim_{o\rightarrow\infty} \frac{\epsilon_i(c,o)}{f(n)} = 
%  c-1
%  \]
%  Hence:
%  \[
%  \lim_{o\rightarrow\infty} \frac{\pi_r(c,o)}{\pi_u(c,o)} =  
%  \frac{c-1}{c} - \frac{1}{c} \lim_{o\rightarrow\infty} \frac{\epsilon(c,o)}{f(n)} = 
%  \frac{c-1}{c} - \frac{c-1}{c} =
%  0
%  \]
\end{proof}

%It is harder to formalize the increase in efficiency with an
%increasing number of classes because this property clearly on the
%class distribution in the example set. It is not the case that any
%problem with more than $c$ classes has a smaller class penalty than a
%$c$-class problem (and vice versa). However, the general tendency is
%decreasing. We will analyze this case for
%even number of classes.

%\begin{theorem}[class penalty ratio for increasing number of classes]
%\ \\
%  \[\lim_{c\rightarrow\infty}\frac{\pi_r(c)}{\pi_u(c)} = 0\]\label{theorem:lim_c}
%\end{theorem}
%\vspace*{-5mm}
%\begin{proof} (for even $c$)
%\\
%We first try to bound the error that we made in
%Lemma~\label{lemma:pi_r} by bounding $\sum_{i=1}^{\frac{c}{2}}
%f(n_{2i} + n_{2i-1})$ with $f(n)$ (and similarly for each of the $c-1$
%rounds in the round robin tournament among the classes). According to
%the binomial theorem:
%\[
%\left( \sum_{i=1}^c n_i \right)^o = \sum_{o_1+ \dots + o_c=o} \left(
%\begin{array}{c}
%n \\
%o_1, \dots, o_c
%\end{array}
%\right)
%\prod_{j=1}^c n_j^{o_j}
%= \sum_{i=1}^c n_i^o + \sum_{o_1+ \dots +o_c=o}^{o_1 \dots o_c \neq o} \left(
%\begin{array}{c}
%n \\
%o_1, \dots, o_c
%\end{array}
%\right)
%\prod_{j=1}^c n_j^{o_j}
%\]
%We bound the rightmost sum from below by replacing the number of classes 
%$n_j$ in the product to the right with the smallest class $n_{min}$.
%\[
%\sum_{o_1+ \dots +o_c=o}^{o_1 \dots o_c \neq o} \left(
%\begin{array}{c}
%n \\
%o_1, \dots, o_c
%\end{array}
%\right)
%\prod_{j=1}^c n_j^{o_j} > 
%n_{min}^o \sum_{o_1+ \dots +o_c=o}^{o_1 \dots o_c \neq o} \left(
%\begin{array}{c}
%n \\
%o_1, \dots, o_c
%\end{array}
%\right) 
%= n_{min}^o (c^o - c)
%\]
%The last equality holds because of $\sum_{o_1+ \dots +o_c=o}^{o_1 \dots o_c \neq o} \left(
%\begin{footnotesize}\begin{array}{c}
%n \\
%o_1, \dots, o_c
%\end{array}\end{footnotesize}\right) = c^o$. The $c$ terms where $o_1 \dots o_c \neq o$
%does not hold all evaluate to 1, so we have to subtract $c$.

%\medskip
%\noindent
%Using the same trick for bounding $n_{2i} + n_{2i-1}$ with $2n_{min}$
%yields
%\[ 
%\sum_{i=1}^{\frac{c}{2}} (n_{2i} + n_{2i-1})^o < n^o
%(2n_{min})^o(\left(\frac{c}{2}\right)^o - \frac{c}{2}) = n^o - cn_{min}^o (c^{o-1} - 2^{o-1})
%\]
%This bound can be derived for each of the $c-1$ rounds of
%Lemma~\label{lemma:pi_r}. 
%\begin{eqnarray*}
%\frac{\pi_r(c)}{\pi_u(c)} &=& \frac{(c-1)\sum_{i=1}^{\frac{c}{2}}
%  (n_{2i} + n_{2i-1})^o}{cn^o} < \frac{(c-1)n^o - c(c-1) n_{min}^o
%  (c^{o-1} - 2^{o-1})}{cn^0} \\
%&=& \frac{c-1}{c} - \left(\frac{n_{min}}{n}\right)^o (c-1) (c^{o-1} -
%2^{o-1})
%\end{eqnarray*}

%As $n_{min} < \frac{n}{c}$ we get
%\begin{equation*}
%\lim_{c\rightarrow\infty} 
%\left(\frac{n_{min}}{n}\right)^o (c-1) (c^{o-1} - 2^o) \leq 
%\lim_{c\rightarrow\infty} 
%\left(\frac{n}{cn}\right)^o (c-1) (c^{o-1} - 2^o) 
%\begin{equation*}

%% \lim_{c\rightarrow\infty}\frac{\pi_r(c)}{\pi_u(c)} \leq 


%\end{proof}

% \enlargethispage*{12pt}
As the decrease with increasing order $p$ is strictly monotonic, this
theorem implies that the more expensive a learning algorithm is, the
larger will be the efficiency gain for using round robin binarization
instead of unordered binarization.  In particular, it may be the case
that for expensive algorithms, even a \emph{double} round robin is
faster than unordered binarization (in fact, it is easy to see that
Theorem~\ref{theorem:lim_o} also holds for double round robin
binarization) or may be even faster than ordered binarization.  
% It is easy to see this for problems with a uniform class distribution.
Assume, e.g., an algorithm with a quadratic complexity on a
class-balanced eight-class problem (i.e., $p=2$ and $c = 8$).
According to Theorem~\ref{theorem:class-balanced} and
Lemma~\ref{lemma:pi_u}:
\[ 
\frac{\pi_{r|f_p}(c)}{\pi_{u|f_p}(c)} =
\frac{(c-1)(\frac{2}{c})^{p-1}}{c} = \frac{7(\frac{1}{4})^1}{8} = 
\frac{7}{32} < \frac{1}{4}
\]
Thus, under these circumstances, the single round robin is more than
four times faster than the unordered approach. Considering that the
double round robin is twice as slow as the single round robin, and assuming 
that the ordered approach is twice as fast as the unordered approach
(see the following section for empirical values on that), the double
round robin may in this case be faster than the ordered approach.
This scenario will be empirically evaluated in the following section.

% \enlargethispage*{12pt}
\subsection{Empirical Evaluation}
\label{sec:empirical-efficiency}

Contrary to the theoretical analysis in the previous section, where we
focussed on the ``friendly'' case of pairing unordered binarization
vs.\ single round robin, our empirical results compare the performance
of a double round robin binarization with \ripper\ as a base learner
against both ordered binarization (\ripper's default mode) and
unordered binarization (\ripper\ with the parameters \texttt{-a
  unordered}). In the case of a linear algorithm complexity, the
double round robin should be about two times slower than the unordered
binarization and four times slower than the ordered binarization. As
discussed in the previous section, \ripper's super-linear
run-time\footnote{The complexity of \rippersmall's initial rule
  learning and pruning phase is $O(n\log^2(n))$
  \citep{jf:ML-94,Ripper}.  Thereafter \rippersmall\ performs two
  phases of optimization, which---according to the experimental
  evidence shown in \citep{Ripper}---only add a constant factor to the
  overall complexity. 
% However, in very large domains, like text
%  domains, our own experience is that this optimization can slow down
%  the algorithm considerably, and seems to dominate the run-time (but
%  we have not performed a thorough analysis of this issue).
  } might improve the relative performance of round robin learning.


\begin{table}[t!]
\begin{small}
\caption[]{\emph{Runtime results:} The first column shows the 
  average run-times (in CPU secs.\ user time) of \rcubed. The
  subsequent columns show the ratio of \rcubed\ over unordered and
  ordered \ripper. The reported run-times are average training time
  over all folds of a 10-fold cross-validation.
% Run-times are for training time only.
%, except for the five
%  results from hold-out sets at the bottom. 
  The line % not measured % in the middle 
  at the bottom shows the geometric average of the 17 cross-validated
  performance ratios.
\label{tab:efficiency}
}
\begin{center}
\begin{tabular}{||l|rcc||}
\hline\hline
dataset  & 
\multicolumn{1}{c}{\rcubed} & 
vs.\ unordered & 
vs.\ ordered \\
\hline
abalone              & 140.28 & \emph{ 3.14} & \emph{ 3.27} \\
car                  &   6.71 & \emph{ 1.55} & \emph{ 1.47} \\
glass                &   2.03 & \emph{ 2.26} & \emph{ 3.80} \\
image                &  25.84 & \emph{ 0.90} & \emph{ 1.98} \\
lr spectrometer      & 489.67 & \emph{ 4.40} & \emph{ 6.93} \\
optical              & 275.69 & \emph{ 0.63} & \emph{ 1.34} \\
page-blocks          &  36.66 & \emph{ 1.43} & \emph{ 1.93} \\
sat                  & 186.89 & \emph{ 0.69} & \emph{ 1.25} \\
solar flares (c)     &   6.65 & \emph{ 6.03} & \emph{ 7.57} \\
solar flares (m)     &   3.98 & \emph{ 5.62} & \emph{ 7.49} \\
soybean              &  21.07 & \emph{ 6.29} & \emph{13.24} \\
thyroid (hyper)      &  19.71 & \emph{ 2.68} & \emph{ 3.46} \\
thyroid (hypo)       &  14.91 & \emph{ 2.39} & \emph{ 3.63} \\
thyroid (repl.)      &  15.35 & \emph{ 2.26} & \emph{ 3.33} \\
vehicle              &   7.66 & \emph{ 1.22} & \emph{ 2.10} \\
vowel                &  16.22 & \emph{ 0.87} & \emph{ 2.16} \\
yeast                &  16.90 & \emph{ 1.77} & \emph{ 3.12} \\
\hline                
average              &        & \emph{ 2.02} & \emph{ 3.19} \\
\hline\hline
%abalone              & 193.0  & \emph{ 4.51} & \emph{ 5.73} \\
%covertype            &  ---   &  ---        & ---         \\
%letter               & 1250.0 & \emph{ 0.51} & \emph{ 1.14} \\
%sat                  & 143.0  & \emph{ 0.85} & \emph{ 1.51} \\
%shuttle              & 277.0  & \emph{ 2.10} & \emph{ 3.16} \\
%vowel                &  14.0  & \emph{ 1.83} & \emph{ 4.75} \\
%\hline                
%\hline                
\end{tabular}
\end{center}
\end{small}
\end{table}

%The right half of 
Table~\ref{tab:efficiency} shows the run times in CPU secs.\ user time
(measured on a Sparc Ultra-2 under Sun Solaris) of \rcubed\ and its
performance ratios against \ripper\ in unordered and ordered mode. 
% For
% the datasets with separate test sets (shown at the bottom), we
% measured total run-time, i.e., training time plus classification time.
% We failed to measure the run-times for the \textsl{covertype} data
% set, where the situation was complicated by the large test set, which
% had to be split into several pieces.
%
% For the 17 cross-validated results, 
The reported times are average training times,\footnote{Note that the
  performance ratios for testing times would in general be
  considerably worse; in some cases, we observed increases in the
  factors of up to 50\%.
% The reason is that with testing, no time can be
%saved on the test examples because each example has to be tested on
%all $c(c-1)$ theories. Again, we could speed up the implementation
%considerably by keeping the test examples in memory instead of having
%\ripper\ re-read them from the disk for each of the $c(c-1)$ theories,
%but the principal problem remains. 
  We will briefly discuss the problem of classification efficiency
  (and some proposals for solving it) in Section~\ref{sec:issues}.}
i.e., the performance ratios can be interpreted as empirical estimates
of the class penalty ratios $\frac{\pi_r}{\pi_{u}}$ and
$\frac{\pi_r}{\pi_{o}}$.
%
On average, \rcubed\ is about two times slower than \ripper\ in
unordered mode, and about three times slower than \ripper\ in default,
ordered mode, while \ripper's ordered mode is about 1.5 times
faster than unordered mode (as opposed to the factor 2 we were
assuming in the theoretical analysis at the end of
Section~\ref{sec:theoretical}).
%
This means that despite the inefficient implementation,
the empirical values are fairly close to the estimates we made at the
end of the previous section: for a linear algorithm, we expected the
double round-robin procedure to be about twice as slow as the
unordered approach and about four times as slow than the ordered
approach. Apparently, the additional savings---based on the fact that
simpler concepts are learned for the pairwise tasks and that \ripper's
run-time is super-linear---make up for the losses due to the
sub-optimal implementation as a wrapper. For more expensive base
learning algorithms (like support vector machines), the analysis in
the previous section lets us expect bigger savings.

% \enlargethispage*{12pt}
Moreover, there are several cases where \rcubed\ is even faster than
\ripper\ in unordered mode and comes close to \ripper\ in ordered
mode. This is despite the fact that \rcubed\ is implemented as a
\texttt{perl}-script that communicates to \ripper\ by writing the
training and test sets of the new tasks to the disk, while unordered
and ordered binarization are native in \ripper's efficient
implementation in \texttt{C}. Clearly, a tight integration of round
robin binarization into \ripper's \texttt{C}-code would be more
efficient.\footnote{The effect is somewhat compensated by the fact
  that we only report user time (which ignores time for disk access
  and system time). For example, on the 26-class \textsl{letter}
  dataset, where \rcubed\ writes $26 \times 25 = 650$ files to the
  disk, its total run-time is about 15\% higher than the reported user
  time, while there is almost no difference for \ripper.}


\section{Round Robin Learning as an Ensemble Technique}
\label{sec:ensembles}

Ensemble techniques have received considerable attention within the
recent machine learning literature
\citep{ML-FourDirections,EnsembleMethods,PopularEnsembles,Ensembles-Comparison}.
The idea to obtain a diverse set of classifiers for a single learning
problem and to vote or average their predictions is both simple and
powerful, and the obtained accuracy gains often have a sound
theoretical foundation \citep{AdaBoost,Bagging}. Averaging the
predictions of these classifiers helps to reduce the variance and
often increase the reliability of the predictions. There are several
techniques for obtaining a diverse set of classifiers. The most common
technique is to use subsampling to diversify the training sets as in
bagging \citep{Bagging} and boosting \citep{AdaBoost}. Other
techniques include the use of different feature subsets
\citep{MultipleFeatureSets-kNN}, to exploit the randomness of the base
algorithms \citep{NN-RandomInitialization}, possibly by artificially
randomizing their behavior \citep{Randomization}, or to use multiple
representations of the domain objects, for example by using
information originating from different hyperlinks pointing to a web
page \citep{jf:IDA-99,jf:OEFAI-TR-2001-30}. Finally, classifier
diversity can be ensured by modifying the output labels, i.e., by
transforming the learning tasks into a collection of related learning
tasks that use the same input examples but a different assignments of
the class labels. Error-correcting output codes are the most prominent
example for this type of ensemble method
\citep{ErrorCorrectingOutputCodes}.

Clearly, round robin classification may also be interpreted as a
member of this last group, and its performance gain may be seen in
this context. Obviously, the final prediction is made by exploiting
the redundancy provided by multiple models, each of them being
constructed from a subset of the original data. However, contrary to
subsampling approaches like bagging and boosting, these datasets are
constructed deterministically.\footnote{Boosting is also deterministic
  if the base learner is able to use weighted examples. Often,
  however, the example weights are interpreted as probabilities which
  are used for drawing a random sample as the training set for the
  next boosting iteration.} In this respect pairwise classification
shares more similarities with error-correcting output codes, but
differs from it through the fixed procedure for setting up the new
binary problems and the fact that each of the new problems is smaller
than the original problem. In particular the latter fact may often
cause the sub-problems in pairwise classification to be conceptually
simpler than the original problem (as illustrated in
Figure~\ref{fig:roundrobin}).

%\begin{figure}[tbp]
%\begin{center}
%\resizebox{0.5\textwidth}{!}{
%\includegraphics{correlation.ps}
%}
%\caption[]{Error reductions ratios of boosting ($x$-axis) vs.\ round robin ($y$-axis).}
%\label{fig:corr}
%\end{center}
%\end{figure}

In previous work \citep{jf:ICML-01}, we observed that the improvements
in accuracy obtained by \rcubed\ over \ripper\ were quite similar to
those obtained by \cfiveboost\ over \cfive\ on the same problems.
Round robin binarization seemed to work whenever boosting worked, and
vice versa.  The correlation coefficient $r^2$ of the
% Figure~\ref{fig:corr} plots 
the error ratios of \cfiveboost/\cfive\ and \rcubed/\ripper\ on the 20
datasets was about 0.618, which is in the same range as correlation
coefficients for bagging and boosting \citep{PopularEnsembles}.  We
interpreted this as weak evidence that the performance gains of round
robin learning may be comparable to that of other ensemble methods and
that it could be used as a general method for improving a learner's
performance on multi-class problems. We will further investigate this
question in this section and will in particular focus upon a
comparison of round robin learning with boosting
(Section~\ref{sec:boosting}) and bagging (Section~\ref{sec:bagging}),
and upon the potential of combining it with these techniques.

% There, it is also shown that worse base
% learners produce worse ensembles, which may explain why \rcubed\ does
% not quite level the performance of \cfiveboost. 


%> (regression-model '(0.989 0.840 0.463 .639 .045 .937 .504 .787 .5 .912 .267 .834 1.041 1.206 .682 .929 .545 1.259 .919 .967) '(.888 .862 .498 .654 .375 1.004 .186 .743 .808 .865 .394 .816 .991 .921 .717 .749 .955 1.026 .957 .986))

%Least Squares Estimates:

%Constant                  0.304802      (9.258954E-2)
%Variable 0                0.609169      (0.112963)

%R Squared:                0.617678    
%Sigma hat:                0.150940    
%Number of cases:                20
%Degrees of freedom:             18

%#<Object: 1781d0, prototype = REGRESSION-MODEL-PROTO>
%> 

\begin{table}[t]
\begin{small}
\caption[]{\emph{Boosting:} A comparison between round robin
  binarization and boosting, both with \cfive\ as a base learner.  The
  first column shows the results of \cfive, while the next three
  column pairs show the results of round robin learning, boosting, and
  the combination of both, all with \cfive\ as a base learner.  For
  these, we give both the absolute error rate and the performance
  ratio relative to the base learner \cfive.  The last line shows the
  geometric average of these ratios.  }
\label{tab:c50}
\begin{center}
\begin{tabular}{||l|r|rr|rr|rr||}
\hline\hline
 dataset              & \cfive & \multicolumn{2}{c|}{round robin} &
 \multicolumn{2}{c|}{boosting} & \multicolumn{2}{c||}{both}  \\
\hline
abalone              & 78.48 & 75.08 & \emph{0.957} & 77.88 & \emph{0.992} & 74.67 & \emph{0.951} \\
car                  &  7.58 &  5.84 & \emph{0.771} &  3.82 & \emph{0.504} &  1.85 & \emph{0.244} \\
glass                & 35.05 & 24.77 & \emph{0.707} & 27.57 & \emph{0.787} & 22.90 & \emph{0.653} \\
image                &  3.20 &  2.90 & \emph{0.905} &  1.60 & \emph{0.500} &  1.73 & \emph{0.541} \\
lr spectrometer      & 51.22 & 51.79 & \emph{1.011} & 46.70 & \emph{0.912} & 51.98 & \emph{1.015} \\
optical              &  9.20 &  5.04 & \emph{0.547} &  2.46 & \emph{0.267} &  2.54 & \emph{0.277} \\
page-blocks          &  3.09 &  2.98 & \emph{0.964} &  2.58 & \emph{0.834} &  2.78 & \emph{0.899} \\
sat                  & 13.82 & 13.16 & \emph{0.953} &  9.32 & \emph{0.675} &  9.00 & \emph{0.651} \\
solar flares (c)     & 15.77 & 15.69 & \emph{0.995} & 16.41 & \emph{1.041} & 16.70 & \emph{1.059} \\
solar flares (m)     &  4.90 &  4.90 & \emph{1.000} &  5.90 & \emph{1.206} &  5.83 & \emph{1.191} \\
soybean              &  9.66 &  6.73 & \emph{0.697} &  6.59 & \emph{0.682} &  6.44 & \emph{0.667} \\
thyroid (hyper)      &  1.11 &  1.14 & \emph{1.024} &  1.03 & \emph{0.929} &  1.33 & \emph{1.190} \\
thyroid (hypo)       &  0.58 &  0.69 & \emph{1.182} &  0.32 & \emph{0.545} &  0.53 & \emph{0.909} \\
thyroid (repl.)      &  0.72 &  0.74 & \emph{1.037} &  0.90 & \emph{1.259} &  0.90 & \emph{1.259} \\
vehicle              & 26.24 & 29.20 & \emph{1.113} & 24.11 & \emph{0.919} & 23.17 & \emph{0.883} \\
vowel                & 21.72 & 19.49 & \emph{0.898} &  8.89 & \emph{0.409} & 14.75 & \emph{0.679} \\
yeast                & 43.26 & 40.63 & \emph{0.939} & 41.85 & \emph{0.967} & 40.77 & \emph{0.942} \\
\hline                                                                     
average              &       &       & \emph{0.909} &       & \emph{0.735} &       & \emph{0.757} \\
\hline\hline
\end{tabular}
\end{center}
\end{small}

%(regression-model
%'(0.957 0.771 0.707 0.905 1.011 0.547 0.964 0.953 0.995 1.000 0.697
%1.024 1.182 1.037 1.113 0.898 0.939)
%'(0.992 0.504 0.787 0.500 0.912 0.267 0.834 0.675 1.041 1.206 0.682
%0.929 0.545 1.259 0.919 0.409 0.967))

%Least Squares Estimates:

%Constant                  -5.524668E-2  (0.358255)
%Variable 0                0.915108      (0.382518)

%R Squared:                0.276174    
%Sigma hat:                0.245642    
%Number of cases:                17
%Degrees of freedom:             15

%#<Object: 815d6d8, prototype = REGRESSION-MODEL-PROTO>
%> 

\end{table}


\subsection{Boosting}
\label{sec:boosting}

As a first step, we performed a direct comparison of the performance
of \cfive\ and \cfiveboost\ (\cfive\ called with the parameter
\texttt{-b}, i.e., 10 iterations of boosting) to \cfiverr, a single
round robin procedure with \cfive\ as the base learning algorithm.
Table~\ref{tab:c50} shows the results of a 10-fold cross-validation on
17 datasets.

The first thing to note is that the performance of \cfive\ does indeed
improve by about 10\% on average if round robin binarization is used
as a pre-processing step for multi-class problems. This is despite the
fact that \cfive\ can handle multi-class problems and does not depend
on a class binarization routine.
%
However, the gain is not as consistent and not as big as the gain for
\ripper\ (Table~\ref{tab:results}), possibly because \ripper's average
error on the multi-class problems in our study is in general above
that of \cfive\ (by a factor of 1.122), and therefore allows for
larger improvements. A possible explanation for this is that the
unordered and ordered binarization schemes used by \ripper\ are not
very good. This is confirmed by the fact that in a direct comparison
(which can easily be computed from Tables~\ref{tab:results}
and~\ref{tab:c50}), \rcubed\ decreases the average error of \cfive\ by
a factor of 0.838, and the error of \cfiverr\ by a factor of 0.923.
From this, we can conclude that robin binarization helps \ripper\ to
outperform \cfive\ on multi-class problems.

% \enlargethispage*{12pt}

The direct comparison between round robin classification and boosting
shows that the improvement of \cfiverr\ over \cfive\ is not as large
as the improvement provided by boosting: although there are a few
cases where round robin outperforms boosting, \cfiveboost\ is much
more reliable than \cfiverr, producing an average error reduction of
more than 26\% on these 17 datasets. The correlation between the error
reduction rates of \cfiveboost\ and \cfiverr\ is very weak ($r^2 =
0.276$), which refutes our earlier hypothesis and brings up the
question whether there is a fruitful combination of boosting and round
robin classification. The last column of Table~\ref{tab:c50} answers
this question negatively: the results of using round robin
classification with \cfiveboost\ as a base learner does---on
average---not lead to performance improvements over boosting.

These results are analogous to the results of \citet{AdaBoost.OC} who
compared \learner{AdaBoost.OC} (error-correcting output codes as a
binarization scheme for conventional two-class \learner{AdaBoost}) with
\learner{AdaBoost.M1} \citep{AdaBoost}, \learner{AdaBoost}'s
straightforward adaptation for multi-class base learners (a version
of which is presumably also implemented in \cfive;
\citealt{BaggingBoostingC4.5}), and found no significant differences for
the base learner \learner{C4.5} \citep{C4.5}, \cfive's predecessor.
Similar to our comparison between \cfiveboost\ and round robin
binarization, \citet{AdaBoost.OC} also found that boosting
outperformed binarization via error-correcting output codes. In
subsequent work, \citet{ReducingMulticlass} showed that the
performance gain of pairwise classification using \learner{AdaBoost}
as a base learner is on average indiscernible from the performance
gain of alternative binarization schemes, including some employing
error-correcting output codes (such as \learner{AdaBoost.OC}).

%The results of this section should not be over-interpreted, as we
%only have data points from two base algorithms and neither optimized
%parameters for boosting (e.g., the number of iterations) nor for round
%robin classification (e.g., the technique for decoding the
%predictions of the individual classifiers can certainly be improved,
%cf.~Section~\ref{sec:issues}). However, it seems to be clear that the
%multi-class learner \cfive\ does not profit as much from round robin
%binarization as the two-class learner \ripper. It would certainly 
%be useful to repeat these experiments with other multi-class and
%binary learners, in particular with different learning biases. Also, a 
%direct comparison between \rcubed\ and \slipper\  \citep{Slipper}, a
%rule learner which uses a variation of boosting to induce a redundant
%set of rules, would be interesting.

% \enlargethispage*{12pt}

\subsection{Bagging}
\label{sec:bagging}

So far we were only concerned with single and double round robins. A
natural extension to this procedure is to consider cases where more
than two classifiers are trained for each binary problem. For
algorithms with random components (such as \ripper's internal split of
the training examples or the random initialization of back-propagation
neural networks) this could simply be performed by running the
algorithm on the same dataset with different random seeds. For other
algorithms there are two options: randomness could be injected into
the algorithm's behavior \citep{Randomization} or random subsets of
the available data could be used for training the algorithm. The
latter procedure is more or less equivalent to bagging
\citep{Bagging}. We will evaluate this option in this section.

\begin{table}[t]
\begin{small}
\caption[]{\emph{Bagging:} A comparison of round robin learning versus
  bagging and of the combination of both using \ripper\ as the base
  classifier.  At the bottom, the average error ratios of the ensemble
  techniques over the respective base learner are shown for the base
  learners \ripper, \cfive, and \cfiveboost\ (we omitted the detailed
  results for the latter two). Note that the average performance
  ratios are relative to the base learner (i.e., they are only
  comparable within a line not between lines).}
\label{tab:bagging}
\begin{center}
\begin{tabular}{||l|r|rr|rr|rr||}
\hline\hline
\ripper\              & base & \multicolumn{2}{c|}{round robin} &
\multicolumn{2}{c|}{bagging} & \multicolumn{2}{c||}{both}  \\\hline
abalone              & 81.18  & 74.34 & \emph{0.916} & 78.36 & \emph{0.965}& 73.14 & \emph{0.901} \\
car                  & 12.15  &  2.26 & \emph{0.186} &  9.38 & \emph{0.771}&  1.79 & \emph{0.148} \\
glass                & 34.58  & 25.70 & \emph{0.743} & 29.44 & \emph{0.851}& 25.70 & \emph{0.743} \\
image                &  4.29  &  3.46 & \emph{0.808} &  2.51 & \emph{0.586}&  2.99 & \emph{0.697} \\
lr spectrometer      & 61.39  & 53.11 & \emph{0.865} & 57.82 & \emph{0.942}& 52.92 & \emph{0.862} \\
optical              &  9.48  &  3.74 & \emph{0.394} &  2.86 & \emph{0.302}&  2.81 & \emph{0.296} \\
page-blocks          &  3.38  &  2.76 & \emph{0.816} &  2.65 & \emph{0.784}&  2.54 & \emph{0.751} \\
sat                  & 13.04  & 10.35 & \emph{0.794} & 10.18 & \emph{0.781}&  8.92 & \emph{0.684} \\
solar flares (c)     & 15.91  & 15.77 & \emph{0.991} & 15.91 & \emph{1.000}& 15.69 & \emph{0.986} \\
solar flares (m)     &  5.47  &  5.04 & \emph{0.921} &  5.26 & \emph{0.961}&  5.18 & \emph{0.947} \\
soybean              &  8.79  &  6.30 & \emph{0.717} &  8.20 & \emph{0.933}&  6.00 & \emph{0.683} \\
thyroid (hyper)      &  1.49  &  1.11 & \emph{0.749} &  1.41 & \emph{0.945}&  1.09 & \emph{0.731} \\
thyroid (hypo)       &  0.56  &  0.53 & \emph{0.955} &  0.58 & \emph{1.050}&  0.42 & \emph{0.764} \\
thyroid (repl.)      &  0.98  &  1.01 & \emph{1.026} &  0.98 & \emph{0.999}&  0.85 & \emph{0.864} \\
vehicle              & 30.38  & 29.08 & \emph{0.957} & 26.12 & \emph{0.860}& 26.83 & \emph{0.883} \\
vowel                & 27.07  & 18.69 & \emph{0.690} & 16.26 & \emph{0.601}& 18.79 & \emph{0.694} \\
yeast                & 42.39  & 41.78 & \emph{0.986} & 40.63 & \emph{0.959}& 39.89 & \emph{0.941} \\
\hline                                                                     
average (\ripper)    &        &       & \emph{0.747} &       & \emph{0.811}&       & \emph{0.685} \\
\hline\hline                                                               
average (\cfive)     &        &       & \emph{0.909} &       & \emph{0.864}&       & \emph{0.838} \\
\hline\hline                                                               
average (\cfiveboost)&        &       & \emph{1.029} &       & \emph{0.977}&       & \emph{1.019} \\
\hline\hline
\end{tabular}
\end{center}
\end{small}
\end{table}

%\begin{table}
%\begin{small}
%\begin{center}
%\begin{tabular}{||l|r|rr|rr|rr||}
%\hline\hline
%\cfive               & base & \multicolumn{2}{c|}{bagging} &
% \multicolumn{2}{c|}{round robin} & \multicolumn{2}{c|}{both}  \\\hline
%abalone              & 78.48 & 76.87 & \emph{0.980} & 75.08 & \emph{0.957} & 73.45 & \emph{0.936} \\
%car                  &  7.58 &  6.89 & \emph{0.908} &  5.84 & \emph{0.771} &  5.38 & \emph{0.710} \\
%glass                & 35.05 & 26.17 & \emph{0.747} & 24.77 & \emph{0.707} & 25.23 & \emph{0.720} \\
%image                &  3.20 &  2.90 & \emph{0.905} &  2.90 & \emph{0.905} &  2.90 & \emph{0.905} \\
%lr spectrometer      & 51.22 & 48.78 & \emph{0.952} & 51.79 & \emph{1.011} & 52.54 & \emph{1.026} \\
%optical              &  9.20 &  4.57 & \emph{0.497} &  5.04 & \emph{0.547} &  3.15 & \emph{0.342} \\
%page-blocks          &  3.09 &  2.63 & \emph{0.852} &  2.98 & \emph{0.964} &  2.58 & \emph{0.834} \\
%sat                  & 13.82 &  9.88 & \emph{0.715} & 13.16 & \emph{0.953} &  9.43 & \emph{0.683} \\
%solar flares (c)     & 15.77 & 15.69 & \emph{0.995} & 15.69 & \emph{0.995} & 15.69 & \emph{0.995} \\
%solar flares (m)     &  4.90 &  4.90 & \emph{1.000} &  4.90 & \emph{1.000} &  4.90 & \emph{1.000} \\
%soybean              &  9.66 &  8.20 & \emph{0.848} &  6.73 & \emph{0.697} &  7.61 & \emph{0.788} \\
%thyroid (hyper)      &  1.11 &  1.19 & \emph{1.071} &  1.14 & \emph{1.024} &  1.30 & \emph{1.167} \\
%thyroid (hypo)       &  0.58 &  0.50 & \emph{0.864} &  0.69 & \emph{1.182} &  0.50 & \emph{0.864} \\
%thyroid (repl.)      &  0.72 &  0.90 & \emph{1.259} &  0.74 & \emph{1.037} &  0.77 & \emph{1.074} \\
%vehicle              & 26.24 & 25.41 & \emph{0.968} & 29.20 & \emph{1.113} & 25.30 & \emph{0.964} \\
%vowel                & 21.72 & 11.31 & \emph{0.521} & 19.49 & \emph{0.898} & 17.17 & \emph{0.791} \\
%yeast                & 43.26 & 41.85 & \emph{0.967} & 40.63 & \emph{0.939} & 38.61 & \emph{0.893} \\
%\hline
% average              &       &       & \emph{0.864} &       & \emph{0.909} &       & \emph{0.838} \\
%\hline\hline
%\cfiveboost              & base & \multicolumn{2}{c|}{bagging} &
% \multicolumn{2}{c|}{round robin} & \multicolumn{2}{c|}{both}  \\\hline
%abalone              & 77.88 & 75.51 & \emph{0.970} & 74.67 & \emph{0.959} & 73.76 & \emph{0.947} \\
%car                  &  3.82 &  5.50 & \emph{1.439} &  1.85 & \emph{0.485} &  2.78 & \emph{0.727} \\
%glass                & 27.57 & 21.03 & \emph{0.763} & 22.90 & \emph{0.831} & 23.83 & \emph{0.864} \\
%image                &  1.60 &  2.21 & \emph{1.378} &  1.73 & \emph{1.081} &  2.34 & \emph{1.459} \\
%lr spectrometer      & 46.70 & 42.94 & \emph{0.919} & 51.98 & \emph{1.113} & 51.22 & \emph{1.097} \\
%optical              &  2.46 &  1.98 & \emph{0.804} &  2.54 & \emph{1.036} &  2.17 & \emph{0.884} \\
%page-blocks          &  2.58 &  2.41 & \emph{0.936} &  2.78 & \emph{1.078} &  2.54 & \emph{0.986} \\
%sat                  &  9.32 &  8.72 & \emph{0.935} &  9.00 & \emph{0.965} &  8.58 & \emph{0.920} \\
%solar flares (c)     & 16.41 & 16.70 & \emph{1.018} & 16.70 & \emph{1.018} & 16.41 & \emph{1.000} \\
%solar flares (m)     &  5.90 &  5.54 & \emph{0.939} &  5.83 & \emph{0.988} &  5.33 & \emph{0.902} \\
%soybean              &  6.59 &  5.42 & \emph{0.822} &  6.44 & \emph{0.978} &  6.44 & \emph{0.978} \\
%thyroid (hyper)      &  1.03 &  1.22 & \emph{1.179} &  1.33 & \emph{1.282} &  1.19 & \emph{1.154} \\
%thyroid (hypo)       &  0.32 &  0.34 & \emph{1.083} &  0.53 & \emph{1.667} &  0.42 & \emph{1.333} \\
%thyroid (repl.)      &  0.90 &  0.87 & \emph{0.971} &  0.90 & \emph{1.000} &  0.93 & \emph{1.029} \\
%vehicle              & 24.11 & 24.23 & \emph{1.005} & 23.17 & \emph{0.961} & 25.30 & \emph{1.049} \\
%vowel                &  8.89 &  6.97 & \emph{0.784} & 14.75 & \emph{1.659} & 11.82 & \emph{1.330} \\
%yeast                & 41.85 & 38.68 & \emph{0.924} & 40.77 & \emph{0.974} & 38.95 & \emph{0.931} \\
%\hline
% average              &       &       & \emph{0.977} &       & \emph{1.029} &       & \emph{1.019} \\
%\hline\hline

%\end{tabular}
%\end{center}
%\end{small}
%\end{table}

Table~\ref{tab:bagging} shows the results of a comparison of round
robin learning, bagging, and a combination of both. Bagging was
implemented by drawing 10 samples with replacement from the available
data. Ties were broken in the same way as for the round robin
binarization, i.e., by simple voting using the \emph{a priori} class
probability as a tie breaker. Similarly, bagging was integrated with
round robin binarization by drawing 10 independent samples of each
pairwise classification problem. Thus we obtained a total of
$10c(c-1)$ predictions for each $c$-class problem, which again were
simply voted. The number of 10 iterations was chosen arbitrarily (to
conform to \cfiveboost's default number of iterations) and is
certainly not optimal (in both cases).

The results show clearly that the performance of the simple round
robin (second column) can be improved considerably by integrating it
with bagging (last column). The bagged round robin procedure reduces
\ripper's error on the datasets to about 68.5\% of the original error
(third line from the bottom). For comparison, we also show the results
of bagging only, which seems to give the least improvement. The
results of bagging are not included to compare it to the round robin,
but to show that the reduction in error rate for the bagged round
robin robin outperforms both its constituents.

We also repeated these experiments using \cfive\ and \cfiveboost\ as
the base learners. We only show the average performance for these
cases. Again, the advantage of the use of round robin learning is less
pronounced for the multi-class learner \cfive\ (it is even below the
improvement given by our simple bagging procedure), and the
combination of \cfiveboost\ and round robin learning does not produce
an additional gain. It is worth mentioning that the combination of
boosting and bagging outperforms boosting, which confirms previous
good results with such algorithms
\citep{BaggedBoosting,BaggedBoosting-2}.

% \enlargethispage*{12pt}

In order to compare the absolute performances of the algorithms we can
normalize all relative results on the performance of one algorithm
(say \ripper). \cfive's performance was better than \ripper's by a
factor of about 0.891. Multiplying this with the improvement of 0.735
of boosting (Table~\ref{tab:c50}) and of an additional 0.977 for
adding bagging (Table~\ref{tab:bagging}) yields that bagged
\cfiveboost\ has about 64\% of the error rate of basic \ripper, which
makes it the best performing combination. In comparison, the
combination of round robin and bagging for \ripper\ (68.5\%) is
relatively close behind, in particular if we consider the bad
performance of \ripper\ in comparison to \cfive. An evaluation of a
boosting variant of \ripper\ \citep[such as \slipper;][]{Slipper}
would be of interest.


\section{Other Properties and Open Questions}
\label{sec:issues}

In the following, we briefly discuss further important aspects of
round robin binarization.

\paragraph{Decoding:}

We have mostly ignored the issue of \emph{decoding} techniques for
combining the predictions of the pairwise classifiers into a final
prediction for the multi-class problem. The technique we used in this
paper (simple voting using the \emph{a priori} probability of the
class as a tie breaker), is quite likely to be suboptimal. First, one
could improve the tie breaking by exploring techniques that are
commonly used for breaking ties in tournament cross tables in games
and sports (such as the Sonneborn-Berger ranking in chess
tournaments). A further step ahead would be to weight each vote with a
confidence estimate provided by the base classifier, or to allow a
classifier only to vote for a class if it has a certain minimum
confidence in its prediction. Several studies in various contexts have
compared different voting techniques for combining the predictions of
the individual classifiers of an ensemble
\citep[e.g.,][]{PolychotomyDecomposition,ReducingMulticlass,jf:OEFAI-TR-2001-30}.
Although the final word on this issue remains to be spoken, it seems
to be the case that techniques that include confidence estimates 
% of the ensemble predictors 
into the computation of the final predictions
are preferable \cite[cf. also][]{AdaBoost-Confidences}.
% This points at a straightforward way how our work
% could be improved.
%
Along similar lines, there have been several proposals for combining
the class probability estimates of the pairwise classifiers into class
probability distributions for the multi-class problems
\citep{PairwiseNN-Probabilistic,PairwiseCoupling}.  More elaborate
proposals suggest learning separate classifiers for deciding whether a
given example belongs to one of the two classes used to train a
certain member of the pairwise ensemble
\citep{PairwiseClassification-Correcting}, or organizing the
classifiers into an efficient graph structure that can derive a
prediction in at most $c-1$ steps \citep{PairwiseDAGs}.


\enlargethispage*{12pt}

\paragraph{Classification Efficiency:}

Our efficiency analysis is only concerned with training time. At
testing time, one has to test a quadratic number of classifiers in
order to make the final prediction. Even though the constituent
classifiers are quite likely to be simpler (which often means that
they can make faster predictions) it can be expected that
classification takes considerably longer for a round robin ensemble
than for unordered binarization. This situation is particularly bad
for lazy learners, which defer most of their training effort to the
classification phase.

Next to the straightforward solution of testing all theories in
parallel (see below), a solution for this problem could be found in
the proposal of \citet{PairwiseDAGs} who suggest organizing pairwise
classifiers in a decision graph where each node represents a binary
classifier. They show that this structure allows obtainment of a
prediction for a \mbox{$c$-class} problem by consulting only $c-1$
pairwise classifiers without loss of accuracy on three benchmarks
problems. In some sense, this technique may be viewed as the
``ordered'' version of round robin classification.

Finally, 
% to cope with such situations, 
we could once more take a look at sports and game tournaments, where
elaborate pairing schemes allow determination of a reliable ranking in
cases where a round robin tournament is infeasible due to the high
number of players. The frequently used knock-out tournament (where
players are paired randomly and only the winner advances into the next
round) is probably too brittle. An interesting alternative might be
provided by swiss system tournaments, which are frequently used in
competitive chess. In this type of tournament, all players play a
fixed number of rounds, typically of the order $\log(c)$. The trick is
that in each round players of about equal strength (i.e., players that
are ranked close to each other in the current standings of the
tournament) are paired against each other. Such schemes could improve
classification time for problems with very high numbers of classes, in
particular if classification is very expensive (e.g., in the case of
lazy learners).


\paragraph{Comprehensibility:}

While boosting seems to provide larger gains in accuracy, the price to
pay is that the learned ensemble of classifiers is no longer easy to
comprehend.
%\footnote{An exception might be a system like
%  \slippersmall\ \citep{Slipper} which tightly integrates boosting
%  into the rule learning algorithm with the effect that only a single
%  set of rules is learned and the redundancy among these rules is
%  exploited to obtain higher accuracies. A direct comparison between
%  \slippersmall\ and \rcubed\ would be very interesting, not only for
%  this reason.} 
While round robin rule learning also learns an ensemble of
classifiers, we think that it has the advantage that each element of
the ensemble has a well-defined semantics (separating two classes from
each other). In fact, \citet[p.16]{DataPreparation} proposes a very similar
technique called \emph{pairwise ranking} in order to facilitate human
decision-making in ranking problems. He claims that it is easier for a
human to determine an order between $n$ items if one makes pairwise
comparisons between the individual items and then adds up the wins for
each item, instead of trying to order the items right
away.\footnote{The aspect of being able to rank the available
  classifications for each example (as an intermediate version between
  predicting only a class value and providing a full probability
  distribution) is another interesting aspect of round robin
  binarization, which might be worth exploring in more depth.}

%Furthermore, for explanatory purposes, it might often be enough to
%explain why the system prefers this class over some other class that
%the human user would have considered to be more likely (or why this
%class is preferred of each other class). Both unordered binarization
%or boosting could only answer such questions by giving class
%probability estimates, which are hard to interpret. In many cases,
%round robin classification could easily answer this 
%kind of question by showing 
%Problems may arise in cases where the prediction of the relevant base 
%classifier does not correspond to the final order. 


\paragraph{Parallel Implementations:}

It should be noted that---contrary to boosting, where the individual
runs depend on each other and have to be performed in
succession---pairwise classification can be easily parallelized by
assigning the binary classification problems to different processors,
as already noted by \citet{PolychotomousClassification-TR} and
\citet{PairwiseNN}. As each binary task will be smaller than the
original task, the total training time of a multi-class problem of
size $n$ will be significantly below that of a binary problem of the
same size, if each binary classifier can be trained on a separate
processor. Naturally, a parallel implementation would also provide a
trivial solution to the problem with classification efficiency
discussed above.

\paragraph{Memory Requirements:}

It is also clear that each individual binary task in a round-robin
binarization has fewer training examples than the original tasks. For
multi-class tasks that are too large to be performed in memory,
pairwise classification may provide a simple means to reduce the size
of the learning task without resorting to subsampling. Note that this
is not the case for unordered class binarization or error-correcting
output codes. 

\paragraph{Imbalanced Class Distributions:}

It would also be interesting to investigate the effect of round robin
binarization on minority classes, in particular in problems where
several large classes appear next to a few small classes. We think
that the fact that separate classifiers are trained to discriminate
the small classes from each other (and not only from all remaining
examples as would be the case for unordered binarization or for
treating the multi-class problem as a whole) may help to improve the
focus in the case of imbalanced class distributions. On the other
hand, if the base learner tends to prefer large classes, one dominant
large class will tend to win against all minority classes and will be
more frequently predicted. The evidence from Table~\ref{sec:accuracy}
seems to confirm this: it is primarily sets with skewed class
distributions where round robin classification does not perform well
(consider, e.g., the three thyroid datasets). However, this evidence
is certainly not conclusive and we believe that a closer investigation
of this issue is a rewarding topic for future research.


\section{Related Work}
\label{sec:related}

The idea of pairwise classification has been known in the neural
networks and statistics communities for some time. For example,
\citet[p.113]{Weka} refer to it as a technique for making linear
regression applicable to multi-class problems but do not cite a
source. The earliest reference we could find is \citet{StepwiseNN} who
propose a stepwise procedure for linearizing non-linear multi-class
problems by first trying to identify classes that can be solved by a
one-against-all approach, then a pairwise approach, and finally a
piece-wise linear technique. The motivation behind this and other
works in the neural networks community is that it is often better to
have a modular network, i.e., a network that consists of several
simpler and independently trained sub-networks, rather than a single,
complex multi-layer neural network, which usually requires a large
hidden layer and significant training times.  Unordered
\citep{ModularNN} and pairwise \citep{PairwiseNN} binarization
techniques have also been investigated in this context.

\citet{PolychotomousClassification-TR} evaluates pairwise
classification on two versions of \cart\ \citep{Cart} and a nearest
neighbor algorithm on 50 randomly generated problems. He observed
improvements for the \cart\ version which uses a linear function for
splitting a node and for the nearest neighbor rule. For \cart\ with
axis parallel splits, the performance of the pairwise class
binarization was similar to that of the standard techniques. The
author also provided a brief discussion of the complexity of the
approach.

Naturally, pairwise classification was also investigated within the
support vector machine community. A comparison of unordered and round
robin binarization on a speaker identification problem can be found in
\citep{SVM-SpeakerIdentification-2,SVM-SpeakerIdentification}, without
providing a clear conclusion about the preferable approach.
\citet{PairwiseClassification} discusses the technique in some detail
and presents empirical results on a digit recognition task, where the
author notes the unexpected efficiency of the approach without
providing an explanation for it. Recently,
\citet{MultiClass-SVM-Comparison} conducted an excellent empirical
comparison between unordered binarization, pairwise classification
with both the simple voting technique we used and with the more
efficient proposal for organizing the classifiers into an efficient
DAG \citep{PairwiseDAGs}, as well as two approaches for directly
generalizing the support vector algorithm to multi-class problems. In
their experiments, round robin binarization performed best, both in
terms of accuracy and training time. Interestingly, the advantage over
competing methods was more pronounced for a linear kernel than for a
non-linear RBF kernel. We interpret this as evidence that round robin
binarization simplifies the individual binary decision problems as
motivated in Figure~\ref{fig:roundrobin}.

\citet{PairwiseSVM-ThreeWay} suggest a related
technique where multi-class problems are mapped to 3-class problems.
Like with pairwise classification, the idea is to generate one
training set for each pair of classes, but in addition to encoding the
two class values with target values $+1$ and $-1$, examples of all
other classes are added with a target value of $0$, which
gives up some of the advantages that result from the reduction of the
training set sizes on the binary problems. They do not evaluate their
approach. 
%
A similar idea was used by \citet{NOEMON} for the meta-learning task
of predicting the most suitable learning algorithm(s) for a given
dataset. A nearest-neighbor learner was trained to predict the better
algorithm for each pair of learning algorithms, where each of these
pairwise problems had three classes: one for each algorithm and a
third class ``tie'' if both algorithms performed indistinguishably.

Error-correcting output codes \citep{ErrorCorrectingOutputCodes} are a
popular and powerful class binarization technique. The basic idea is
to encode a $c$-class problem as $\bar{c}$ binary problems ($\bar{c} >
c$), where each binary problem uses a \emph{subset} of the classes as
the positive class and the remaining classes as a negative class. It
may thus be viewed as a generalization of unordered binarization,
where only single classes are used as positive examples. As a
consequence, each original class is encoded as a $\bar{c}$-dimensional
binary vector, one dimension for each prediction of a binary problem
(by convention $+1$ for positive and $-1$ for negative). The resulting
matrix of the form $\{-1,+1\}^{c\times \bar{c}}$ is called the
\emph{coding matrix}. New examples are classified by determining the
row in the matrix that is closest to the binary vector obtained by
submitting the example to the $\bar{c}$ classifiers. If the binary
problems are chosen in a way that maximizes the distance between the
class vectors, the reliability of the classification can be
significantly increased. Error-correcting output codes can also be
easily parallelized, but each subtask requires the total training set.
As typically $\bar{c} > c$, the penalty function $\pi_{ecoc} > c$,
i.e., pairwise and unordered binarization are more efficient.

\citet{ReducingMulticlass} show that a generalization of
error-correcting output codes can be used as a general framework for
class binarization techniques. Their basic idea is to generalize the
coding matrices in a way that allows examples to be ignored in the
binary problems, i.e., to allow the coding matrices to be of the form
$\{-1,0,+1\}^{c\times \bar{c}}$. Clearly, pairwise classification is a
special case in this framework. For example, the coding matrix for a
double round robin has $\bar{c} = c(c-1)$ columns, where the column
corresponding to the binary problem $<\!\!i,j\!\!>$ has $+1$ in the
$i$-th row, $-1$ in the $j$-th row and $0$ in all other rows. Thus,
each row is a vector of the form $\{-1,0,+1\}^{\bar{c}}$. Note,
however, that the vector of predictions is of the form
$\{-1,+1\}^{\bar{c}}$ because every binary classifier will make a
prediction (either $+1$ or $-1$) for every example. Nevertheless, the
simple voting procedure we used is equivalent to finding the row that
is most similar to the prediction vector (if similarity is measured
with the Hamming distance), which is equivalent to the decoding
technique suggested by \citet{ErrorCorrectingOutputCodes}.
\citet{ReducingMulticlass} criticize this simple Hamming decoding and
suggest the use of loss-based decoding techniques that take into
account the confidence of the base learner into its own predictions.
In a theoretical analysis, which relates the training error of
decoding methods to the error on the binary problems and to the
minimum distance between entries in the coding matrix, the authors
derive upper bounds for the training error of loss-based decoding that
are lower than those for Hamming decoding. In an experimental study
with five different class binarization techniques and three decoding
techniques (two of them loss-based and one Hamming decoding), the
loss-based techniques seemed to produce lower generalization errors.
Their results also showed that for support vector machines unordered
binarization is inferior to all other techniques, among them pairwise
classification. Among these alternatives, no clear winner emerged.
However, for boosted decision trees, unordered binarization performed
on the same level as all other approaches.

%They
%also observed that a decoding technique that took the confidence
%scores of the binary classifiers into account (and not only their
%predictions) performed better than one that didn't. More related work
%on decoding is discussed in Section~\ref{sec:issues}.


% \citet{PairwiseCoupling} introduced \emph{pairwise coupling}, a
% tie-breaking technique which combines the class probability estimates
% from the binary classifiers into a joint probability distribution for
% the multi-class problem. Earlier work by
% \citep{PairwiseNN-Probabilistic} investigated the same problem within a
% neural networks context. \citet{PairwiseDAGs} suggested to organize
% the pairwise classifiers into a decision graph. Such a structure could
% not only be used to break ties, but also to speed up the
% classification time, because they show that at most $c-1$ pairwise
% classifiers have to be consulted to obtain a prediction.

Finally, we note the relation of round robin classification to
comparison training \citep{lig*Tesauro89b,lig*Utgoff91}, which has
been proposed as a training procedure in evaluation function learning.
In this framework, the learner is not trained with the target values
of the evaluation function in certain states, but instead is trained
on state pairs where the preferable state is marked. Thus, this
training procedure is somewhere between supervised learning (where the
function is trained on examples of target values) and reinforcement
learning (where it only receives indirect feedback about the value of
states). \citet{lig*Tesauro89b} demonstrated a particularly
interesting technique, where he enforced a symmetric neural network
architecture and showed that with this architecture, one only has to
perform $n$ network evaluations to determine the best of $n$ moves. It
is an interesting open question, whether a similar technique could be
employed for speeding up pairwise classification.


\section{Conclusions}

This paper has investigated the use of round robin binarization (or
pairwise classification) as a technique for handling multi-class
problems with separate-and-conquer rule learning algorithms (aka
covering algorithms). Our experimental results show that, in
comparison to conventional ordered or unordered binarization, the
round robin approach may yield significant gains in accuracy without
risking a bad performance. In particular, round robin binarization
helps \ripper\ to outperform \cfive\ on multi-class problems, whereas
\cfive\ outperforms the original version of \ripper\ on the same
problems. We think that the reason for this improvement lies on the
one hand in the exploitation of diverse predictions in an ensemble of
classifiers and, on the other hand, in the fact that the resulting
binary problems may be considerably simpler and can thus be learned
more reliably (Figure~\ref{fig:roundrobin}).

Moreover, we demonstrated both empirically and theoretically that the
resulting qua\-dratic growth in the number of learning problems is
compensated by the reduction in size for each of the individual
problems. For base algorithms with linear or super-linear run-time,
round robin binarization is faster than the conventional unordered
technique. Furthermore, we have proven that this advantage increases
with the complexity of the base algorithm. Note that these theoretical
results are not restricted to rule learning, but are also applicable
to perceptrons, support vector machines, boosting, and other learning
algorithms that need binarization techniques for solving multi-class
problems. Our experimental results confirm the efficiency of round
robin binarization for rule learning, but for a true evaluation of the
performance of this technique, an efficient, tight integration into a
separate-and-conquer rule learning algorithm would be needed.

Finally, we investigated the properties of round robin learning as a
general ensemble technique, in particular in comparison to bagging and
boosting. For \cfive\ and even less so for \cfiveboost, round robin
classification did not seem to work as well as it did for \ripper,
which is consistent with previous similar results on error-correcting
output codes. Similarly, a straightforward integration of pairwise
classification with bagging outperformed both constituent techniques,
when using \ripper\ (and to some extent when using \cfive) as a base
learner, but not in combination with boosting. It remains to be seen
whether round robin learning does not work well for boosting in
general, or whether this is a specific phenomenon for \cfive\ and its
boosting procedure. In any case, we can conclude that round robin
classification provides a more efficient and more accurate alternative
to the class binarization procedures that are currently in use in
inductive rule learning.

% The main disadvantage of the approach, of course, is its dependency on
% the number of classes. More classes result in a bigger ensemble which
% should produce better predictions. In particular in the limiting case,
% where only two or three classes are available, the benefits should be
% rather small. This trade-off also needs to be investigated in more detail.


\acks{I wish to thank William Cohen, Eibe Frank, 
% Robert Israel, % Hilfe bei der Abschaetzung von (a+b+c)^n
% Rayid Ghani,  % EOC with < c Klassen
% who's the guy with partitioning?
  Alexandros Kalousis, Stefan Kramer, Rich Maclin, Dragos Margineantu,
  Johann Petrak, Foster Provost, Lup\v{c}o Todorovski, Gerhard Widmer,
  and the maintainers of and contributors to the UCI collection of
  databases for discussions, programs, databases, and/or pointers to
  relevant literature. Thanks are also due to the anonymous reviewers
  and the editor, in particular for the careful comments on the
  theoretical section of this paper.
  
  The Austrian Research Institute for Artificial Intelligence is
  supported by the Austrian Federal Ministry of Education, Science and
  Culture. The author is supported by the Austrian \emph{Fonds zur
    F\"orderung der Wissenschaftlichen Forschung (FWF)} under grant
  no.\ P12645-INF.}  


%and the ESPRIT long-term project METAL (26.357).


% Thanks: Lupco, Eibe, Stefan K., Metal (data), JP in particular
% Will for \ripper\ and the maintainers of UCI + the individual databases
% METAL

%\bibliography{ensembles,theory,ml,decision_trees,my,kdd,nn,ilp,math,lig,metal,rules}

\begin{thebibliography}{49}
\expandafter\ifx\csname natexlab\endcsname\relax\def\natexlab#1{#1}\fi
\expandafter\ifx\csname url\endcsname\relax
  \def\url#1{{\tt #1}}\fi

\bibitem[Allwein et~al.(2000)Allwein, Schapire, and Singer]{ReducingMulticlass}
E.~L. Allwein, R.~E. Schapire, and Y.~Singer.
\newblock Reducing multiclass to binary: A unifying approach for margin
  classifiers.
\newblock {\em Journal of Machine Learning Research}, 1:\penalty0 113--141,
  2000.

\bibitem[Anand et~al.(1995)Anand, Mehrotra, Mohan, and Ranka]{ModularNN}
R.~Anand, K.~G. Mehrotra, C.~K. Mohan, and S.~Ranka.
\newblock Efficient classification for multiclass problems using modular neural
  networks.
\newblock {\em IEEE Transactions on Neural Networks}, 6:\penalty0 117--124,
  1995.

\bibitem[Angulo and Catal\`a(2000)]{PairwiseSVM-ThreeWay}
C.~Angulo and A.~Catal\`a.
\newblock {$K$-SVCR}. A multi-class support vector machine.
\newblock In R.~{L\'opez de M\'antaras} and E.~Plaza (eds.) {\em Proceedings
  of the 11th European Conference on Machine Learning (ECML-2000)}, pp.\
  31--38. Springer-Verlag, 2000.

\bibitem[Bauer and Kohavi(1999)]{Ensembles-Comparison}
E.~Bauer and R.~Kohavi.
\newblock An empirical comparison of voting classification algorithms: Bagging,
  boosting, and variants.
\newblock {\em Machine Learning}, 36:\penalty0 105--169, 1999.

\bibitem[Bay(1999)]{MultipleFeatureSets-kNN}
S.~D. Bay.
\newblock Nearest neighbor classification from multiple feature subsets.
\newblock {\em Intelligent Data Analysis}, 3\penalty0 (3):\penalty0 191--209,
  1999.

\bibitem[Blake and Merz(1998)]{UCI}
C.~L. Blake and C.~J. Merz.
\newblock {UCI} repository of machine learning databases.
\newblock \url{http://www.ics.uci.edu/~mlearn/MLRepository.html}, 1998.
\newblock Department of Information and Computer Science, University of
  California at Irvine, Irvine CA.

\bibitem[Breiman(1996)]{Bagging}
L.~Breiman.
\newblock Bagging predictors.
\newblock {\em Machine Learning}, 24\penalty0 (2):\penalty0 123--140, 1996.

\bibitem[Breiman et~al.(1984)Breiman, Friedman, Olshen, and Stone]{Cart}
L.~Breiman, J.~Friedman, R.~Olshen, and C.~Stone.
\newblock {\em Classification and Regression Trees}.
\newblock Wadsworth \& Brooks, Pacific Grove, CA, 1984.

\bibitem[Clark and Boswell(1991)]{CN2-improvements}
P.~Clark and R.~Boswell.
\newblock Rule induction with {CN2}: Some recent improvements.
\newblock In {\em Proceedings of the 5th European Working Session on Learning
  (EWSL-91)}, pp.\ 151--163, Porto, Portugal, 1991. Springer-Verlag.

\bibitem[Clark and Niblett(1989)]{CN2}
P.~Clark and T.~Niblett.
\newblock The {CN2} induction algorithm.
\newblock {\em Machine Learning}, 3\penalty0 (4):\penalty0 261--283, 1989.

\bibitem[Cohen(1995)]{Ripper}
W.~W. Cohen.
\newblock Fast effective rule induction.
\newblock In A.~Prieditis and S.~Russell (eds.) {\em Proceedings of the 12th
  International Conference on Machine Learning (ML-95)}, pp.\ 115--123, Lake
  Tahoe, CA, 1995. Morgan Kaufmann.

\bibitem[Cohen and Singer(1999)]{Slipper}
W.~W. Cohen and Y.~Singer.
\newblock A simple, fast, and effective rule learner.
\newblock In {\em Proceedings of the 16th National Conference on Artificial
  Intelligence ({AAAI}-99)}, pp.\ 335--342, Menlo Park, CA, 1999. AAAI/MIT
  Press.

\bibitem[Cortes and Vapnik(1995)]{SupportVectorNetworks}
C.~Cortes and V.~Vapnik.
\newblock Support-vector networks.
\newblock {\em Machine Learning}, 20\penalty0 (3):\penalty0 273--297, 1995.

\bibitem[Dietterich(1997)]{ML-FourDirections}
T.~G. Dietterich.
\newblock Machine learning research: Four current directions.
\newblock {\em AI Magazine}, 18\penalty0 (4):\penalty0 97--136, Winter 1997.

\bibitem[Dietterich(1998)]{5x2CV}
T.~G. Dietterich.
\newblock Approximate statistical tests for comparing supervised classification
  learning algorithms.
\newblock {\em Neural Computation}, 10\penalty0 (7):\penalty0 1895--1924, 1998.

\bibitem[Dietterich(2000{\natexlab{a}})]{EnsembleMethods}
T.~G. Dietterich.
\newblock Ensemble methods in machine learning.
\newblock In J.~Kittler and F.~Roli (eds.) {\em First International Workshop
  on Multiple Classifier Systems}, pp.\ 1--15. Springer-Verlag,
  2000{\natexlab{a}}.
% \newblock URL \url{ftp://ftp.cs.orst.edu/pub/tgd/papers/mcs-ensembles.ps.gz}.

\bibitem[Dietterich(2000{\natexlab{b}})]{Randomization}
T.~G. Dietterich.
\newblock An experimental comparison of three methods for constructing
  ensembles of decision trees: Bagging, boosting, and randomization.
\newblock {\em Machine Learning}, 40\penalty0 (2):\penalty0 139--158,
  2000{\natexlab{b}}.

\bibitem[Dietterich and Bakiri(1995)]{ErrorCorrectingOutputCodes}
T.~G. Dietterich and G.~Bakiri.
\newblock Solving multiclass learning problems via error-correcting output
  codes.
\newblock {\em Journal of Artificial Intelligence Research}, 2:\penalty0
  263--286, 1995.

\bibitem[Feelders and Verkooijen(1995)]{ComparativeStudies}
A.~Feelders and W.~Verkooijen.
\newblock Which method learns most from the data? {Methodological} issues in
  the analysis of comparative studies.
\newblock In {\em Proceedings of the 5th International Workshop on Artificial
  Intelligence and Statistics}, pp.\ 219--225, Fort Lauderdale, Florida, 1995.

\bibitem[Freund and Schapire(1997)]{AdaBoost}
Y.~Freund and R.~E. Schapire.
\newblock A decision-theoretic generalization of on-line learning and an
  application to boosting.
\newblock {\em Journal of Computer and System Sciences}, 55\penalty0
  (1):\penalty0 119--139, 1997.

\bibitem[Friedman(1996)]{PolychotomousClassification-TR}
J.~H. Friedman.
\newblock Another approach to polychotomous classification.
\newblock Technical report, Department of Statistics, Stanford University,
  Stanford, CA, 1996.
% \newblock URL \url{http://www-stat.stanford.edu/~jhf/ftp/poly.ps.Z}.
% \newblock To appear in \textit{Machine Learning}.

\bibitem[F{\"u}rnkranz(1997)]{jf:MLJ}
J.~F{\"u}rnkranz.
\newblock Pruning algorithms for rule learning.
\newblock {\em Machine Learning}, 27\penalty0 (2):\penalty0 139--171, 1997.

\bibitem[F{\"u}rnkranz(1999{\natexlab{a}})]{jf:IDA-99}
J.~F{\"u}rnkranz.
\newblock Exploiting structural information for text classification on the
  {WWW}.
\newblock In D.~Hand, J.~N. Kok, and M.~Berthold (eds.) {\em Advances in
  Intelligent Data Analysis: Proceedings of the 3rd International Symposium
  (IDA-99)}, pp.\ 487--497, Amsterdam, Netherlands, 1999{\natexlab{a}}.
  Springer-Verlag.
% \newblock URL
%  \url{http://wieden.ai.univie.ac.at/~juffi/publications/ida-99.ps.gz}.

\bibitem[F{\"u}rnkranz(1999{\natexlab{b}})]{jf:AI-Review}
J.~F{\"u}rnkranz.
\newblock Separate-and-conquer rule learning.
\newblock {\em Artificial Intelligence Review}, 13\penalty0 (1):\penalty0
  3--54, 1999{\natexlab{b}}.

\bibitem[F{\"u}rnkranz(2001)]{jf:ICML-01}
J.~F{\"u}rnkranz.
\newblock Round robin rule learning.
\newblock In C.~E. Brodley and A.~P. Danyluk (eds.) {\em Proceedings of the
  18th International Conference on Machine Learning (ICML-01)}, pp.\ 146--153,
  Williamstown, MA, 2001{\natexlab{b}}. Morgan Kaufmann Publishers.

\bibitem[F{\"u}rnkranz(2002)]{jf:OEFAI-TR-2001-30}
J.~F{\"u}rnkranz.
\newblock Hyperlink ensembles: A case study in hypertext
classification.
\newblock \emph{Information Fusion}. Special Issue on Fusion of
Multiple Classifiers, 2002. To appear.
%\newblock Technical Report OEFAI-TR-2001-30, Austrian Research Institute for
%  Artificial Intelligence, Wien, Austria, 2001{\natexlab{a}}.
% \newblock URL \url{http://www.oefai.at/cgi-bin/tr-online?number+2001-30}.

\bibitem[F{\"u}rnkranz and Widmer(1994)]{jf:ML-94}
J.~F{\"u}rnkranz and G.~Widmer.
\newblock {Incremental Reduced Error Pruning}.
\newblock In W.~Cohen and H.~Hirsh (eds.) {\em Proceedings of the 11th
  International Conference on Machine Learning (ML-94)}, pp.\ 70--77, New
  Brunswick, NJ, 1994. Morgan Kaufmann.

\bibitem[Hastie and Tibshirani(1998)]{PairwiseCoupling}
T.~Hastie and R.~Tibshirani.
\newblock Classification by pairwise coupling.
\newblock In M.~I. Jordan, M.~J. Kearns, and S.~A. Solla (eds.) {\em Advances in
  Neural Information Processing Systems 10 (NIPS-97)}, pp.\ 507--513. MIT
  Press, 1998.
% \newblock URL \url{http://www-stat.stanford.edu/~hastie/Papers/2class.ps}.

\bibitem[Hsu and Lin(2002)]{MultiClass-SVM-Comparison}
C.-W. Hsu and C.-J. Lin.
\newblock A comparison of methods for multi-class support vector machines.
\newblock {\em IEEE Transactions on Neural Networks}, 2002.
\newblock To appear.

\bibitem[Kalousis and Theoharis(1999)]{NOEMON}
A.~Kalousis and T.~Theoharis.
\newblock Noemon: Design, implementation and performance results of an
  intelligent assistant for classifier selection.
\newblock {\em Intelligent Data Analysis}, 3\penalty0 (5):\penalty0 319--337,
  1999.
% \newblock URL
%  \url{http://cui.unige.ch/AI-group/metal/restricted/Papers/IDA99.ps}.

\bibitem[Knerr et~al.(1990)Knerr, Personnaz, and Dreyfus]{StepwiseNN}
S.~Knerr, L.~Personnaz, and G.~Dreyfus.
\newblock Single-layer learning revisited: A stepwise procedure for building
  and training a neural network.
\newblock In F.~{Fogelman Souli\'e} and J.~H\'erault (eds.) {\em
  Neurocomputing: Algorithms, Architectures and Applications}, volume F68 of
  {\em NATO ASI Series}, pp.\ 41--50. Springer-Verlag, 1990.

\bibitem[Knerr et~al.(1992)Knerr, Personnaz, and
  Dreyfus]{StepwiseNN-DigitRecognition}
S.~Knerr, L.~Personnaz, and G.~Dreyfus.
\newblock Handwritten digit recognition by neural networks with single-layer
  training.
\newblock {\em IEEE Transactions on Neural Networks}, 3\penalty0 (6):\penalty0
  962--968, 1992.

\bibitem[Kolen and Pollack(1991)]{NN-RandomInitialization}
J.~F. Kolen and J.~B. Pollack.
\newblock Back propagation is sensitive to initial conditions.
\newblock In {\em Advances in Neural Information Processing Systems 3
  (NIPS-90)}, pp.\ 860--867. Morgan Kaufmann, 1991.

\bibitem[Kre{\ss}el(1999)]{PairwiseClassification}
U.~H.-G. Kre{\ss}el.
\newblock Pairwise classification and support vector machines.
\newblock In B.~Sch{\"o}lkopf, C.~Burges, and A.~Smola (eds.) {\em Advances
  in Kernel Methods: Support Vector Learning}, chapter~15, pp.\ 255--268. MIT
  Press, Cambridge, MA, 1999.

\bibitem[Krieger et~al.(2001)Krieger, Wyner, and Long]{BaggedBoosting-2}
A.~Krieger, A.~J. Wyner, and C.~Long.
\newblock Boosting noisy data.
\newblock In C.~E. Brodley and A.~P. Danyluk (eds.) {\em Proceedings of the
  18th International Conference on Machine Learning (ICML-2001)}, pp.\ 
  274--281. Williamstown, MA, 2001. Morgan Kaufmann Publishers.

\bibitem[Lu and Ito(1999)]{PairwiseNN}
B.-L. Lu and M.~Ito.
\newblock Task decomposition and module combination based on class relations: A
  modular neural network for pattern classification.
\newblock {\em IEEE Transactions on Neural Networks}, 10\penalty0 (5):\penalty0
  1244--1256, 1999.

\bibitem[Mayoraz and Alpaydin(1999)]{MultiClass-SVM}
E.~Mayoraz and E.~Alpaydin.
\newblock Support vector machines for multi-class classification.
\newblock In J.~Mira and J.~V. S\'anchez-Andr\'es (eds.) {\em Engineering
  Applications of Bio-Inspired Artificial Neural Networks: Proceedings of the
  International Work-Conference on Artificial and Natural Neural Networks
  (IWANN-99), Volume II}, pp.\ 833--842, Alicante, Spain, 1999.
  Springer-Verlag.

\bibitem[Mayoraz and Moreira(1997)]{PolychotomyDecomposition}
E.~Mayoraz and M.~Moreira.
\newblock On the decomposition of polychotomies into dichotomies.
\newblock In D.~H. Fisher (ed.) {\em Proceedings of the 14th International Conference on Machine
  Learning (ICML-97)}, pp.\ 219--226, Nashville, TN, 1997. Morgan Kaufmann.

\bibitem[McNemar(1947)]{McNemar-Test}
Q.~McNemar.
\newblock Note on the sampling error of the difference between correlated
  proportions or percentages.
\newblock {\em Psychometrika}, 12:\penalty0 153--157, 1947.

\bibitem[Moreira and Mayoraz(1998)]{PairwiseClassification-Correcting}
M.~Moreira and E.~Mayoraz.
\newblock Improved pairwise coupling classification with correcting
  classifiers.
\newblock In C.~N\'{e}dellec and C.~Rouveirol (eds.) {\em
  Proceedings of the 10th European Conference on Machine Learning
  (ECML-98)}, pp.\ 160--171, Chemnitz, Germany, 1998. Springer-Verlag.

\bibitem[Opitz and Maclin(1999)]{PopularEnsembles}
D.~Opitz and R.~Maclin.
\newblock Popular ensemble methods: An empirical study.
\newblock {\em Journal of Artificial Intelligence Research}, 11:\penalty0
  169--198, 1999.

\bibitem[Pfahringer(2000)]{BaggedBoosting}
B.~Pfahringer.
\newblock Winning the {KDD99} classification cup: Bagged boosting.
\newblock {\em SIGKDD explorations}, 1\penalty0 (2):\penalty0 65--66, 2000.

\bibitem[Platt et~al.(2000)Platt, Cristianini, and Shawe-Taylor]{PairwiseDAGs}
J.~C. Platt, N.~Cristianini, and J.~Shawe-Taylor.
\newblock Large margin {DAGs} for multiclass classification.
\newblock In S.~A. Solla, T.~K. Leen, and K.-R. M{\"u}ller (eds.) {\em
  Advances in Neural Information Processing Systems 12 (NIPS-99)}, pp.\
  547--553. MIT Press, 2000.
% \newblock URL \url{http://lara.enm.bris.ac.uk/cig/gzipped/2000/dagsvm.ps.gz}.

\bibitem[Price et~al.(1995)Price, Knerr, Personnaz, and
  Dreyfus]{PairwiseNN-Probabilistic}
D.~Price, S.~Knerr, L.~Personnaz, and G.~Dreyfus.
\newblock Pairwise neural network classifiers with probabilistic outputs.
\newblock In G.~Tesauro, D.~Touretzky, and T.~Leen (eds.) {\em Advances in
  Neural Information Processing Systems 7 (NIPS-94)}, pp.\ 1109--1116. MIT
  Press, 1995.

\bibitem[Pyle(1999)]{DataPreparation}
D.~Pyle.
\newblock {\em Data Preparation for Data Mining}.
\newblock Morgan Kaufmann, San Francisco, CA, 1999.

\bibitem[Quinlan(1993)]{C4.5}
J.~R. Quinlan.
\newblock {\em C4.5: {P}rograms for {M}achine {L}earning}.
\newblock Morgan Kaufmann, San Mateo, CA, 1993.

\bibitem[Quinlan(1996)]{BaggingBoostingC4.5}
J.~R. Quinlan.
\newblock Bagging, boosting, and {C4.5}.
\newblock In {\em Proceedings of the 13th National Conference on
  Artificial Intelligence (AAAI-96)}, pp.\ 725--730. AAAI/MIT Press, 1996.

\bibitem[Rivest(1987)]{DecisionLists}
R.~L. Rivest.
\newblock Learning decision lists.
\newblock {\em Machine Learning}, 2:\penalty0 229--246, 1987.

\bibitem[Schapire(1997)]{AdaBoost.OC}
R.~E. Schapire.
\newblock Using output codes to boost multiclass learning problems.
\newblock In  D.~H. Fisher (ed.) {\em Proceedings fo the 14th International Conference on Machine
  Learning (ICML-97)}, pp.\ 313--321, Nashville, TN, 1997. Morgan Kaufmann.

\bibitem[Schapire and Singer(1999)]{AdaBoost-Confidences}
R.~E. Schapire and Y.~Singer.
\newblock Improved boosting algorithms using confidence-rated predictions.
\newblock {\em Machine Learning}, 37\penalty0 (3):\penalty0 297--336, 1999.

\bibitem[Schmidt(1996)]{SVM-SpeakerIdentification-2}
M.~S. Schmidt.
\newblock Identifying speakers with support vector networks.
\newblock In {\em Proceedings of the 28th Symposium on the Interface
  (INTERFACE-96)}, Sydney, Australia, 1996.
% \newblock URL \url{http://www.stat.uga.edu/~lynne/symposium/paper1i3.ps.gz}.

\bibitem[Schmidt and Gish(1996)]{SVM-SpeakerIdentification}
M.~S. Schmidt and H.~Gish.
\newblock Speaker identification via support vector classifiers.
\newblock In {\em Proceedings of the 21st IEEE International Conference Conference
  on Acoustics, Speech, and Signal Processing (ICASSP-96)}, pp.\/ 105--108,
\newblock Atlanta, GA, 1996.

\bibitem[Tesauro(1989)]{lig*Tesauro89b}
G.~Tesauro.
\newblock Connectionist learning of expert preferences by comparison training.
\newblock In D.~Touretzky (ed.) {\em Advances in Neural Information
  Processing Systems 1 (NIPS-88)}, pp.\ 99--106. Morgan Kaufmann, 1989.

\bibitem[Utgoff and Clouse(1991)]{lig*Utgoff91}
P.~E. Utgoff and J.~Clouse.
\newblock Two kinds of training information for evaluation function learning.
\newblock In {\em Proceedings of the 9th National Conference on Artificial
  Intelligence (AAAI-91)}, pp.\ 596--600, Anaheim, CA, 1991. AAAI Press.

\bibitem[Weston and Watkins(1999)]{PiecewiseLinearSVM}
J.~Weston and C.~Watkins.
\newblock Support vector machines for multi-class pattern recognition.
\newblock In M.~Verleysen (ed.) {\em Proceedings of the 7th European
  Symposium on Artificial Neural Networks (ESANN-99)}, pp.\ 219--224, Bruges,
  Belgium, 1999.

\bibitem[Witten and Frank(2000)]{Weka}
I.~H. Witten and E.~Frank.
\newblock {\em Data Mining --- Practical Machine Learning Tools and Techniques
  with Java Implementations}.
\newblock Morgan Kaufmann Publishers, 2000.

\end{thebibliography}


% Additional work for a journal version
% - take one set and empirically determine growth function of Ripper
 % and R3  by learning from increasing training sets
% - Partitioning!!!
% - Bostroem

% Work for the CRC:
% - include sth on ordered classification (maybe experiments)
% - include Inuzuka in related work

\end{document}

%(setf c5r '(75.08
%  5.84
% 24.77
%  2.90
% 51.79
%  5.04
%  2.98
% 13.16
% 15.69
%  4.90
%  6.73
%  1.14
%  0.69
%  0.74
% 29.20
% 19.49
% 40.63))


%(setf r3 '(74.34
% 2.26
%25.70
% 3.46
%53.11
% 3.74
% 2.76
%10.35
%15.77
% 5.04
% 6.30
% 1.11
% 0.53
% 1.01
%29.08
%18.69
%41.78))


%(setf c5 '( 78.48
%  7.58
% 35.05
%  3.20
% 51.22
%  9.20
%  3.09
% 13.82
% 15.77
%  4.90
%  9.66
%  1.11
%  0.58
%  0.72
% 26.24
% 21.72
% 43.26))

%(setf ripper '(  81.18
%  12.15 
%  34.58 
%   4.29 
%  61.39 
%   9.48 
%   3.38 
%  13.04 
%  15.91 
%   5.47 
%   8.79 
%   1.49 
%   0.56 
%   0.98 
%  30.38 
%  27.07 
%  42.39 ))