\documentclass[twoside,11pt]{article}
\usepackage{jmlr2e}
\usepackage{subfigure}
\usepackage{array}
\usepackage{amsmath}
\usepackage{floatflt}
\usepackage{multirow}
\usepackage[english,polish]{babel}
\usepackage{color}


% begin Ela --------------------------------------------------
% this is to print a Polish letter correctly in LaTeX
\newdimen\plWleft
\newdimen\plWdown
\newdimen\plWright
\newdimen\plWtemp
\def\sob#1#2#3#4#5{%parameters: letter and fractions hl,ho,vl,vo
  \setbox0\hbox{#1}\setbox1\hbox{$_\mathchar'454$}\setbox2\hbox{p}%
  \plWright=#2\wd0 \advance\plWright by-#3\wd1
  \plWdown=#5\ht1 \advance\plWdown by-#4\ht0
  \plWleft=\plWright \advance\plWleft by\wd1
  \plWtemp=-\plWdown \advance\plWtemp by\dp2 \dp1=\plWtemp
  \leavevmode
  \kern\plWright\lower\plWdown\box1\kern-\plWleft #1}

\DeclareTextCommand{\eob}{OT1}{\sob e{.50}{.35}{0}{.93}}
% end Ela --------------------------------------------------


\newcommand{\bm}[1]{\mbox{\boldmath$#1$}}
\newlength{\figsize}
\setlength{\figsize}{0.3\textwidth}

\newenvironment{smallitem1}
  {\vspace*{-0.22cm}\begin{list}{-}{\setlength{\itemsep}{-2pt plus 1pt minus 1pt}}}
  {\end{list}}
\newenvironment{smallitem2}
  {\vspace*{-0.22cm}\begin{list}{$\bullet$}{\setlength{\itemsep}{-2pt plus 1pt minus 1pt}}}
  {\end{list}}


\jmlrheading{2}{2001}{175-211}{03/01}{12/01}
{El\.zbieta P\eob kalska, Pavel Pacl\'{\i}k and Robert P.W. Duin}
\ShortHeadings{Dissimilarity-based Classification}
{P\eob kalska, Pacl\'{\i}k \& Duin}
\firstpageno{175}


\begin{document}

\selectlanguage{english}

\title{A Generalized Kernel Approach to Dissimilarity-based Classification}


\author{\name El\.zbieta P\eob kalska
        \email ela@ph.tn.tudelft.nl
        \AND 
        \name Pavel Pacl\'{\i}k
        \email pavel@ph.tn.tudelft.nl
        \AND
        \name Robert P.W. Duin
        \email duin@ph.tn.tudelft.nl\\
        \addr Pattern Recognition Group, Faculty of Applied Sciences\\
         Delft University of Technology\\
         Lorentzweg 1, 2628 CJ Delft, The Netherlands}

\editor{Nello Cristianini, John Shawe-Taylor, Robert Williamson}
\maketitle


\begin{abstract}%
Usually, objects to be classified are represented by features. 
In this paper, we discuss an alternative object representation based on
dissimilarity values. If such distances separate the classes well,
the nearest neighbor method offers a good solution. However, dissimilarities 
used in practice are usually far from ideal and the performance of the 
nearest neighbor rule suffers from its sensitivity to noisy examples. 
We show that other, more global classification techniques are preferable 
to the nearest neighbor rule, in such cases.

For classification purposes, two different ways of using generalized 
dissimilarity kernels are considered. In the first one, distances 
are isometrically embedded in a pseudo-Euclidean space and the classification 
task is performed there. In the second approach, classifiers are built 
directly on distance kernels. Both approaches are described theoretically 
and then compared using experiments with different dissimilarity measures 
and datasets including degraded data simulating the problem of missing values.
\end{abstract}


\begin{keywords}
dissimilarity, embedding, pseudo-Euclidean space, nearest mean
classifier, support vector classifier, Fisher linear discriminant
\end{keywords}


\section{Introduction}
In this paper, we study the design of classifiers, directly based on a
given set of dissimilarities between objects using a generalized kernel
approach. Kernels are often understood as symmetric, positive definite
functions of two variables, and thereby, they express similarity between
objects represented in a feature space. Here, we propose to address a
kernel in a more general way, i.e. as a proximity measure.  The important
difference is that a kernel describes now a proximity relation between two
objects, which may not be explicitly represented in a feature space, or
which may come from two different feature spaces. An example is a
dissimilarity measure defined between shapes or contours, or images of
different sizes. A suitable choice can be e.g. variants of the Hausdorff
distance, comparing two sets of points \citep{Dubuisson}. In such a case,
there is no need to define a common feature space, where the objects are
represented. It means that there might be no functional dependence given
specifically. We simply assume that a dissimilarity representation is just given.

Since a lot of attention has been paid to similarity kernels, this paper 
is devoted to classification aspects using dissimilarity (or distance) 
kernels. Here, we want to emphasize the importance of recognition tasks 
for which dissimilarity kernels are built directly on images or shapes. 
Therefore, no feature space is (or needs to be) originally defined, but
dissimilarities arise directly from the application. Examples are 
\citet{Jain} and \citet{Jacobs}. 
Therefore, concerning the notation, if we generally refer to objects, 
like $r$, $s$ or $x$, they will {\it not\/} be printed in bold, to 
emphasize that they are not (or might not) be feature vectors. The distance 
kernel represents the information in a relative way, i.e. through pairwise
dissimilarity relations between objects. The goal now is to learn {\em only\/} 
from such relational data, i.e. without any use (different than computing 
distances if necessary) of a starting original representation. The question 
studied here is, therefore, how to learn from data, given only dissimilarity 
kernels. 

Recently, \citet{Scholkopf} has proposed to treat kernels as generalized 
distance measures and also strengthened the link to algorithms based on 
positive definite kernels, such as the support vector classifier (SVC) 
\citep{Vapnik,Scholkopf1} or the kernel principal component analysis
\citep{Scholkopf0,Scholkopf1}. There is, however, an essential difference 
with our approach. Sch{\"o}lkopf starts his reasoning from a feature space 
representation, where, in our case, this is not possible; we start 
a learning task from a dissimilarity kernel. For a more elaborate discussion, 
see Section \ref{sec:scholkopf}.

Our principal question is how, given a dissimilarity kernel, a recognition
problem can be tackled. To this end, we collected a set of methods which
can be used by a researcher. In the paper we analyze the problem and
illustrate possible solutions. Given a good dissimilarity measure, the
$k$-nearest neighbor ($k$-NN) classifier is expected to perform well. It
is, however, difficult to build such a measure for a complex recognition
problem. In case of imperfect dissimilarities, the $k$-NN rule suffers 
from its sensitivity to noisy examples, but more global classifiers can 
perform better. How to design them is the goal of this research.

Two distinct approaches for classification tasks are studied in this
paper. In the first one, a dissimilarity representation is isometrically
embedded in a feature space, as presented in Section \ref{sec:embed}.
This is always possible for a finite representation, although such a
space may be pseudo-Euclidean \citep{Goldfarb2}. Objects mapped to such a
space preserve the structure of the data revealed in original
distances. It means that if the dissimilarity measure defines classes
which are bounded and compact, the configuration found in an underlying
feature space should reflect those properties. An underlying feature space
is constructed in the process of embedding. For this purpose, both the
compactness hypothesis and its reverse should hold (see 
Section \ref{sec:compactness}). If the dimensionality of such a feature 
space is low, then a classification task can be easily performed.

In the second approach, a dissimilarity kernel is interpreted as a mapping
based on a chosen representation set $R$ of objects \citep{Duin2, Pekalska2},
as presented in Section \ref{sec:class2}. In this formulation, classifiers 
can be constructed directly on the dissimilarity kernels, as in dissimilarity 
spaces.

The paper is organized as follows. Section~\ref{sec:compactness} discusses
dissimilarities in the light of the compactness hypothesis, in general.
Section~\ref{sec:embed} presents some mathematical formulation
of the linear embedding problem. Section~\ref{sec:class1} gives more
details on building classifiers in the embedded space.
Section~\ref{sec:class2} addresses the classification problem 
by the direct use of dissimilarity kernels. 
Sections~\ref{sec:exp} and \ref{sec:exp2} describe the experiments 
conducted, and discuss the results. Conclusions are summarized in 
Section~\ref{sec:concl}.


\section{On dissimilarity measures}\label{sec:compactness}
In general, a classification problem can be solved based on the so-called 
compactness hypothesis \citep{Arkadev, Duin3}, which states that 
objects that are similar, are also close in their representations.
Effectively, this puts a constraint on the dissimilarity measure $d$, which 
has to be such that $d (r, s)$ is small if objects $r$ and $s$ are similar, 
i.e. it should be much smaller for similar objects than for objects that 
are very different.
For feature representations, the above does not hold the other way around: 
two entirely different objects may have the same feature representation. 
This does not cause a problem if these feature values are improbable for all 
or for all but one of the classes. For a dissimilarity kernel, however, 
the reverse of the compactness hypothesis also holds provided that 
the dissimilarity measure $d$ poses some continuity. If we demand that $d(r, s) = 0$, 
if and only if objects $r$ and $s$ are identical, this implies that they 
belong to the same class. This can be extended somewhat by assuming that 
all objects $s$ with a small distance to the object $r$, i.e. for which 
$d(r,s) < \epsilon$ for a positive $\epsilon$ being sufficiently small, 
are so similar to $r$, that they belong to the same class. Consequently, 
the dissimilarities of $r$ and $s$ to all objects $x$ under consideration should 
be about the same, i.e. $d (r,x) \approx d (s,x)$.
We conclude, therefore, that for dissimilarity representations satisfying
the above continuity, the reverse of the compactness hypothesis holds: 
objects that are similar in their representation are also similar in reality 
and belong, thereby, to the same class \citep[see][]{Duin4}.
As a result, classes do not overlap and the classification error may become 
zero for large training sets.

In order to interpret further such a hypothesis, let us recall the notion of 
metric. A~distance measure $d$ is called a metric when the following
conditions are fulfilled:
\begin{smallitem2}
\item reflectivity, i.e. $d(x,x) = 0$ 
\item positivity, i.e. $d(x,y) > 0$ if $x$ is distinct from $y$
\item symmetry, i.e. $d(x,y) = d(y,x)$
\item triangle inequality, i.e. $d(x,y) < d(x,z) + d(z,y)$ for every $z$
\end{smallitem2}
Basically, reflectivity and positivity are crucial to define a proper
dissimilarity measure. We do not accept a dissimilarity measure which is zero 
for two different objects, because it would violate the compactness hypothesis. 
On the other hand, negative dissimilarities would be difficult to interpret. 
Therefore, reflectivity and positivity conditions should always be fulfilled. 
If a distance measure is a metric, then the assumption on the continuity is 
fulfilled and, thereby, also the reverse of the compactness hypothesis by the 
means of symmetry and triangle inequality. However, whether the last two conditions 
are necessary for a dissimilarity kernel is disputable. 
Non-metric distances often seem to arise when shapes or objects in images
are compared by template matching or when other type of distances are built e.g.
in computer vision \citep{Dubuisson, Jacobs}. It has been also argued 
\citep{Tversky,Jacobs} that the symmetry constraint might be too strong, 
especially when dissimilarities come from psychological judgments. 
Therefore, in some methods, asymmetric distances are also permitted.
In this paper, symmetric distance measures not obeying the triangle inequality,
and therefore not proper metrics, are included. 


\section{Linear embedding of dissimilarities}\label{sec:embed}
There is a number of ways to embed dissimilarity data in a feature space.
Since, we are interested in a faithful configuration, a (non-)linear embedding 
is performed such that the distances are preserved as well as possible. 
Since nonlinear projections require more computational effort and, moreover,
the way of projecting new points to the existing configuration is not 
straightforward (or not defined), the linear mappings are preferable.
Here, isometric embeddings are considered.  

\subsection{Embedding of Euclidean distances}
Let the representation set $R = \{p_1,p_2,\ldots,p_n\}$ refer to $n$ objects
\citep{Duin2, Pekalska2}. Given a Euclidean distance 
matrix $D \in {\cal{R}}^{n \times n}$ between those objects, a distance 
preserving mapping onto a Euclidean space can be found. 
Such a projection is known in the literature as a {\it classical scaling\/}
or a {\it metric multidimensional scaling\/} \citep{Young,Borg,Cox}.
In other words, the dimensionality $k \le n$ and the configuration 
$X \in {\cal{R}}^{n \times k}$ can be found such that the (squared) Euclidean 
distances are preserved. Note that having determined one configuration, another 
one can be found by a rotation or a translation. To remove the last degree 
of freedom, without loss of generality, the mapping will be constructed such 
that the origin coincides with the centroid (i.e. the mean vector) of the 
configuration $X$.\footnote[1]{It is also possible that any arbitrary point of 
$X$ would become the centroid.}

To define $X$, the relation between Euclidean distances and inner products
are used. First, it can be proven that  
\begin{equation*}
  D^{(2)} = \bm{b} \, \bm{1}^T + \bm{1} \, \bm{b}^T - 2\, B,
\end{equation*}
\citep[see][Appendix \ref{sec:app_dist_prods}]{Borg}, 
where $D^{(2)}$ is a matrix of square Euclidean distances, $B$ is the matrix of 
inner products of the underlying configuration $X$, i.e. $B = X\, X^T$ and 
$\bm{b}$ is a vector of the diagonal elements of $B$. $B$ can also be expressed as:
\begin{equation}
   B = -\frac{1}{2} \, J \, D^{(2)} \, J,
\label{Bis0}
\end{equation}
where $J$ is the centering matrix 
$J = I - \frac{1}{n} \bm{1} \, \bm{1}^T \in {\cal {R}}^{n \times n}$
and $I$ is the identity matrix. $J$ projects\footnote[2]
{A more general projection can be achieved, imposing that a
weighted mean becomes zero, by which $J = I - \bm{w} \, \bm{1}^T$, where $\bm{w}$
is a weight vector, such that $\bm{w} ^T \bm{1} = 1$ and 
$B = - \frac{1}{2}\,J^T \, D^{(2)} J$.} the data such that the final configuration
has zero mean. $B$ is positive definite since it is a Gram matrix \citep{Greub}. 
Then, the factorization of $B$ by its eigendecomposition can be found as:
\begin{equation}
   X \, X^T = B = Q \, \Lambda \, Q^T,
\label{eigen}
\end{equation}
where $\Lambda$ is a~diagonal matrix with the diagonal consisting of the first
non-negative eigenvalues, ranked in descending order, followed by the zero values, 
and $Q$ is an orthogonal matrix of the corresponding eigenvectors \citep{Borg}. 
For $k \le n$ non-zero eigenvalues, a $k$-dimensional representation $X$ can be 
then found as:
\begin{equation}
   X = Q_k \, \Lambda_k^{\frac{1}{2}},  \qquad Q_k \in {\cal{R}}^{n \times k},\quad
       \Lambda_k^{\frac{1}{2}} \in {\cal{R}}^{k \times k},
\label{Xis}
\end{equation}
where $Q_k$ is a matrix of the first $k$ leading eigenvectors and 
$\Lambda_k^{\frac{1}{2}}$ contains the square roots of the corresponding 
eigenvalues. Note that $X$, determined in this procedure, is unique up to 
rotation (the centroid is now fixed), since for any orthogonal matrix $T$, 
$X \, X^T = (XT) \, (XT)^T$. Note also, that features of $X$ are 
uncorrelated, since the estimated covariance matrix of $X$ becomes:
\begin{equation} 
Cov(X) = \frac{1}{n-1} \,X^T \, X = \frac{1}{n-1} \, \Lambda_k^{\frac{1}{2}} \, Q_k^T \, 
Q_k \, \Lambda_k^{\frac{1}{2}} = \frac{1}{n-1} \,\Lambda_k,
\label{Covis}
\end{equation} 
because $Q_k$ is orthogonal.
 
 
\subsection{Embedding of non-Euclidean distances}
The matrix $B = -\frac{1}{2} \, J \, D^{(2)} \, J$ is positive definite 
if and only if the distance matrix $D \in {\cal{R}}^{n \times n}$ is Euclidean 
\citep{Borg,Gower,Gower2}. Therefore, for a non-Euclidean $D$, $B$ 
is not positive definite, i.e. $B$ has {\it negative\/} eigenvalues. 
As a result, $X$ cannot be constructed from $B$, since it relies on the square 
roots of eigenvalues; see Formula (\ref{Xis}). Two approaches are possible 
to address this problem in a Euclidean space: 
\begin{smallitem2}
\item Only $p$ positive eigenvalues are taken into account, resulting in a
$p$-dimensional configuration $X=Q_p \, \Lambda_p^{\frac{1}{2}}$ ($p<k$),
for which distances approximate the original ones. Since the distances
are positive, the largest negative eigenvalues in magnitude, are smaller
than the largest positive eigenvalues. Also the sum of the positive
eigenvalues is larger than the sum of magnitudes of the negative ones.

A justification for choosing only positive eigenvalues can be found in
Section \ref{sec:reduced}, where the issue of noise influence is discussed.
We argue that, in general, directly measured distances may be noisy, and
therefore, they may not be perfectly Euclidean, which will result in small
negative eigenvalues of $B$; see Figure \ref{fig:line} for an illustration.
Therefore, by disregarding them, noise can be diminished.

\item
It is known that there exists a positive constant 
$c \ge 2\,|\lambda|$, where $\lambda$ is the smallest (negative) eigenvalue of $B$,
such that a new square Euclidean distance matrix might be created from $D^{(2)}$ 
by adding $c$ to the off-diagonal elements \citep{Cox,Gower}, i.e.:
\begin{equation}
D^{(2)}_{n} = D^{(2)} + c \, (\bm{1} \, \bm{1}^T - I).
\label {Dnewis}
\end{equation}
Then, $X$ can be again expressed in a Euclidean space. In practice, the 
eigenvectors remain the same and the value $\frac{c}{2}$ is added to the non-zero 
eigenvalues, giving the new eigenvalue matrix $\Lambda_k + \frac{c}{2}\,I$. This 
is equivalent to regularizing the covariance matrix of our configuration $X$, 
i.e. $Cov(X)= \frac{1}{n-1}\,(\Lambda_k + \frac{c}{2}\,I)$ and changing 
$X$ respectively. %Note that $D^{(2)}$ is distorted significantly when $c$ is large. 
\end{smallitem2}
%
\begin{figure}[t]
\begin{center}
   \includegraphics[width=8.7cm]{line_illustration}
   \\[-4mm]
   \caption{An example on the usefulness of disregarding the negative eigenvalues.
     Assume that the theoretical, original data is a perfect line, however
     due to the measurement process, somewhat distorted, as observed in the
     first plot. The distance kernel $D$ is computed with distances $d_{ij}
     = ||\bm{x}_i - \bm{x}_j||^{1.004}$, which is nearly Euclidean. During
     the embedding process $16$ eigenvalues are revealed, where $14$ are
     negative (the largest negative in magnitude equals $-0.042$). This
     would suggest a possible $16$-dimensional configuration, however, one
     significant positive eigenvalue indicates that the `real' intrinsic
     dimensionality of the data is $1$ (for a Euclidean distance and a
     perfect line, the embedded configuration is $1$-dimensional). The
     second plot shows the projection onto first two dimensions (the
     configuration is retrieved up to a rotation). The last plot presents
     the projection onto the 1st and the 3rd dimensions, where the 3rd
     dimension corresponds to the largest (in magnitude) negative eigenvalue
     (how the embedding is done in such a case is explained by Formula
     \ref{Xpsis}).   
	  Notice how a tiny change of both the theoretical data and the Euclidean 
	  distance of a very simple problem enlarges the number of retrieved 
     dimensions, from $1$, in the perfect case, to $16$.
     }
  \label{fig:line}
  \end{center}
  \vspace*{-1cm}
\end{figure}
%
These two approaches transform the problem, so that a configuration $X$ can
be defined again in a Euclidean space. This is especially useful when the
negative eigenvalues are relatively small in magnitude, which suggest, on
the other hand, that the original distance measure is close to Euclidean.
In such cases, those negative eigenvalues can be interpreted as a noise
contribution. If the negative eigenvalues are relatively large, then by
neglecting them, possibly important information has been rejected. There is
still an open question about the consequences on classification tasks of
transforming the problem into a Euclidean space, either by neglecting the
negative eigenvalues or by directly enlarging $D^{(2)}$ by a constant.

Another possibility exists for problems in which the Euclidean space 
is not `large enough'. \citet*{Goldfarb1, Goldfarb2} proposed to project the data 
onto a pseudo-Euclidean space, which can be performed for any symmetric distance 
matrix. The pseudo-Euclidean space can be seen as consisting of 
two Euclidean spaces, for which the inner product operation is positive definite 
on the first space and negative definite on the second one 
\citep[see][Appendix Appendix \ref{sec:app_ps}]{Greub}. For a simple 
illustration, see Figure \ref{fig:R11}. To embed the data, the same reasoning 
as for a Euclidean space is applied here. The essential difference refers to 
the notion of an inner product and a distance. Now, 
$B = -\frac{1}{2} \, J \, D^{(2)} \, J$, is still the matrix of inner products, 
but it is expressed as (see Appendix \ref{sec:app_ps}):
\begin{equation}
   B = X\, M \, X^T, 
\end{equation}
where $M$ is a matrix of the inner product operation in a pseudo-Euclidean space.
Following \citet*{Goldfarb2}, we can write (compare to Formula \ref{eigen}):
\begin{equation}
   X\, M \, X^T = B = Q \, 
   \Lambda \, Q^T = Q \, |\Lambda|^{\frac{1}{2}} \, 
        \begin{bmatrix} 
        M & \\ & 0
        \end{bmatrix} 
   \, |\Lambda|^{\frac{1}{2}} \, Q^T,
\mbox{~~where~~} 
M = 
\begin{bmatrix}
I_{p \times p} & 0  \\
0 & - I_{q \times q},\\
\end{bmatrix}
\end{equation}
and $p+q=k$. $\Lambda$ is now based on $p$ positive and $q$ negative
eigenvalues, which are presented in the following order: first, positive
eigenvalues with decreasing values, then negative ones with decreasing
magnitude and finally, zero values. Therefore, $X$ can be now represented
in a pseudo-Euclidean space ${\cal R}^k = {\cal R}^{(p,q)}$ of, the
so-called, signature $(p,q)$ \citep{Goldfarb1}, as follows:
\begin{equation}
   X = Q_{k} \, |\Lambda_{k}|^{\frac{1}{2}}.
\label{Xpsis}
\end{equation}
Note that $X$ has uncorrelated features, since the estimated pseudo-Euclidean
covariance matrix (which is not positive definite as in a Euclidean space) 
is given as \citep{Goldfarb2}:
\begin{equation}
Cov(X) = \frac{1}{n-1} \,X^T \, X\, M = 
\frac{1}{n-1} \,|\Lambda_k| M = \frac{1}{n-1} \, \Lambda_k.
\label{Covpis}
\end{equation}
This means that $X$ is a result of a mapping in the sense of the PCA projection
and the whole embedding procedure can be also interpreted as a sort of 
a kernel-PCA \citep{Scholkopf0,Scholkopf} approach, where the kernel $B$ is 
a reproducing kernel for the pseudo-Euclidean feature space (see Section 
\ref{sec:scholkopf}). 


\begin{figure}[t]
\begin{center}
   \includegraphics[width=4.7cm]{R11}
   \\[-5mm]
   \caption{A pseudo-Euclidean space $R^{(1,1)}$, where
            $d^2(\bm{x},\bm{y}) = (\bm{x} - \bm{y})^T M (\bm{x} -\bm{y})$.
            Here, the length of any vector of the form $[x_1\ \pm x_1]^T$, 
            is zero. The orthogonal vectors are mirrored w.r.t.
            the lines $x_2=x_1$ or $x_2=-x_1$, e.g. $\langle OA,OC\rangle = 0$.
            Vector $\bm{v}$ defines the plane $\bm{v}^T M \bm{x}=0$ in this space.
            Vector $\bm{w} = M \, \bm{v}$, a `flipped' version of $\bm{v}$,
            describes the plane as if in a Euclidean space, i.e. it is perpendicular. 
            This explains that in any pseudo-Euclidean space, the inner product 
            operation can be seen as a Euclidean operation where one vector is `flipped' 
            by $M$. In general, distances can be of any sign, e.g.:
            $d^2(A,C) = d^2(F,G) = 0$, $d^2(A,B) = 1$, $d^2(B,C) = -1$, 
            $d^2(D,A) = -8$, $d^2(F,A) = 21$ and $d^2(E,C) = 8$.
}
  \label{fig:R11}
  \end{center}
  \vspace*{-1.2cm}
\end{figure}

Computing square distances in a pseudo-Euclidean space ${\cal R}^{(p,q)}$
can be interpreted as computing the square Euclidean distance in a `positive' space 
${\cal R}^p$ and subtracting the square Euclidean distance found in a `negative' 
space ${\cal R}^q$ (see Appendix \ref{sec:app_ps}). The distances computed 
only in the `positive' space are overestimated, therefore, the purpose of the
`negative' space is to correct them, e.g. make them be non-Euclidean. 
Since pseudo-Euclidean spaces, we are going to use, will result from the
embedding process of positive distances, the contribution of the `negative'
space ${\cal R}^q$ to the overall distances is smaller than of the space
${\cal R}^p$. In such a case, due to the construction of $X$, $X$ has
relatively small feature values corresponding to the space ${\cal R}^q$.
Practice confirms that many measured distances are close to the Euclidean one,
giving relatively small negative eigenvalues in the embedding procedure. 
Note also, that the inner product $\bm{v}^T M \, \bm{x}$ can be interpreted 
as a traditional, Euclidean inner product $\bm{w}^T \bm{x}$, where 
$\bm{w} = M \bm{v}$. 
Therefore, having found a pseudo-Euclidean configuration $X$, a linear
classifier $y = \bm{v}^T M \bm{x} + v_0$ can be found by addressing it as
in a standard, Euclidean case, i.e. $y = \bm{w}^T \bm{x} + v_0$.


\subsection{Embedding new points}
Having found a configuration $X$ in a pseudo-Euclidean space that 
preserves all pairwise distances $D(R,R)$, new objects can be can 
added to this space via the linear projection.
Given the square distance matrix $D_{n}^{(2)} \in {\cal{R}}^{s \times n}$, 
expressing dissimilarities between $s$ novel objects and all objects of 
the representation set $R$, a configuration $X_{n}$ is to be determined 
in a pseudo-Euclidean space ${\cal R}^k ={\cal R}^{(p,q)}$. First, 
the matrix $B_{n} \in {\cal{R}}^{s \times n}$ of inner products relating 
all new objects to all objects from $R$ should be found, which becomes  
(see Appendix \ref{sec:app_adding}):
\begin{equation}
   B_{n} = -\frac{1}{2} (D_n^{(2)} \, J - U \,D^{(2)} \, J ),
\label{Bnewis}
\end{equation}
where $J$ is the centering matrix 
and $U = \frac{1}{n} \, \bm{1}^T \bm{1} \in {\cal{R}}^{s \times n}$.
Since $B_n$ can be expressed as:
$$
  X_n \, M \, X^T = B_n, \quad \mbox{with}
$$ 
\begin{equation}
M = 
\begin{cases}
I \in {\cal{R}}^{k \times k} & \text{if the space~} {\cal R}^k \text{~is Euclidean,} \\
\begin{bmatrix}
I_{p \times p} & 0 \\
0 & - I_{q \times q} \\
\end{bmatrix}
\in {\cal{R}} ^{k \times k} & \text{if the space~} {\cal R}^k \text{~is pseudo-Euclidean,} \\
\end{cases} \label{Mis}
\end{equation}
therefore, $X_n$ is found as the mean-square error solution to 
$X_n \, M \, X^T = B_n$, i.e. $X_n = B_n \, X \, (X^T \, X)^{-1} \, M$. 
Knowing that $X^T \, X = |\Lambda|$ and $X = Q_k \, |\Lambda_k|^{\frac{1}{2}}$,
$X_n$ is alternatively presented as:
\begin{equation*} 
X_n = B_n \, X \, |\Lambda|^{-1} \, M \mbox{~~or~~}
X_n = B_n \, Q_k \, |\Lambda_k|^{-\frac{1}{2}} \, M.
\end{equation*} 


\subsection{Reduction of dimensionality}\label{sec:reduced}
By adding one object to the representation set $R$, (and, therefore, to the 
dissimilarity kernel $D(R,R)$), in practice one point is added to a finite 
pseudo-Euclidean space, but the dimensionality $k$ of the vector representation 
might increase by more than one, contrary to the Euclidean case \citep{Goldfarb2}. 
This means that both outliers and noise can
contribute significantly to the resulting dimensionality $k$. In practice, 
when new points are added, they are projected onto the space determined 
by the starting configuration $X$. Therefore, the reliability of $X$, i.e. 
whether $D(R,R)$ describes a sufficiently well sampled representation set, 
plays an essential role in the process of representing new data, 
and consequently, the classification performance.

\begin{figure}[t]
\begin{center}
   \includegraphics[width=13cm]{noise_illustration}
   \\[-3mm]
   \caption{Noise influence on eigenvalues of $B$. 
            The first, leftmost plot presents the 2D theoretical banana data 
            (consisting of $200$ points), for which the Euclidean
            distance matrix $D$ has been computed. The second plot shows
            the result of the embedding of $D$ into a 2D space (note that the retrieved
            configuration is up to a rotation). 
            The third plot presents the projection onto the first $2$ dimensions of 
            the $199$D data obtained via embedding of distorted distances $\tilde{D}$, 
            (where $\tilde{d}_{ij} = d_{ij} + s_{ij}$ and $s_{ij} \thicksim N(0,0.001)$),
            which become non-Euclidean.
            The last plot presents the projection onto the first $2$ dimensions of 
            the 4D data obtained via embedding of $D(\tilde{A})$, where $\tilde{A}$ 
            consists of the theoretical data $A$ to which $2$ noisy features were added. 
            Note that the first $2$ largest eigenvalues, as given under the graphs, 
            are about the same for non-distorted as well as for distorted data, which 
            gives practically the same results in all cases. Therefore, by rejecting 
            relatively small eigenvalues, noise is diminished.}
  \label{fig:noise}
  \end{center}
  \vspace*{-1.2cm}
\end{figure}


Originally, the (pseudo-)Euclidean configuration $X$ is found such that
the distances are preserved exactly 
and the dimensionality of $X$ is determined by the number of non-zero
eigenvalues of $B$. However, there might be many relatively small non-zero
eigenvalues as compared to the large ones. Knowing that dissimilarities are
noisy measurements, the small eigenvalues correspond to non-significant 
directions of $X$. In such a framework, neglecting small eigenvalues stands 
for reducing noise contribution (see Figure~\ref{fig:noise} for an 
illustration) or for finding a representation with the intrinsic dimension. 

In such a case, distances will be preserved approximately. One has,
however, a control over the dimensionality of the reduced vector
representation. Basically, the dimensionality reduction can be achieved by
the orthogonal projection, governed by the PCA.
The particular construction of $X = Q_{k} \,|\Lambda_{k}|^{\frac{1}{2}}$ 
and the fact that $X$ is an uncorrelated vector representation, i.e.
$Cov(X) =\frac{1}{n-1} \, \Lambda_k$, stand for $X$ being given in the form
of the orthogonal PCA projection; see Formula (\ref{Covpis}). It means that 
the reduction of dimensionality 
is performed in a simple way by neglecting directions corresponding to 
eigenvalues small in magnitude. The reduced representation (being an orthogonal 
projection) is then determined by the $p'$ significant positive eigenvalues and 
$q'$ significant (in magnitude) negative eigenvalues. Therefore, 
$X' \in {\cal{R}}^{n \times k'}$, $k' < k$, is found as 
$X' = Q_{k'} \, |\Lambda_{k'}|^{\frac{1}{2}}$, where $k' = p' + q'$ and 
$\Lambda_{k'}$ is a diagonal matrix of first, decreasing positive eigenvalues 
and then increasing negative eigenvalues, and $Q_{k'}$ is the matrix of 
corresponding eigenvectors. 


\section{Classification in a pseudo-Euclidean space}\label{sec:class1}
The symmetric dissimilarity kernel $D$ can be seen as a description of an underlying,
lower-dimensional vector representation $X$. If $X$ is determined in a Euclidean 
space, then any traditional classifier can be constructed in such a space.
If $X$ happens to be a pseudo-Euclidean representation, then the conventional 
classifiers should be redefined.
Here, we limit ourselves to simple, linear classification rules: nearest mean 
classifier, Fisher linear discriminant and support vector classifier,
but, more advanced classifiers can be built as well.

\subsection{Generalized nearest mean classifier}
Nearest mean classifier (NMC) is the simplest linear classifier which assigns
an unknown object to a class of the nearest mean. In a (pseudo-)Euclidean
space ${\cal{R}}^k$, such a decision rule is based on the (pseudo-)Euclidean 
distance. Given $D$, assume a $2$-class problem with the classes $\omega_1$ 
and $\omega_2$.
The vector representation $\{\bm{x}_1,\ldots,\bm{x}_{n}\}$ is such that it 
preserves the originally considered squared distances $D^{(2)}$. Let 
$\overline{\bm{x}}_{(i)}$ be the mean vector of the class $\omega_i$. 
For a new object $z$ represented in this space, as $\bm{z}_x$, the classification 
rule is defined as:
$$
\begin{array}{rl}
\mbox{Assign~} z \mbox{~to~} \omega_1 &  \mbox{iff~} 
d^2(\bm{z}_x,\overline{\bm{x}}_{(1)}) <  d^2(\bm{z}_x,\overline{\bm{x}}_{(2)}) \\
\mbox{Assign~} z \mbox{~to~} \omega_2 &  \mbox{otherwise}
\label{rule_nmc}
\end{array}
$$
where 
$d^2 (\bm{x}, \bm{y}) = ||\bm{x} - \bm{y}||^2 = \, 
\langle\bm{x}-\bm{y},\,\bm{x}-\bm{y}\rangle 
\, = (\bm{x} - \bm{y})^T \, M \, (\bm{x} - \bm{y})$ 
and $M$ is defined by Formula (\ref{Mis}). 
Embedding a dissimilarity kernel $D$ can be avoided when only class mean vectors 
and distances have to be computed. Therefore, we propose an alternative approach. 
A similar classification process can be carried out, however, without performing 
the exact mapping. As a result, the generalized nearest mean classifier (GNMC) 
will be obtained.
 
Assume first that a class $\omega$ is represented by a distance matrix 
$D(R,R)$ based on the representation set $R = \{p_1,\ldots,p_{n}\}$.
Let a new object $z$ be represented by distances to the set $R$. 
Then, the proximity of $z$ to the class $\omega$ is measured by 
the function $f_{\omega}$ defined as:
\begin{equation}
\begin{array}{lcl}
f_{\omega} (z) &=& \frac{1}{n} \, \sum_{j=1}^n \, d^2 (z, p_j) - V_d (R),\\[3mm]
V_d (R)  &=& \frac{1}{2n^2} \, \sum_{j=1}^n \, \sum_{k=1}^n \, d^2 (p_j,p_k),
\end{array}
\label{fomega}
\end{equation}
where $V_d(R)$ is a generalized variability of the underlying feature space.
It can be shown (see Appendix \ref{sec:app_variability}) that the following holds:
 \begin{equation}
\begin{array}{lcl}
f_{\omega} (z)  &=& ||\bm{z}_x - \overline{\bm{x}}||^2 = d^2 (\bm{z}_x, \overline{\bm{x}}),\\[2mm]
V_d (X) &=& \frac{1}{n} \, \sum_{j=1}^n ||\bm{x}_j||^2 - ||\overline{\bm{x}}||^2,
\label{vd_omega}
\end{array}
\end{equation}
where $\{\bm{x}_1,\ldots,\bm{x}_{n}\}$ is found by a linear embedding, as 
introduced in Section \ref{sec:embed} and $\bm{z}_x$ is the representation of 
the object $z$ in space~${\cal R}^k$.

From dependencies given in (\ref{vd_omega}), two important observations can 
be made.
The first one refers to $V_d$ expressing a variability in $X$. If $X$ is a
$1$-dimensional vector, then $V_d$ coincides with the variance of $X$. For
$X$ being a higher dimensional Euclidean representation, $V_d$ is equal to
the sum of variances, i.e. $\mbox{trace}\,(cov(X))$. For a pseudo-Euclidean
space, $V_{d}$ stands for a generalized variability, based on the
pseudo-Euclidean covariance matrix; see Formula (\ref{Covpis}). The second
observation refers to the function $f_{\omega}(z)$ which measures the
distance of the point $\bm{z}_x$ to the mean of the class $\omega$, both
represented in the space ${\cal R}^k$. The interesting point is that such a
distance can be computed without performing the embedding process
explicitly, since it operates only on the given distances $D$, as presented
in (\ref{fomega}).

This result allows us to define a generalized nearest mean classifier as follows:
\begin{equation}
\mbox{Assign~} z \mbox{~to~} \omega_j: f_{\omega_j} (z) = \min_i \{ f_{\omega_i}(z) \},
\label{rule_gnmc}
\end{equation}
where $f_{\omega_i}$ is defined as in (\ref{fomega}). In other words, $z$ is 
assigned to a class of the nearest mean (centroid), where each centroid is 
described in an underlying space defined by the within-class distances. 
Note that in the formulation given by (\ref{rule_gnmc}), the classification 
rule holds for any number of classes.

The NMC and the GNMC in a pseudo-Euclidean space are not, in general, identical 
classifiers. The NMC finds a linear embedding onto a feature space ${\cal R}^k$
based on the whole distance matrix $D^{(2)}$. Therefore, the dimensionality of 
such a space is determined by both the within-class and between-class distances.
The GNMC operates only on the within-class distances. Although the embedding is 
not performed directly, the GNMC works in the underlying feature spaces 
${\cal R}^k_{\omega_i}$, for each class separately. It may happen that the 
signatures of feature spaces ${\cal R}^k_{\omega_i}$ are not the same. In such 
a case, the performances of the the NMC and the GNMC differ, because the 
NMC unifies pseudo-Euclidean space and the signature for all classes, while the 
GNMC treats them separately, which allows to describe them properly. 
Since the GNMC makes use of the distinct signatures, its accuracy is expected to 
be higher for problems in which the classes behave differently.


\subsection{Fisher linear discriminant}
The linear classifier (or a separating hyperplane) in a pseudo-Euclidean space
${\cal R}^k = {\cal{R}}^{(p,q)}$ is defined as follows \citep{Greub,Goldfarb2}:
\begin{equation}
f (\bm{x}) = \bm{v}^T \, M \, \bm{x} + v_0, \quad \mbox{where~~} 
M = 
\begin{bmatrix}
I_{p \times p} & 0  \\
0 & - I_{q \times q}\\
\end{bmatrix}
\label{lc_pseudo}
\end{equation}
To construct the Fisher linear discriminant (FLD), the notion of 
a pseudo-Euclidean covariance matrix is needed. For the representation $X$, 
it is defined as \citep{Goldfarb2}:
%
\begin{equation*}
cov(X) = \frac{1}{n-1}\,  \left [ \sum_{i=1}^n  (\bm{x}_i - \overline{\bm{x}}) \, 
(\bm{x}_i - \overline{\bm{x}})^T \right ] M, \text{~~where~~} 
\overline{\bm{x}} = \frac{1}{n} \sum_{i=1}^n \bm{x}_i.
\end{equation*}
%
Making use of the above definition and following \citet*{Goldfarb2}, the 
FLD, obtained by maximizing the ratio of between-scatter to within-scatter 
(Fisher criterion) \citep{Fukunaga}, for a $2$-class problem is given by:
\begin{equation*}
\begin{array}{rcl}
\bm{v} &=& M \, C_W^{-1} \, (\bm{m}_1 - \bm{m}_2),n \\[0.2cm]
v_0    &=& -\frac{1}{2} \, (\bm{m}_1 + \bm{m}_2)^T \, \underbrace{M \, M}_{= I} 
           \, C_W^{-1} \, (\bm{m}_1 - \bm{m}_2),
\end{array}
\end{equation*}
where $C_W\, M$ is the pooled within-class covariance matrix 
in a pseudo-Euclidean space ($C_W$ is the pooled within-class covariance 
matrix as computed for the Euclidean case) and $\bm{m}_1$ and $\bm{m}_2$ stand 
for the class means. The FLD in a pseudo-Euclidean space can be constructed as 
the hyperplane $f(\bm{x}) = \bm{w}^T  \bm{x} + v_0$, where $\bm{w} = M \bm{v}$
and $\bm{w}^T  \bm{x}$ refers to a Euclidean inner product. See
Figure \ref{fig:fisher} as an illustration of a simple problem.

\begin{figure}[t]
\begin{center}
   \includegraphics[width=14.9cm]{ps_classf_illustration}
   \\[-3mm]
   \caption{An illustration of the decision boundary of the FLD in an embedded space. 
            The leftmost plot presents a 2D theoretical, artificial data. 
            There are $3$ training points, marked by circles. 
            Only $3$ objects are taken for training, because then the data can be 
            perfectly embedded in not more than $2$ dimensions. 
            The remaining points, marked by `+' and `*' belong to the examples 
            of testing data, which illustrate how the new objects are projected on 
            the retrieved (pseudo-)Euclidean space.
            The following plots show the embedding results of the $L_p$ 
            distance $D$, where $d_{ij} = (\sum_{k=1}^2 \, |x_{ik}-x_{jk}|^p)^{1/p}$, 
            for $p = \{0.6, 0.9,1.5,2\}$. For positive $p$ smaller than $1$,
            the $L_p$ distance is not metric and in such cases the distances 
            between close objects are emphasized. In all subplots, 
            the FLD, determined by the $3$ circles, in the original (first 
            subplot) or the embedded spaces is drawn. For $p=2$, the original, theoretical 
            data is retrieved up to a rotation.
}
  \label{fig:fisher}
  \end{center}
  \vspace*{-1cm}
\end{figure}


\subsection{Support vector classifier}\label{sec:svc}
Let $n$ points $\bm{x}_i$, $i=1,2,\ldots,n$ be given in a Euclidean space 
${\cal R}^k$. Each point $\bm{x}_i$ belongs to one of two classes as
described by the corresponding label $y_i \in \{-1,1\}$. 
The goal for non-overlapping classes is to find the optimal hyperplane:
$f (\bm{x}) = \bm{w}^T \bm{x} + w_0$, which maximizes the margin, i.e.
$\frac{2}{||\bm{w}||^2}$ \citep{Vapnik,Burges} (or, alternatively, minimizes 
$||\bm{w}||^2$). For non-linearly separable classes, non-negative slack 
variables $\xi_i$ are introduced, allowing for classification errors, so 
that the soft margin linear support vector classifier (SVC) \citep{Vapnik,Burges} 
is found as the solution of the quadratic programming procedure: 
\begin{equation}
 \begin{array} {lll}
 \mbox {Minimize}   & \frac{1}{2} \bm{w}^T \bm{w} + C\, \sum_{i=1}^n \xi_i & \\[0.1cm]
 \mbox {s.t.~ }     & y_i \, ( \bm{w}^T \bm {x}_i +  w_0) \ge 1 - \xi_i, & \quad i=1,2,\ldots,n\\[0.1cm]
                    & \xi_i \ge 0&
 \end {array} 
\label{SVC}
\end{equation}
%
The term $\sum_{i=1}^n \xi_i$ is an upper bound on the misclassification of the 
training samples and $C$ can be regarded as a regularization parameter, a trade-off 
between the number of errors and the width of the margin. 
The dual programming formulation is given as follows:
\begin{equation}
 \begin{array} {lll}
 \mbox {Maximize}   & -\frac{1}{2}\, 
\bm{\alpha}^T \, \text{diag}(\bm{y}) \, K \, \text{diag}(\bm{y}) \,\bm{\alpha} + \bm{\alpha}^T \bm{1} &\\[0.1cm]
 \mbox {s.t.~ }     & \bm{\alpha}^T \, {\bm y} = 0& \\[0.1cm]
                    & 0 \le \alpha_i \le C, \quad \quad i=1,2,\ldots,n &
 \end {array} 
\label{dual}
\end{equation}
where $K$ is an $n \times n$ kernel matrix such that $K_{ij} = \langle\bm{x}_i,\bm{x}_j\rangle$.
After solving the problem (\ref{dual}), the weight vector $\bm{w}$ is found to be
a linear combination of the data vectors, giving
$\bm{w} = \sum_{i=1}^n \alpha_i \, y_i \, \bm{x}_i$.
Since many $\alpha_i$ become zero, the data points $\bm{x}_i$ with positive $\alpha_i$,
are the so-called support vectors. Only they contribute to the hyperplane equation.
As a result, the discrimination function can be presented in terms of inner products 
as follows:
\begin{equation}
  f(\bm{x})  =  \sum_{\alpha_i > 0} \alpha_i \, y_i \, \langle\bm{x}, \bm{x}_i\rangle + w_0. 
\label{svcis}
\end{equation}
Note that in the training stage, the kernel matrix $K$ becomes $K = X \, X^T$, 
which equals $B$, the matrix of inner products (see Formula \ref{Bis0}), i.e. $K=B$. 

Since the linear SVC is based only on inner products, and by the linear relation 
(\ref{Bis0}) between $D^{(2)}$ and $B$ ($=K$), the SVC can be easily constructed in 
the underlying feature space without performing the embedding explicitly, provided 
that the distances are Euclidean. For novel objects, represented by the distances 
to the representation set $R$, the SVC can be immediately tested by 
using $B_n$, the matrix of inner products between new objects and objects
embedded originally, as provided by Formula (\ref{Bnewis}).

For a non-Euclidean dissimilarity matrix $D$, the matrix of inner products is not
positive definite, resulting in a pseudo-Euclidean space, The configuration $X$ 
is given by (\ref{Xpsis}), i.e. $X = Q \, |\Lambda|^{\frac{1}{2}}$, for which
a linear classifier is defined by (\ref{lc_pseudo}).
If we now assign $\bm{w}^T = \bm{v}^T \, M$, then the classifier 
$f (\bm{x}) = \bm{w}^T \bm{x} + v_0$ can be treated in a Euclidean space. 
The operation of $\bm{v}^T \, M$ is seen as flipping the values
of the $\bm{w}$ vector in all `negative' directions of the pseudo-Euclidean 
space. As suggested by \citet {Graepel2}, this is equivalent to
flipping the negative eigenvalues to positive ones, and considering the
inner product $B' = Q \, |\Lambda| \, Q^T$ in a Euclidean space as 
being positive definite.

Summarizing, for a dissimilarity kernel $D$, the SVC classifier can 
be built in the underlying feature space as follows. First, the matrix
$B$ is computed according to (\ref{Bis0}). If $B$ is not positive definite,
then the matrix $B'$ is computed as $B' = Q \, |\Lambda| \, Q^T$, 
otherwise $B' = B$. It describes a positive definite kernel,  
used to construct the SVC according to (\ref{svcis}). 
Note also that any polynomial SVC classifier \citep{Vapnik} can be built 
by using $B'$ directly.


\subsection{Discussion on the kernel trick for distances}\label{sec:scholkopf}
Recently, \citet{Scholkopf} has considered kernels as generalized dissimilarity 
measures. His reasoning starts from the observation that a Mercer kernel 
$K$ \citep{Vapnik}, i.e. a positive definite kernel, can be seen as 
a (nonlinear) generalization of the similarity measure based 
on inner products. This is possible because such a kernel 
can be expressed as an inner product operation in some high-dimensional 
feature space ${\cal G}$, i.e.
$K(\bm{x},\bm{y}) = \langle\phi(\bm{x}), \phi(\bm{y})\rangle$, where $\phi$ 
is a mapping, $\phi: {\cal F} \rightarrow {\cal G}$ and $\phi(\bm{x})$ is 
the image of $\bm{x}$ in the space ${\cal G}$. Following the same idea, 
Sch{\"o}lkopf considers a generalization of the squared Euclidean distance 
in the space ${\cal G}$, by using, the so-called {\it kernel trick\/}:
$$
||\phi(\bm{x}) - \phi(\bm{y})||^2 = 
K(\bm{x},\bm{x}) + K(\bm{y},\bm{y}) - 2\,K(\bm{x},\bm{y}),
$$ 
which allows to express this distance only by using the kernel, without explicitly 
performing the mapping. Next, Sch{\"o}lkopf argues that a larger class 
of kernels, namely the {\it conditionally positive definite} kernels, can 
be used. A symmetric function $K: {\cal F} \times {\cal F} \rightarrow {\cal R}$ 
which for all $m \in {\cal N}$, all vectors $\bm{c} \in {\cal R}^{m \times 1}$
and all $\bm{x}_i \in {\cal F}$ fulfills:
\begin{equation}
  \bm{c}^T \, K \, \bm{c} \ge 0, \quad \mbox{where~~} K_{ij} = K (\bm{x}_i, \bm{x}_j) 
\label{p.d.}
\end{equation}
is called a positive definite (p.d.\!) kernel. When the inequality (\ref{p.d.}) 
is satisfied for $\bm{c}$ such that $\bm{c}^T \bm{1} = 0$, the kernel is called
conditionally positive definite (c.p.d.). A relation between the p.d. and c.p.d.\! 
kernels can be established as (`$\dag$' stands for a conjugate transpose):
\begin{equation}
\tilde{K} = \frac{1}{2} \, (I - \bm{1}\bm{w}^{\dag}) \, K \, (I - \bm{w}\bm{1}^{\dag}), 
\quad \mbox{where~~} \bm{w}\bm{1}^{\dag} = 1
\label{Ktilde}
\end{equation}
i.e. $\tilde{K} \in {\cal R}^{n \times n}$ is p.d.\! if and only if $K$ is c.p.d.\!

In the simplest case, $c_i=\frac{1}{n}$ for all $i$, (\ref{Ktilde}) is then 
equal to Formula (\ref{Bis0}), if we read $B=\tilde{K}$ and $D^{(2)} = -K$. 
This means that $D$ is a Euclidean distance matrix, if and only if $- D^{(2)}$
is a c.p.d.\! kernel. 

For $K$, being a c.p.d.\! kernel, with $K(\bm{x},\bm{x}) = 0$ for all 
$\bm{x} \in {\cal F}$, there exists a Hilbert space ${\cal H}$ of real-valued 
functions on ${\cal F}$ and a mapping $\phi: {\cal F} \rightarrow {\cal H}$, 
such that
\begin{equation}
||\phi(\bm{x}) - \phi(\bm{y})||^2 = - K(\bm{x},\bm{y}).
\label{phidist}
\end{equation}
This supports the fact that a Euclidean distance kernel $D$ can be embedded 
in a Euclidean space, which is an example of a Hilbert space. 
This is justified by $K = - D^{(2)}$ being a c.p.d.\! kernel. 
This means that in this paper a particular case of the mapping $\phi$ is 
considered.

More generally, Sch{\"o}lkopf proves that for a real-valued, symmetric kernel 
$\tilde{K}$, there exists a linear space ${\cal H}$ (which might not be a Hilbert 
space) of real-valued functions on ${\cal F}$, endowed with a symmetric, bilinear 
form $Q(\cdot,\cdot)$ and a mapping $\phi: {\cal F} \rightarrow {\cal H}$, such that
$$
  \tilde{K}(\bm{x},\bm{y}) = Q (\phi(\bm{x}),\phi(\bm{y})).
$$
$\tilde{K}$ is then a reproducing kernel for a feature space ${\cal H}$ 
\citep{Wahba}.

According to the definition, a pseudo-Euclidean space \citep{Greub}is 
equipped with a non-degenerate, indefinite, symmetric bilinear form 
$Q = \langle\cdot,\cdot\rangle$, seen as a generalized inner product 
(see Appendix \ref{sec:app_ps}). This justifies that for a non-Euclidean 
distance kernel $D$, $\tilde{K} = B = \frac{1}{2} \, J \, K \, J$, 
where $K = - D^{(2)}$ (see Formula \ref{Bis0}), is a reproducing kernel 
for a pseudo-Euclidean feature space ${\cal H}$. 

Sch{\"o}lkopf argues also that the c.p.d.\! kernels $K$ are `{ a natural 
choice whenever we are dealing with a translation invariant problem}', 
where the SVC or the kernel-PCA are of an example. From Formula (\ref{phidist}) 
it follows that $-K$ is the squared Euclidean distance in some Hilbert 
space. 

In summary, Sch{\"o}lkopf provides a new framework for distance based algorithms.
The squared Euclidean distance can now be realized in another feature space 
by using a suitable kernel function, which should be conditionally positive 
definite. Using Formula (\ref{Ktilde}), a c.p.d.\! kernel can be transformed 
into a p.d.\! kernel, to which, again, the kernel algorithms could be applied.
In this way, Sch{\"o}lkopf's work provides a mathematical context of our
approach of embedding distances. It gives more information on relations
between the c.p.d.\! kernels and p.d.\! kernels, or in other words, it names
the class of c.p.d.\! kernels that can be isometrically embedded in a Euclidean 
space. However, Sch{\"o}lkopf starts from a given feature space 
and a known dissimilarity measure. We assume that a distance kernel is given 
implicitly by a dataset, maybe not even knowing what type of a measure it is 
or how it was computed. One of our approaches 
relies on a linear embedding, which is a particular case of a mapping $\phi$ 
considered by Sch{\"o}lkopf. While Sch{\"o}lkopf focuses mostly on a mathematical 
formulations, we try to study how the methods work in practice.
 

\section{Classification on dissimilarities}\label{sec:class2}
The second approach, mentioned in the introduction, addresses the dissimilarity 
kernel as a mapping defined by the representation set $R=\{p_1,\ldots,p_{n}\}$.
A~mapping $D(z,R): {\cal F} \rightarrow {\cal R}^n$ is defined as
$D(z,R) = [d(z,p_1)\, d(z,p_2)\, \ldots\, d(z,p_n)]^T$. Notice that ${\cal F}$ 
expresses an original feature space of objects, which might not be given explicitly. 
The dimensionality of such a~dissimilarity space is controlled by the size of $R$. 
Using this formulation, classifiers can be constructed directly on the 
dissimilarity kernels, as in the dissimilarity space. 

\begin{figure}[t]
\begin{center}
   \includegraphics[width=13.2cm]{dissm_space_illustration}
   \\[-3mm]
   \caption{A simple illustration of a 2D dissimilarity space. 
            The first and third plots show the theoretical, artificial data, 
            with a quadratic classifier. The $L_p$ distance $D$, where 
            $d_{ij} = (\sum_{k=1}^2 \, |x_{ik}-y_{jk}|^{5/2})^{2/5}$ was computed for
            this data. The representation set consist of two objects, i.e. 
            $R = [p_1,p_2]$. The second and the fourth plots present the dissimilarity
            spaces $D(\cdot,R)$, where the representative objects are marked by 
            circles on the first and third plots. Note that if $R$ is well chosen,
            a linear classifier on a dissimilarity kernel $D(\cdot,R)$ 
            separates the data very well.
}
  \label{fig:dissm_sp}
  \end{center}
  \vspace*{-1cm}
\end{figure}

A justification to construct classifiers in dissimilarity space is as follows.
The property that distances should be small for similar objects, i.e. 
belonging to the same class, and larger for objects of different classes, 
gives a possibility for a discrimination and, thereby, $D(\cdot,p_i)$, defined
by the distances to the representative $p_i$, can be interpreted as a feature. 
If, $p_i$ is a characteristic object of a particular class, then the 
discrimination power of $D(\cdot,p_i)$ can be large. On the other hand, if 
$p_i$ is a non-typical object of its class, then the $D(\cdot,p_i)$ may 
discriminate poorly. 

Defining a well-discriminating dissimilarity measure for a non-trivial 
recognition problem is difficult. On the other hand, when a good distance
measure is derived, we almost solved our classification problem.
It is still a challenge, especially if such a measure should preferably 
incorporate invariances. Building such a measure is equivalent to defining 
good features for a traditional classification problem. If a good measure 
can be found, then the $k$-nearest neighbor ($k$-NN) method is expected 
to perform well provided that $D$ is metric or nearly metric. 
The decision of the $k$-NN is based on local neighborhoods only and it is,
in general, sensitive to noise. It means that $k$ nearest neighbors found 
might not include the best representatives of a class to which an object 
should be assigned. Moreover, the $k$-NN does not work for an asymmetric 
distance measure or might perform very badly for dissimilarities which 
strongly do not obey the triangle inequality. In such cases, a better 
generalization can be achieved by a classifier built in a dissimilarity space.  

It might be better to include all the distances to determine 
the good representatives in a training process. For instance, a linear 
classifier in a dissimilarity space is a weighted linear combination of 
dissimilarities between an object and the representation set $R$. 
The weights are optimized on the training set, and large weights 
(in magnitude) emphasize objects which play an essential role during 
discrimination. By doing this, a more global classifier can be built, 
by which its sensitivity to noisy representative examples is reduced. 
Our experience confirms that a linear or quadratic classifier can 
often generalize better than the $k$-NN rule, especially for a small 
representation set $R$ (see \citep{Pekalska2}).  

A linear classifier built on the dissimilarity kernel $D$ is given by:
\begin{equation}
f (D (x,R)) = \sum_{j=1}^n w_j\, d(x, x_j) + w_0 = \bm{w}^T \, D(x, R) + w_0
\label{LC}
\end{equation}
and a linear classifier on the dissimilarity kernel $D^{(2)}$ is 
expressed as:
\begin{equation}
f (D^{(2)} (x,R)) = \sum_{j=1}^n w_j\, d^2 (x, x_j) + w_0 =
\bm{w}^T \, D^{(2)}(x, R) + w_0.
\label{LC2}
\end{equation}
There is an essential difference between those two separating hyperplanes. 
Because of the linear relation between the square distance 
matrix $D^{(2)}$ and the matrix of inner products, the classifier (\ref{LC2}) 
built on $D^{(2)}$ is, in fact, a quadratic classifier in the 
underlying space.
The linear classifier (\ref{LC}), constructed on dissimilarity kernel $D$,
is, in general, non-quadratic, nonlinear classifier in the underlying 
feature space, since there is a nonlinear relation between the inner 
products and the kernel $D$.


\subsection{The Fisher Linear Discriminant (FLD)}\label{sec:FLD}
In general, any traditional classifier operating on feature spaces 
might be used on dissimilarity kernels. Since most of the commonly-used 
dissimilarity measures are based on sums of differences between measurements, 
the choice of the linear Bayesian classifier, assuming normal densities, 
is a natural consequence of the central limit theorem applied to them. 
In principle, the quadratic Bayesian classifier could be even better, but 
it requires much more training objects for estimation of the class 
covariance matrices. It is known that for $2$-class problems with equally 
probable classes, this classifier is equivalent to the Fisher 
linear discriminant (FLD), obtained by maximizing the so-called Fisher 
criterion, i.e. the ratio of between-scatter to within-scatter  
\citep{Duda,Fukunaga}. Therefore, we refer to this separating hyperplane 
as to the FLD.

The FLD can be now constructed on $D$ in the form of (\ref{LC}). 
As a starting point, the representation set $R$, consisting of $n$ objects,
and the training set $T$ coincide, i.e. $T=R$. In such a case, we have to 
deal with a small sample size classification problem, i.e. with $n$ vectors
$D(x_i,R)$ in $n$ dimensions. 
We have recently proposed to use a reduced representation set $R'$ of the 
size $r<n$, which is especially of importance when the distances are 
expensive to compute \citep{Pekalska2}. 
The easiest way to choose $R'$ is by a random selection. Also $r$ objects 
can be chosen such that the minimum distance between any of them is 
maximized. Another possibility is based on a greedy approach. 
Starting from a randomly chosen object, in an iterative procedure,
an object is added, which is the most dissimilar to all objects already 
chosen. It might be done globally or for each class separately. In case 
the most dissimilar objects are chosen they are likely to be outliers or 
positioned on the boundary. There exist many other ways to determine
a reduced representation set $R'$, but they will not be investigated here.

After $R'$ has been established, a linear classifier is built on $D(T,R')$. 
For a new object, only distances to the set $R'$ have to be computed. 
For a $2$-class problem, with the prior probabilities 
$p_{\omega_1}$ and $p_{\omega_2}$, the linear Bayesian classifier 
\citep{Fukunaga} is constructed on the dissimilarity kernel $D$ as follows:
\begin{equation*}
f (D(x, R')) = \bm{w}^T \, D(x, R')^T + w_0,
\end{equation*}
where 
$\bm{w} = C_W^{-1} \, (\bm{m}_1 - \bm{m}_2)$ and 
$w_0 = -\frac{1}{2} \, (\bm{m}_1 + \bm{m}_2)^T \, C_W^{-1} \, (\bm{m}_1 - \bm{m}_2)
+ log (\frac{p_{\omega_1}}{p_{\omega_2}})$. $C_W$ is the pooled within-class
covariance matrix and $\bm{m}_i$, $i=1,2$, stands for the class mean in the 
dissimilarity space $D(\cdot,R)$. 

When $T$ and $R'$ are identical (or $R'=R$), $C_W$ becomes singular, and the 
linear classifier cannot be built. Regularization can be used instead, 
yielding an approximated covariance matrix $(1-\lambda) \,C_W + \lambda \, I$. 
A classifier based on the regularized covariance matrix is called 
the regularized linear discriminant (RLD).


\subsection{Linear programming (LP) machines}\label{sec:LP}
With a properly defined objective function and constraints for a dissimilarity 
kernel, a separating hyperplane can be obtained by solving a linear programming 
problem. Assume a $2$-class problem, with classes $\omega_1$ and $\omega_2$ 
of the cardinality $n_1$ and $n_2$, respectively, and the labels 
$y_i = \{1, -1\}$. Let $f$ be the separating hyperplane, built for the 
complete representation set $R$ (i.e. $R=T$) as given by (\ref{LC}).
Then, the simple optimization problem, minimizing the number of 
misclassification errors $\xi_j$, can be defined as:
\begin{equation}
\begin{array}{lll}
\mbox{Minimize}  & \sum_{i=1}^n s_i \, \xi_i &\\[0.3cm]
\mbox{s.t.} & y_i \, f(D(\bm{x}_i,R)) \ge 1 - \xi_i, & i=1,\ldots,n \\[0.2cm]
            & \xi_i \ge 0& 
\label{LP}
\end{array}
\end{equation}
where $s_i=1$ for all $i$, or $s_i = \frac{1}{n_i}$ depending on a class label 
$y_i$. It is argued by \citet{Bennett} that the latter formulation can guarantee 
a nontrivial solution even when mean vectors of two classes happen to be the same.
Such a problem can be solved by the standard optimization methods, such as  simplex 
algorithm or interior-point methods. Since no other constraints are included, 
the hyperplane is constructed in an $n$-dimensional dissimilarity space $D(\cdot,R)$. 
It is possible, however, to impose a sparse 
solution, by minimizing the norm $L_1$ of the weight vector $\bm{w}$ of the 
hyperplane (\ref{LC}) i.e. $||w_j||_1 = \sum_{j=1}^n |w_j|$. In order to formulate 
such a minimization task in terms of a LP problem (i.e. to eliminate the absolute 
value $|w_j|$ from the objective function), $w_j$ is expressed by non-negative 
variables $\alpha_j$ and $\beta_j$ as $w_j = \alpha_j - \beta_j$. The minimization 
problem becomes thereby:
\begin{equation}
\begin{array}{lll}
\mbox{Minimize}  & \sum_{i=1}^n (\alpha_i+\beta_i) + C \,\sum_{i=1}^n {\xi_i} &\\[0.3cm]
\mbox{s.t.} & y_i \, f(D(\bm{x}_i,R)) \ge 1 - \xi_i, & i=1,\ldots,n \\[0.2cm]
            & \alpha_i, \,\beta_i, \,\xi_i \ge 0&
\label{LP1}
\end{array}
\end{equation}
A more flexible formulation of a classification problem has been 
proposed by \citep{Graepel1}. Now, the problem is to minimize 
$||\bm{w}||_1 - \mu\,\rho$, which basically means that the margin $\rho$
becomes a variable of the optimization problem. Note that $\rho = 1$ for
the formulation (\ref{LP1}). By imposing $||\bm{w}||_1$ to be constant, 
the modified version of (\ref{LP1}) can be introduced as:
\begin{equation}
\begin{array}{lll}
\mbox{Minimize}  &  \frac{1}{n} \sum_{i=1}^n {\xi_i} - \mu \, \rho &\\[0.3cm]
\mbox{s.t.} &  \sum_{i=1}^n (\alpha_i+\beta_i) = 1 &\\[0.2cm]
            & y_i \, f(D(\bm{x}_i,R)) \ge 1 - \xi_i, &  i=1,\ldots,n \\[0.2cm]
            & \xi_i, \,\alpha_i, \,\beta_i, \,\rho \ge 0&
\label{LPmar}
\end{array}
\end{equation}
In this approach, a sparse solution $\bm{w}$ is obtained, which means
that important objects are selected (by nonzero weights) from the original
representation set $R$ ($R = T$), resulting in a reduced set $R'$. This solution
is a similar adaptation of the SVC for feature 
representations defined with the LP machines \citep{Smola, Scholkopf3}.
It is of essential importance since for novel objects only dissimilarities 
to the objects from $R'$ have to be computed.
 

\subsection{Support vector classifier}
The support vector classifier can be built on the dissimilarity kernel. 
Recall, that for a chosen representation set $R=\{p_1,p_2,\ldots,p_{n}\}$, 
a dissimilarity mapping $D(z,R): {\cal F} \rightarrow {\cal R}^n$ is defined as
$D(z,R) = [d(z,p_1)~ d(z,p_2)~ \ldots~ d(z,p_n)]^T$.
Since the linear decision function in such a space is given by (\ref{LC}),
the support vector kernel $K$ consists of the elements:
$$
  K_{ij} = \langle D(\bm{x}_i, R), D(\bm{x}_j, R) \rangle,
$$ 
where $\langle\cdot, \cdot\rangle$ stands here for the Euclidean inner product.
Therefore, in the formulation of the linear support vector classifier
(see Section \ref{sec:svc}), the matrix $K$ is given by $K = D \, D^T$, 
which is positive definite and can be used for the construction of the SVC. 
In such a case, however, a sparse solution, provided by the method, 
is obtained in the whole dissimilarity space $D(\cdot,R)$. It means that for 
evaluation of new objects, still the dissimilarities to all training objects 
should be computed, because our SVC is in the form of (\ref{LC}).


\section{Experiments with the NIST digits}\label{sec:exp}
%
All experiments in this section are based on a $2$-class problem, i.e.
the recognition of digits 3 and 8 from a NIST database~\citep{nist}. 
In total, the data consist of $2000$, equally sampled, $128 \times 128$ 
binary images. Since no features are given to describe the images, they are 
represented by dissimilarity kernels.

The aim of this paper is to illustrate the potentials of dissimilarity
kernels as well as studying the behavior of simple classifiers constructed
by using such kernels. Therefore, we are not going to use perfect 
distance kernels (constructed for the specific problem such that the $1$-NN
gives a perfect result). Instead, our goal is to investigate what can be done
in other cases. Of course, the ultimate goal is to build well-discriminating
 dissimilarities, however, it might be not possible as it is not always possible 
 to find well-discriminating features for traditional classification tasks.
The first series of experiments focuses on the comparison between two 
distance kernel approaches to classification: via the linear embedding 
or via dissimilarity spaces. 

Two different dissimilarity measures are considered here: Euclidean on blurred 
images and modified-Hausdorff \citep{Dubuisson} on digit contours. 
They were chosen to illustrate the behavior of our kernel approaches with 
respect to the distance properties. While the first distance is a metric, 
the second one is not, as it violates the triangle inequality.

The modified-Hausdorff distance, applied on digit contours,
is used here, since it is found useful for template matching purposes 
\citep{Dubuisson}. It measures the difference 
between two sets $A = \{a_1, \ldots, a_g \}$ and $B = \{b_1, \ldots, b_h \}$
and is defined as:
$$
D_{MH}(A,B) = \max \{h_M\,(A,B), h_M\,(B,A)\} \mbox{~~and~~} 
h_M\,(A,B) = \frac{1}{g} \sum_{a \in A} \min_{b \in B} ||a - b||.
$$
To calculate such a distance between two images, first the digits are detected
and then the dissimilarity is computed with respect to the boxes surrounding 
them, which means that it is shift-invariant.

\begin{figure}[Ht]
  \begin{center}
    \subfigure[No degradation; $P=0$]
    {\includegraphics[width=4.4cm,height=5.2cm]{digits0b}}~
    \subfigure[$P=0.2$]
    {\includegraphics[width=4.4cm,height=5.2cm]{digits02b}}~
    \subfigure[$P=0.6$]
    {\includegraphics[width=4.4cm,height=5.2cm]{digits06b}}
    \\[-4mm]
    \caption{Degradation of images of handwritten digits 3 and 8. Subplot a
      shows examples of 16$\times$16 binary digits used in our experiments. 
      Images of degraded digits are presented in subplots b and c. 
      The level of degradation is governed by the probability $P$ that 
      an individual pixel is set to background.}
    \label{fig:images}
  \end{center}

  \vspace*{-10mm}
\end{figure}

To find the second dissimilarity measure, images are first blurred with 
the Gaussian function with the standard deviation of $8$ pixels (which is 
similar to the distance transform of an image). The motivation for such 
preprocessing is to avoid sharp edges of the digits. We, thus, make our
technique robust to small tilts or variable thickness.
Then, the Euclidean distance is computed between the blurred versions. 

Experiments are performed $20$ times and the results are averaged. 
In each single experiment, the data is randomly split into a training set 
and a testing set. The testing set consists always of $1000$ samples 
(i.e. $500$ per class). A number of different training sets are chosen, with 
the sizes varying from $10$ to $250$ objects per class. Our goal is to 
investigate three different directions: the behavior of the (generalized) 
nearest mean classifier, the behavior of the Fisher linear discriminant
\citep{Fukunaga} and the use of representation sets for classifiers constructed 
directly on the dissimilarity kernels.

In the second type of experiments, our goal is to study the usefulness
of dissimilarity kernels for data with missing values. We think that 
dissimilarity kernels are designed for tackling such types 
of problems. In order to study the performance of classifiers as a function 
of the number of missing values, we simulated such data by randomly 
corrupting the images of 3 and 8. 
The level of degradation is governed by a probability $P$ that a particular 
image pixel is set to the background. We used four different degradation 
levels in our experiments, i.e. $P=\{0.0, 0.2, 0.4, 0.6\}$. 
Although this type of a procedure can be seen as introducing extra noise, 
it still simulates the missing values problem, since we assumed that 
the corrupted pixels had originally unknown values, and because the images 
are binary, the values can be just assigned to the background pixels.
To simplify our experiment (to make the computation of distances less expensive), 
we used a resampled dataset, for which the original binary images were rescaled 
to a $16\times 16$ raster; see Figure~\ref{fig:images} for an illustration.
\begin{floatingfigure}[r]{4.2cm}
  \vbox{\hbox{\hspace{2.8cm}object $s$}

  \begin{tabular}[h]{cc|cc}
    &  &   1  &  0  \\\cline{2-4}
    \multirow{2}{1.35cm}{object $r$} & 1 & $a$  & $b$ \\
    &  0 & $c$  & $d$ \\
  \end{tabular}
  \caption{Similarity for binary images.} 
  \label{bintable}
  }

\vspace*{-3mm}
\end{floatingfigure}


\begin{table}[t]
\begin{center}
\caption{Dissimilarity coefficients for the binary images r and s.}

\vspace*{3mm}
\begin{tabular}[h]{|l||cccc|} \hline
Similarity measure & Metric & Euclidean & Similarity & Dissimilarity \\
\hline
Jaccard                     & Yes & Yes & 
$\displaystyle S_{rs} = { a\over a+b+c }$ & $D_{rs}=\sqrt{1-S_{rs}}$\\[0.4cm]
Simple matching             & Yes & No  & 
$\displaystyle S_{rs} = { a + d \over a+b+c+d }$ & $D_{rs}=1-S_{rs}$\\[0.4cm]
Yule                        & No  & No  & 
$\displaystyle S_{rs} = { ad - bc \over ad + bc }$ & $D_{rs}=1-S_{rs}$\\
\hline
\end{tabular}
\label{tab:sims}
\end{center}

\vspace*{-7mm}
\end{table}

The usual way to compute dissimilarities on the binary data is to construct 
a similarity measure first, and then, transform it to a corresponding distance. 
Similarity measures for binary objects $s$ and $r$ are often based on variables 
$a,b,c$ and $d$ reflecting the number of elementary matches between objects 
(see Figure~\ref{bintable}).
For instance, the variable $a$ reflects the number of cases where both objects 
scored $1$. For our experiments, three measures were chosen, yielding different 
properties: Jaccard, simple matching and Yule similarities \citep{Cox,Gower};
see summary in Table~\ref{tab:sims}. Jaccard measure is of interest, because it 
is an overlap ratio excluding all non-occurrences, and, thereby, disregarding the 
information on matches between background pixels. On the contrary, the simple 
matching measure, describing the proportion of matches to the total number of 
instances (pixels), might not be useful in our case. This comes from the fact 
that it counts matches between the background pixels, where some of them are 
considered as unknown. The Yule similarity is of different type, i.e. 
a cross-product ratio, measuring association between pixels as a predictability 
of one, given another.

Our aim is to compare the behavior of the above presented methods on these 
dissimilarities with respect to different degradation.
For each level of degradation, complete $2000 \times 2000$ distance 
matrices were computed. We assume that both, the training and the testing 
data, are degraded in a similar way. A training set of a fixed size of 
$250$ samples per class was randomly chosen. All the classifiers 
are tested on the independent testing set containing $500$ samples per class. 
The testing procedure is repeated again $20$ times and the results are averaged.


\setlength{\figsize}{0.45\textwidth}
\begin{figure}[htbp]
  \subfigure[Nearest mean classifiers constructed in the embedded spaces]{
        \includegraphics[width=\the\figsize,height=0.77\figsize]{blurrednmc}
        \includegraphics[width=\the\figsize,height=0.77\figsize]{modhausnmc}
        }
  \vspace{-14pt}
  \subfigure[The FLD and the SVC functions built in the embedded spaces]{
    \hbox{
      \includegraphics[width=\the\figsize,height=0.77\figsize]{blurredfisher}
      \includegraphics[width=\the\figsize,height=0.77\figsize]{modhausfisher}
      }
    }
  \vspace{-14pt}
  \subfigure[Classifiers built on dissimilarity kernels, using the notion of the representation set]{
    \hbox{
      \includegraphics[width=\the\figsize,height=0.77\figsize]{blurredrset}
      \includegraphics[width=\the\figsize,height=0.77\figsize]{modhausrset}
      }
    }
  \caption{Comparison of different classification methods with blurred Euclidean 
  (left column) and modified-Hausdorff (right column) dissimilarity kernels.}
\label{fig:series1}

\vspace*{-8mm}
\end{figure}


\subsection{Results and discussion on kernel approaches to dissimilarities}
%
For both dissimilarity kernels, modified-Hausdorff and blurred Euclidean, 
two experimental directions are considered. In the first direction, 
linear classifiers are built in the embedded space, while in the second 
one, linear classifiers were constructed in dissimilarity spaces, i.e. 
built directly on dissimilarity kernels. The results are presented with reference
to the 1-NN rule, as the one, commonly applied to dissimilarity representations
(for both distance kernels, the 1-NN rule is mostly the best among $k$-NN rules
for larger $k$).

Concerning embedded spaces, three decision functions were used: the nearest 
mean classifier, the Fisher linear discriminant and the support vector classifier.
They are applied in a Euclidean space, based on {\it only\/} 
positive eigenvalues, in a pseudo-Euclidean space and in a Euclidean space, 
obtained from the embedding of the `corrected' dissimilarity kernel; 
see Formula (\ref{Dnewis}).
Note that the last classifier makes only sense for a pseudo-Euclidean space. 
For Euclidean dissimilarity measures, Euclidean and pseudo-Euclidean spaces coincide.
Because, in such a case, both embeddings are the same, results in 
the pseudo-Euclidean case are not reported.

Figure \ref{fig:series1}a presents the performance of all nearest mean 
classifiers with reference to the 1-NN rule. For both distance measures, 
it can be observed that such classifiers are not complex enough for this problem
to give a good performance. They perform much worse than the $1$-NN rule,
especially, in the case of the modified-Hausdorff kernel. This suggests that
for this dissimilarity, the classes revealed in an embedded space are not
of a Gaussian shape. Concerning the pseudo-Euclidean space,
the GNMC reaches a higher accuracy than the NMC, however, the NMC built
in a Euclidean space, based on only positive eigenvalues behaves
nearly the same as the GNMC. This indicates that the `negative' directions 
in a pseudo-Euclidean space are not of much significance.
An interesting observation is also that the nearest mean classifiers 
do not nearly improve their accuracy with the increase of training size 
larger than $50$ objects per class. This indicates that this number of objects
represents well the classes in the underlying feature space.

\setlength{\figsize}{0.4\textwidth}
\begin{figure}[htbp]
\centering
  \includegraphics[width=\the\figsize,height=0.62\figsize]{eig_b}~
  \includegraphics[width=\the\figsize,height=0.62\figsize]{eig_mh}
  \caption{The eigenvalues of the matrix $B$ of inner products 
   for the blurred Euclidean (left) and modified-Hausdorff (right) 
   dissimilarity kernels of the size $200 \times 200$.}
\label{fig2}

\vspace*{-4mm}
\end{figure}


Next, we studied the behavior of the FLD and the SVC constructed 
in the embedded spaces. Here, the FLD and SVC are built 
on the reduced representations, as described in Section \ref{sec:reduced}. 
Such a reduced representation can be achieved as the solution of the PCA 
projection. 
The dimensionality was assigned to a value of $10\%$ or $25\%$
of the training size for the blurred Euclidean and the modified-Hausdorff 
distances, respectively. Such choices were motivated by visual exploration 
of the eigenvalues of the inner product matrix $B$ and selecting the 
number of values significantly large in magnitude (see Formulas 
(\ref{Bis0})--(\ref{Xis})) for the embedding purpose and compare Figure 
\ref{fig2}. The results are presented in Figure \ref{fig:series1}b, for 
which the following conclusions can be drawn:
\begin{smallitem2}

\item For the blurred Euclidean dissimilarity kernel, both the SVC
and the FLD, constructed in the embedded space, outperform the $1$-NN rule.
The good performance, especially of the FLD, indicates a Gaussian 
description of the classes (although overlapping) as revealed by
original distances.

\item For the modified-Hausdorff dissimilarity kernel, a linear classifier
in the embedded space seems not to be complex enough, in a similar way as
the NMC. The $1$-NN method mostly outperforms all other decision rules
considered. Note also that for the training size larger than $100$ objects, 
the $1$-NN rule here gives lower errors than the $1$-NN rule on blurred 
Euclidean kernel.
\end{smallitem2}

Concerning the behavior of different variants of the FLD and the SVC
for the modified-Hausdorff distances, the pseudo-Euclidean
embedding allows for reaching a higher accuracy than either a Euclidean
part described only by the positive eigenvalues of $B$ (see Formulas
\ref{Bis0}--\ref{Xis}) or a Euclidean embedding of
the enlarged dissimilarity kernel (see Formula \ref{Dnewis}).
This still shows that using the 'negative' directions makes sense.
For the embedding into Euclidean spaces, the FLD performs
worse than the SVC, but in a pseudo-Euclidean space, they behave
similarly.  


In these experiments, the results depended on the training size while the 
dimensionality was fixed. An additional experiment was also performed, where 
the training size was fixed to $100$ objects per class, and the constructed 
dimensionality was varied. As before, the test size was set to $500$ samples 
per class. The goal is to illustrate the performance of the FLD as a function 
of retrieved dimensionality. Analyzing Figure \ref{fig:dimens}, the following 
observations can be made:

\begin{smallitem2}

\item For the blurred Euclidean dissimilarity kernel, the FLD in an
embedded (Euclidean) space outperforms significantly the FLD built on the
dissimilarity kernel $D$ for dimensionalities smaller than $50$. The best
result is reached for the dimensionality in the range $20-30$. For larger
dimensionalities, the error increases. Judging from Figure \ref{fig2},
left, we observe that there are at most $20$ essential eigenvalues, 
since the remaining ones seem not to have a significant contribution
for the FLD in the embedded space.
The classifier behavior for those two error curves indicates that 
the classes are again linearly separable.

\item For the modified-Hausdorff dissimilarity kernel, the FLD in an embedded 
pseudo-Euclidean space performs mostly better than the FLD in an embedded 
Euclidean space. Here, however, the FLD on distance kernel $D$ reaches the 
highest accuracy. This confirms that the classes can be separated 
best in a nonlinear way, since a linear classifier built on the dissimilarity 
kernel can be interpreted as a nonlinear decision function in an underlying
feature space.
\end{smallitem2}


\setlength{\figsize}{0.43\textwidth}
\begin{figure}[ht]
  \centering
  \includegraphics[width=\the\figsize,height=0.77\figsize]{blurredfeat}~~
  \includegraphics[width=\the\figsize,height=0.77\figsize]{modhausfeat}
  \caption{The performance of the FLD for blurred Euclidean (left) 
    and modified-Hausdorff (right) dissimilarity kernels as a function of
    retrieved dimensionality for the fixed training size of $100$ objects per class.}
  \label{fig:dimens}

\vspace*{-5mm}
\end{figure}

In the second direction of our experiments, we focussed on building linear
classifiers in the dissimilarity space (i.e. directly on the kernel $D$)
and on the importance of the representation set $R$. Until now, $R$ was 
identical to the training set $T$. In such a case, distances to all 
representation (training) objects have to be computed for an evaluation of 
new objects. Therefore, the reduction of the size of $R$ is essential from
the computational point of view.

In Sections \ref{sec:FLD} and \ref{sec:LP}, two different approaches were
proposed. The first possibility is to reduce $R$ by selecting objects according 
to a pre-specified criterion. Here, for simplicity, the random selection of 
objects is used, since we found that this often gives reasonable 
results \citep{Duin1, Pekalska2}. 
In the experiments, the FLD is built on the kernel $D(T,R)$, where $R$ is 
always randomly reduced to $25\%$ of the training size. We also studied the 
RLD with the fixed regularization parameter of $0.01$, constructed on the complete 
dissimilarity kernel $D(R,R)$.

Another possibility is to enforce a sparse solution in a dissimilarity 
space in a way similar to solving the feature selection problem. This can 
be achieved, e.g. via linear programming schema \citep{Bradley} and
the classification problem can be formulated in terms of the sparse LP, as 
given by (\ref{LPmar}). For comparison, also an LP non-sparse solution was 
found, as described by (\ref{LP}). As a reference to all methods used, 
the $1$-NN rule is also given.

Experimental results are presented in the Figure \ref{fig:series1}c.
In case of the blurred Euclidean distances, the reduced representation
sets lead to lower accuracy in comparison to the classifiers based on
identical $T$ and $R$. The loss in accuracy is on average $1.4\%$, however
it is gained by using only $25\%$ of the training samples to build $R$
for the FLD, and by $\approx 15\%-20\%$ of the training samples in case 
of the LP formulation. The representation set $R$ is chosen in a different 
manner for these classifiers.
In the case of the FLD, $R$ is arbitrarily chosen beforehand. For the LP, 
it is, on the contrary, provided as a sparse solution of the corresponding 
optimization task. The classification error for this method is not provided
in the case of $250$ training objects in Figure \ref{fig:series1}c since
the minimization problem was infeasible and the solution could not be found.

Most linear classifiers reach higher accuracy than the $1$-NN rule 
(only the FLD with a randomly chosen $R$ shows somewhat worse performance), 
thereby, it confirms that building linear classifiers on dissimilarity 
kernels may be beneficial.

For the modified-Hausdorff distances, most linear classifiers perform worse 
than in case of the blurred distances. The worst results are reported for 
the FLD based on the reduced, randomly selected $R$. The $1$-NN rule is
the best for training sizes larger than $100$ objects per class. 
For smaller training sizes, it is outperformed, especially by the RLD and 
the non-sparse LP.

In summary, from this series of experiments, we can conclude  
that the blurred Euclidean dissimilarity kernel describes more compact 
and bounded classes than the modified-Hausdorff kernel. Also, most of 
the linear classifiers, both in the underlying space or on the dissimilarity
kernel, perform worse in the latter case. 
This is probably caused by the fact, that the modified-Hausdorff distance, 
as applied here, does not offer rotation invariance, needed for this problem.
Blurred Euclidean distance is, on the other hand, partially robust against
tilting due to the initial blurring step.
The $1$-NN method for larger training sizes is of the best 
classifiers for the modified-Hausdorff case. Only the RLD and the non-sparse
LP perform about the same, and for smaller training sizes, outperform
the $1$-NN rule. 


\subsection{Results and discussion on kernel approaches to dissimilarities 
for missing values problem}

\setlength{\figsize}{0.45\textwidth}
\begin{figure}[htbp]
  \subfigure[Nearest mean classifiers constructed in the embedded spaces]{
        \includegraphics[width=\the\figsize,height=0.77\figsize]{jaccardnmc}
        \includegraphics[width=\the\figsize,height=0.77\figsize]{yulenmc}
        }
  \hspace{-20pt}
  \subfigure[The FLD built in the embedded spaces]{
    \hbox{
      \includegraphics[width=\the\figsize,height=0.77\figsize]{jaccardfisher}
      \includegraphics[width=\the\figsize,height=0.77\figsize]{yulefisher}
      }
    }
  \hspace{-20pt}
  \subfigure[Classifiers built on dissimilarity kernels, using the notion of the representation set]{
    \hbox{
      \includegraphics[width=\the\figsize,height=0.77\figsize]{jaccardrset}
      \includegraphics[width=\the\figsize,height=0.77\figsize]{yulerset}
      }
    }
  \caption{Comparison of different classification methods with the 
    Jaccard (left column) and Yule (right column) dissimilarity kernels.
    The training size was fixed to $100$ objects per class. }
  \label{fig:degradation}

\vspace*{-5mm}
\end{figure}

In the second track of our experiments, we studied the applicability of 
dissimilarity kernels to data with missing values. In order to keep a similar 
line of reasoning, as in the previous part, we simulated the missing values 
problem by setting some pixel values to the background. Both the training and 
testing sets have now fixed sizes and the varying quantity is the level of 
image degradation which was described in Section \ref{sec:exp}.

\begin{figure}[htbp]
  \centering
  \subfigure[Simple matching]{\includegraphics[width=\the\figsize,height=0.8\figsize]{simplerset}}
  \subfigure[Yule distance]{\includegraphics[width=\the\figsize,height=0.8\figsize]{yulerset}}
  \\[-3mm]
  \caption{Comparison of methods using the notion of the representation set on 
  the simple matching and Yule dissimilarities.}
  \label{fig:simple&yule}

\vspace*{-5mm}
\end{figure}

Figure~\ref{fig:degradation} presents the generalization error rate as 
a function of increasing data degradation for Jaccard and Yule measures
and three discussed groups of kernel methods: the variants of the NMC, 
the FLD and methods based on the representation set $R$.
Since the results of the Yule measure and the simple matching dissimilarities are similar,
we have chosen for a presentation the Yule distance kernel as 
the distance which is both non-Euclidean and non-metric. As an indication,
the most distinct results are presented in Figure \ref{fig:simple&yule}.
The following conclusions can be made in overall:
\begin{smallitem2}
\item The nearest mean classifiers are surprisingly robust against the
  increase of data deterioration, see Figure \ref{fig:degradation}a.
  
\item The FLD functions are less robust to data degradation, but they
  achieve higher accuracy than any of the NMC.

\item Classifiers constructed directly on the dissimilarity kernel $D$, as
  observed in Figure~\ref{fig:degradation}c and \ref{fig:simple&yule}a,
  generally outperform the $1$-NN rule. They are also more robust against the
  degradation of images. Comparing all results, the $1$-NN rule deteriorates
  the most.
  
\item Surprisingly, the performance of classifiers does not deteriorate
  much for increasing image degradation. In the best cases, the error
  between $8\%-10\%$ is achieved for the degradation of $P=0.6$ (see Figure
  \ref{fig:degradation} and Figure \ref{fig:images} for reference).  
  It suggests that simple dissimilarity kernels based on
  binary images are highly robust against missing information.
  
\item Besides the NMC variants, the accuracy of classifiers on simple 
  binary dissimilarity kernels, applied to non-degenerated data ($P=0$), 
  are often $1.5-2$ times higher than of the same classifiers on
  more complicated dissimilarity measures such as the blurred Euclidean
  and modified-Hausdorff (compare Figures 
  \ref{fig:degradation}--\ref{fig:simple&yule} for the training size of $100$ objects
  with Figure \ref{fig:series1}).
  One of the reasons is that the binary distances are applied on the rescaled 
  images, where the digits are more `aligned' than in case of the original
  $128 \times 128$ binary images. 
  
\item Surprisingly, for the Yule dissimilarity kernel, the performance of the
  sparse LP is better than for the non-sparse LP, when the level of
  degradation is higher (see Figure \ref{fig:degradation}c, right).
  
\item For the Jaccard dissimilarity kernel, both the RLD and the LP classifiers
  applied to the degraded images at the level $P=0.4$ perform still
  comparably to the best results on the blurred Euclidean or modified
  Hausdorff dissimilarities.
  
\item On average, Jaccard dissimilarity kernel allows for a better
  separability of classes than the Yule distance (being observed in Figure
  \ref{fig:degradation}).
  
\item In case of the Jaccard distance measure, two methods give 
  identical solutions: the RLD and the LP formulation (both with $R=T$).
  These methods also achieve the best overall classification results
  compared to all other classifiers and distance measures we have
  investigated (error $1.7\%$ for not degraded Jaccard distance; compare
  Figure \ref{fig:degradation} for $P=0$ with Figure \ref{fig:series1} 
  for the training size of $100$ objects).
\end{smallitem2}

It is interesting, that a simple distance measure (like Jaccard), operating on 
binary images of digits, outperforms the modified-Hausdorff dissimilarity, 
computed on the contours. A possible explanation is that in the latter case, the
modified-Hausdorff was not implemented to be robust against tilting or stretching,
while in the first case, by rescaling images to a lower raster, the digits
became aligned. The binary dissimilarity measures are also considerably robust 
against the data degradation. For example, both the RLD and the LP classifiers, 
based on images with a high level of degradation, achieve the performance 
comparable to the best classifiers built on intact blurred Euclidean or 
modified-Hausdorff dissimilarities.

In summary, we have to conclude that the presented dissimilarity measures
(especially the Jaccard one) are in general robust against degradation.
Among all classifiers considered, the $1$-NN rule shows the highest sensitivity 
to data degradation, which is to be expected due to its sensitivity to noisy
examples. Most classifiers outperform the $1$-NN rule, even the variants of 
the NMC for $p=0.6$ and the FLD based on a random $R$ for Yule and simple 
matching measures. This again proves our point, that often more can be achieved 
by constructing more complex classifiers making use of dissimilarity kernels 
than by the $1$-NN rule.
The most robust are the variants of the NMC, especially the GNMC and the 
NMC in an embedded Euclidean space determined by the positive eigenvalues, 
however, the overall performance is not the highest. The best results 
are achieved by the RLD and the~LP.


\section{Experiments with the Kimia dataset}\label{sec:exp2}
In this experiment, we want to show how the $k$-NN result, based on noisy 
prototypes, can be outperformed by other types of more advanced classifiers 
constructed via embeddings or in the dissimilarity spaces. For this purpose, 
we use the Kimia dataset \citep{Kimia} with binary shapes. \citet{Kimia} 
proposed to compare shapes by computing an edit-distance between 
their medial axes. In this type of a distance, all the efforts are put 
into its construction. They report that such a dissimilarity measure
allows for a very good performance of the $k$-NN. Here, we used the same 
dataset but with a `less-perfect' dissimilarity measure, i.e. the 
modified-Hausdorff distance. As studied here, it is not robust 
against rotation.

\begin{figure}[ht]
  \begin{center}
    \includegraphics[width=6.8cm]{img3_8_10_14_17_18}~~~
    \includegraphics[width=6.8cm]{img4_5_11_13_15_17}
    \\[-3mm]
    \caption{Group A, left, and group B, right, of the binary images, used 
    in our experiments.}
    \label{fig:kimia}
  \end{center}

  \vspace*{-8mm}
\end{figure}


The Kimia dataset consists of $18$ categories, where each category contains 
$12$ binary images of shapes. For our experiment, to make it both illustrative 
and feasible, we created two groups of images, each of $6$ categories, as
presented in Figure \ref{fig:kimia}. Notice that the images are of different 
sizes, which supports our idea of not starting from feature space representations.
Since the classes have a small number of objects, the classification is 
done by using the leave-one-out procedure. It means that the dissimilarity
measure is built each time on the representation set consisting of $71$ objects, 
and one object, each time different, is used for testing. The procedure is 
repeated $72$ times and the results are averaged. 


\begin{table}[t]
\begin{center}
\caption{The leave-one-out results for two groups of binary images. The 
  average number of support vectors or the size of the representation set $R$ 
  is given in parenthesis; when not given, the complete dissimilarity kernel 
  is used. Note that a misclassification of one
  object gives an error of $1.39\%$. This means that the error of $6.94\%$
  or $12.5\%$ stands for $5$ or $9$ wrongly assigned objects, respectively.}

\vspace*{3mm}

\begin{tabular}{|l||l|l|}\hline
Classifier & Group A & Group B  \\ \hline
%NMC on $D$                         & 27.78      &   12.50       \\
%NMC on $D^{(2)}$                   & 36.11      &      15.28    \\
NMC; Eucl. emb.                     & 18.06      &      12.50    \\
GNMC                                & 16.67      &      15.28    \\
NMC; ps-Eucl. emb.                  & 20.83      &      16.67    \\
NMC; Eucl. emb; $D^{(2)}+c  $ & 27.78    &      12.50    \\
\hline
%FLD on $D$; PCA-reduced      & ~9.72      &      ~8.33    \\
%FLD on $D^{(2)}$; PCA-reduced& 12.50      &      ~9.72    \\
FLD; Eucl. emb.             & 18.06      &      18.06    \\
FLD; ps-Eucl. emb.          & 13.89      &      ~8.33    \\
SVC; Eucl. emb.             & 12.50 (46) &      16.67 (57)\\
SVC; ps-Eucl. emb.          & 11.11 (54) &      ~9.72 (45)\\
\hline
FLD on D; R by $D_{max}$ crit & ~6.94 (18)&     ~8.33 (18)\\
FLD on D; random $R$        & 12.50 (32) &      ~6.94 (18)\\
RLD; $R=T$                  & ~8.33      &      ~6.94 \\
LP; sparse, choosing $R$    & ~6.94 (55) &      ~5.56 (44)\\
LP; $R=T$                   & 11.11      &      12.50 \\
SVC on $D$; $R=T$           & ~8.33 (42) &      ~5.56 (32)\\
\hline  
$1$-NN                      & 12.50      &      ~6.94    \\
$3$-NN                      & 13.89      &      11.11    \\
$5$-NN                      & 16.67      &      11.11    \\
$7$-NN                      & 16.67      &      16.67    \\
\hline
\end{tabular}
\label{tab:kimia}

\vspace*{-6mm}
\end{center}
\end{table}

The results of our classifiers are presented in Table \ref{tab:kimia}. 
In case of the support vector classifiers, a number of different values
for the parameters $C$ or $\mu$ (see Formulas \ref{SVC} and \ref{LPmar})
have been considered and the best results are reported. 

Group A is an example of a problem, where the $k$-NN error is relatively 
high, i.e. the smallest $1$-NN error equals $12.5\%$, which means that 
$9$ objects are wrongly classified. More advanced classifiers can, however,
outperform the $k$-NN method. The best result of $6.94\%$ is given here by 
the FLD constructed on the dissimilarity kernel with the representation 
set $R$ of $18$ objects, reduced by randomly initialized $D_{max}$ 
criterion operating on a dissimilarity kernel.
The criterion 
looks for such $k$ (here $k=18$) objects that the distances between them are maximized.
The same performance of $6.94\%$ is achieved by the LP classifier with 
the optimized $R$ of $55$ objects. Good results are also provided by the RLD 
and the SVC built in the dissimilarity space. 

Group B is an example, where the performance of the $1$-NN rule is significantly 
better than in the previous case, but it deteriorates much when more neighbors are 
taken into account. In such a case, when the $1$-NN generalizes much better than 
the $k$-NN for larger $k$, other type of classifiers may have difficulties to 
outperform this method. Still, some improvement can be gained, which is
reached here by the SVC and the sparse formulation of the LP classifier in the 
dissimilarity space. Notice, however, that many classifiers (besides the NMCs)
perform better than $3$-NN, $5$-NN, or $7$-NN, which proves our point, that
they profit from a larger number of objects and are able to reduce the noise
in local (of a few neighbors) neighborhoods. 

For both groups, the linear classifiers built in the embedded spaces perform 
much worse than the linear classifiers on the dissimilarity kernel. 
This, however, does not exclude yet embedded spaces as being not useful in 
this case; it shows only that a linear classifier might be not the best 
solution (remember that a linear classifier on the dissimilarity kernel 
is a nonlinear classifier in the underlying space), while e.g. polynomial one
could be. 

Studying the behavior of classifiers, a few important conclusions can be made:
\begin{smallitem2}
\item
The nearest mean classifiers generalize relatively poorly, which suggest 
that the classes are rather elongated. This can be also concluded, observing
the images in Figure \ref{fig:kimia}, where some classes include rotated variants 
of the shapes. Since our dissimilarity is not robust against rotation, those
shapes differ significantly within one class.    

\item 
The FLD and the SVC perform better in a pseudo-Euclidean space than
in a Euclidean space, which suggests that employing this space is profitable.
  
\item 
The LP classifier with a sparse formulation generalizes well.
However, since it is trained always one class against all other classes,
in the end, the optimized representation set becomes large. The FLD, with
the reduced representation set, gives also good results, and it is based 
on a much smaller number of objects, which decreases the computational 
load for evaluation of novel objects.
\end{smallitem2}

In summary, for imperfect dissimilarity measures, the $k$-NN method can be
outperformed by more sophisticated classifiers, taking into account a number
of representative objects, thus becoming more global in their decisions.


\section{Conclusions}\label{sec:concl}
Objects are usually described by features. In this paper, an alternative 
approach is discussed, where objects are represented by their mutual proximities. 
This allows us to extend the notion of a kernel as a general proximity relation.
In this paper, dissimilarity kernels are discussed, as started also by \citet{Scholkopf}. 
Such a framework is advantageous in situations, where the feature representation 
is not straightforward, for example when recognizing objects by their contours, 
for highly-dimensional data such as hyperspectral images or when dealing with 
missing information.  
Two different ways for building classifiers using dissimilarity kernels are 
discussed. In the first approach, dissimilarities are isometrically embedded 
in a (pseudo-)Euclidean space and the classification is performed there. 
In the second approach, classifiers are built directly on distance kernels.

We conducted a number of experiments for both approaches.
This is illustrated by a~$2$-class digit recognition problem, for which 
various dissimilarity measures were investigated, and two $6$-class shape
classification problems based on the modified-Hausdorff distance. The 
experiments include also randomly distorted digit images, simulating the problem 
of missing values. 

If a dissimilarity kernel separates the classes well, the $k$-NN method
is supposed to achieve a good performance. However, in this paper, we focus 
on imperfect dissimilarity measures. Our main point is to show that 
in such cases, other, more advanced classifiers will usually outperform 
the $k$-NN rule. 

An example when the performance of the $1$-NN rule may be improved by other 
techniques is the blurred Euclidean distance computed on the NIST digits. 
Nearly all presented methods offer a higher accuracy. The best classifiers 
are a non-sparse LP formulation on a dissimilarity kernel and the FLD in the 
embedded space. Two methods do not give satisfactory results. These are the
nearest mean classifiers as they do not fit the structure of the data 
well and the FLD classifier based on randomly chosen $R$. We can conclude 
that there is still space to improve the performance of the $1$-NN rule 
by more global approaches, when the distance measure is not sophisticated 
enough for the given problem.

Another example is the modified-Hausdorff distance. The measure is built 
specifically for template matching purposes and it separates both digit 
classes well. It is almost impossible to improve the performance of 
the~$1$-NN method any further by other, more advanced approaches. 
Only for smaller training sizes, the accuracy of the non-sparse LP and 
the RLD is higher.
Also, the absolute performance of the $1$-NN rule is better for the 
modified-Hausdorff kernel than for the blurred Euclidean kernel for larger 
training sets. It is, however, interesting that, the non-sparse LP
classifier built on a blurred Euclidean kernel outperforms the $1$-NN method
on the modified-Hausdorff . This shows that a more advanced classifier,
constructed on a simpler dissimilarity representation, may be better than 
the simple $1$-NN rule applied on a more sophisticated measure.

This can best be illustrated by analyzing the digit recognition problem 
on degradated data. The performance of the $1$-NN rule seriously 
deteriorates with the increase of data degradation. At the same time,
dissimilarities, based on less information, become simpler. In such 
a case, more global techniques work better than the $1$-NN method.
The simpler the dissimilarity, the larger the potential for improvement
of the global classifiers.
Even nearest mean classifiers in the embedded space or the FLD based on 
randomly chosen $R$ provide better results for high data degradation.

Our experiments with the Kimia dataset confirms that in cases
where the $1$-NN rule gives a high error, other, more complex techniques,
built on dissimilarity kernels, achieve better results.

Concerning pseudo-Euclidean spaces, we confirm the conclusion of 
\citet{Goldfarb2} that reduction of dimensionality is essential 
to diminish the influence of noise. The use of a pseudo-Euclidean space
is often more advantageous than addressing the problem in a Euclidean space,
which can be observed for both the digit and the Kimia datasets.

Building classifiers in an embedded space might be preferable to constructing
them on a dissimilarity kernel, when a training set, consisting of
representative objects, is sufficiently large and the retrieved dimensionality 
is small. This assures then a good generalization for novel objects, which
can be observed in case of the blurred Euclidean distances, where the classes 
are well linearly separable for at least $100$ training objects.

Linear classifiers operating on complete dissimilarity kernels 
often achieve a higher accuracy than methods with the reduced representation 
sets. The latter offer, however, sparse solutions, advantageous from the 
computational point of view. They can also achieve very good results, like in 
the case of the Kimia dataset. Linear classifiers, like the FLD or 
the SVC, generalize well in dissimilarity spaces, and often provide
a better performance than the $1$-NN rule.

In summary, our main conclusion is that for dissimilarities, which do not 
separate the classes well enough, the $1$-NN, or, in general, the $k$-NN method 
can be outperformed by more advanced classifiers built either in an embedded 
space or on distance kernels. 


\acks{This work was partly supported by the Dutch Organization for 
Scientific Research (NWO). Thanks go to David Tax for valuable 
discussions and his art of `cutting' things. The authors are
also grateful for both encouragement and criticism of the 
anonymous reviewers. }


\appendix

\section{On pseudo-Euclidean spaces}
This appendix is meant to provide additional information
and make the picture more complete with respect to the 
embedding problem and pseudo-Euclidean spaces.
Most of the derivations presented here are based on the following
references \citep{Borg,Greub,Goldfarb1,Goldfarb2,Gower,Gower2}.


\subsection{A pseudo-Euclidean space}\label{sec:app_ps}
A pseudo-Euclidean space $\cal{E}$ is a real linear vector space 
equipped with a non-degenerate, indefinite, symmetric bilinear 
function $\langle\cdot,\cdot\rangle$, called inner product \citep{Greub}. 
Such a space can be interpreted as composed from two Euclidean subspaces, 
i.e. $\cal{E_+}$ of the dimensionality $p$ and $\cal{E_-}$ of the 
dimensionality $q$ such that $\cal{E} = \cal{E_+} \oplus \cal{E_-}$ 
$(\cal{E_+} \cap \cal{E_-} = \{\bm{0}\})$ and the inner product
is positive definite on $\cal{E_+}$ and negative definite on 
$\cal{E_-}$. $\cal{E}$ is therefore characterized by the signature 
$(p,q)$ \citep{Goldfarb1}. A basis 
$\{\bm{e}_1, \bm{e}_2, \ldots, \bm{e}_{p+q} \}$ is called orthonormal 
in a pseudo-Euclidean space if
$$
 \langle\bm{e}_i, \bm{e}_i\rangle \, = 
\begin{cases}
+1, & i=1,\ldots,p\\
-1, & i=p+1,\ldots,q\\ 
\end{cases}
$$
and $\langle\bm{e}_i, \bm{e}_j\rangle\, = 0$ for $i \neq j$. It is known 
that such a basis can always be constructed \citep{Greub}.
Therefore, if the orthonormal basis is chosen, the inner product between 
two vectors $\bm{x}$ and $\bm{y}$ is given by:
\begin{equation}
\langle\bm{x}, \bm{y}\rangle \, = 
\sum_{i=1}^p x_i y_i \, - \sum_{j=p+1}^{p+q} x_j y_j =
\bm{x}^T \, M \, \bm{y}, \quad
M = 
\begin{bmatrix}
I_{p \times p} & 0 \\
0 & - I_{q \times q} \\
\end{bmatrix}
\label{phiis}
\end{equation}
Note that the `norm' of a non-zero vector $\bm{x}$ in a pseudo-Euclidean space
is defined as:
$$
||\bm{x}||^2 = \langle\bm{x}, \bm{x}\rangle \, = \bm{x}^T \, M \, \bm{x},
$$
which can be positive, negative or zero. In the latter case, a non-zero vector 
$\bm{x}$ is called a light vector \citep{Greub}. 
The squared distance can be then defined in a similar way as
in a Euclidean space using the notion of inner product:
\begin{equation}
d^2 (\bm{x}, \bm{y}) = ||\bm{x} - \bm{y}||^2 = 
\,\langle\bm{x} - \bm{y},\, \bm{x} - \bm{y}\rangle
= (\bm{x} - \bm{y})^T \, M \, (\bm{x} - \bm{y})
\label{dis}
\end{equation}
which can be, in general, positive, negative or zero. Note that, even when 
$\bm{x}$ and $\bm{y}$ are different, their distance might still equal to zero.

Alternatively, a pseudo-Euclidean space can be expressed as: 
${\cal E} = {\cal{R}}^{(p,q)} = {\cal{R}}^p \times i\,{\cal{R}}^q$, where 
$i = \sqrt{-1}$, which justifies again Formulas (\ref{phiis}) and (\ref{dis}) 
and allows one to express the square distance as:
\begin{equation*}
d^2 (\bm{x}, \bm{y}) = 
d^2_{{\cal R}^p} (\bm{x}, \bm{y}) - d^2_{{\cal R}^q} (\bm{x}, \bm{y}),
\end{equation*}
where the distances on the right side are Euclidean.
Note also, that a Euclidean space ${\cal R}^p = {\cal R}^{(p,0)}$, 
is a special case of the pseudo-Euclidean space.


\subsection{Relation between distances and inner products}\label{sec:app_dist_prods}
Assume $n$ vectors given in a Euclidean space: 
$\{\bm{x}_1,\bm{x}_2,\ldots,\bm{x}_{n}\}$. Based on the definition of the Euclidean 
distance and the properties of the inner product \citep{Greub}, one may write:
\begin{equation}
\begin{array}{rcl}
d^2 (\bm{x}_i, \bm{x}_j) &=& \langle\bm{x}_i - \bm{x}_j,\bm{x}_i - \bm{x}_j\rangle \\[0.2cm]
&=& \langle\bm{x}_i, \bm{x}_i \rangle + \langle\bm{x}_j, \bm{x}_j \rangle - 
2 \langle\bm{x}_i, \bm{x}_j \rangle\\[0.2cm]
&=& d^2(\bm{x}_i,\bm{0}) + d^2(\bm{x}_j,\bm{0}) - 2 \, \langle\bm{x}_i, \bm{x}_j\rangle,
\label{exp0}
\end{array}
\end{equation}
where $\bm{0}$ is the origin in this space. Based on the above dependence,
the inner product $\langle\bm{x}_i, \bm{x}_j \rangle$ can be then expressed as:
\begin{equation}
 \langle\bm{x}_i, \bm{x}_j \rangle \, = {\displaystyle -\frac{1}{2} \left [ 
 d^2 (\bm{x}_i, \bm{x}_j) - d^2(\bm{x}_i,\bm{0}) - d^2(\bm{x}_j,\bm{0}) \right ]}.
\label{exp1}
\end{equation}
Making use of well known properties of the inner products and Formula
(\ref{exp1}), the distance $d^2(\bm{x}_i,\overline{\bm{x}})$ can be 
expressed only in terms of distances as follows \citep{Goldfarb2}:
\begin{equation}
\begin{array}{rcl}
 d^2 (\bm{x}_i, \overline{\bm{x}}) &= & ||\bm{x}_i - \overline{\bm{x}}||^2 = 
\langle \bm{x}_i - \overline{\bm{x}},\, \bm{x}_i - \overline{\bm{x}}\rangle \\[3mm]
&=& \langle \bm{x}_i,\, \bm{x}_i\rangle  + \langle\overline{\bm{x}},\, 
\overline{\bm{x}}\rangle
- 2 \, \langle\bm{x}_i ,\,\overline{\bm{x}}\rangle  \\[3mm]
&=&  d^2(\bm{x}_i,\, \bm{0}) + \frac{1}{n^2} \, \sum_{p=1}^n \sum_{s=1}^n
\langle\bm{x}_p,\, \bm{x}_s\rangle - \frac{2}{n} 
\, \sum_{s=1}^n \langle\bm{x}_i,\, \bm{x}_s \rangle \\[0.3cm]
&=&  d^2(\bm{x}_i,\, \bm{0}) + \frac{1}{2n^2} \, \sum_{p,s=1}^n \,
\left [ d^2(\bm{x}_p,\bm{0}) +  d^2(\bm{x}_s,\bm{0}) - 
d^2 (\bm{x}_p,\, \bm{x}_s) \right ]  \\[3mm]
& & -\frac{1}{n} \sum_{s=1}^n\, \left [ d^2(\bm{x}_i,\,\bm{0}) +  
d^2(\bm{x}_s,\,\bm{0}) - d^2 (\bm{x}_i,\, \bm{x}_s) \right ] \\[0.2cm]
&=& \frac{1}{n}\, \sum_{s=1}^n d^2 (\bm{x}_i,\, \bm{x}_s) - \frac{1}{2n^2} \, 
\sum_{p,s=1}^n \, d^2 (\bm{x}_p,\, \bm{x}_s) \\[3mm]
&=& d^2_{i\,.} - \frac{1}{2} d^2_{..}, \label{exp2}
\end{array}
\end{equation}
where $d^2_{i \,.}$ stands for the mean computed on each row of the kernel
$D^{(2)}$ and $d^2_{..}$ is the overall mean value.

Without loss of generality, let us further assume that the mean vector 
coincides with the origin, i.e. $\overline{\bm{x}} = \bm{0}$. It implies 
that $d^2 (\bm{x}_i, \bm{0}) = d^2 (\bm{x}_i, \overline{\bm{x}})$, which
can, therefore, be used in Formula (\ref{exp1}) and by plugging (\ref{exp2}), 
one obtains:
\begin{equation}
\langle\bm{x}_i, \bm{x}_j \rangle \, = 
-\frac{1}{2} \left [
d^2 (\bm{x}_i, \bm{x}_j) - \frac{1}{n} \sum_{s=1}^n d^2 (\bm{x}_i,\bm{x}_s) 
- \frac{1}{n} \sum_{s=1}^n d^2 (\bm{x}_s,\bm{x}_j) + 
\frac{1}{n^2} \sum_{p,s=1}^n d^2 (\bm{x}_p, \bm{x}_s) 
\right ] 
\label{exp3}
\end{equation}
Let $X \in {{\cal{R}}^{n \times k}}$ be a representation of all vectors and let 
$B$ be the matrix of inner products, i.e. $B =  X\, X^T$, such that 
$b_{ij} = \,\langle\bm{x}_i, \bm{x}_j\rangle$. Then, Formula (\ref{exp3}) 
becomes:
$$
b_{ij} = -\frac{1}{2}\,(d_{ij}^2 - d^2_{i\,.} - d^2_{.\,j} + d^2_{..})
$$
Let $D^{(2)}$ be an $n \times n$ square Euclidean distance matrix.
Substituting $d^2_{i\,.} = \frac{1}{n}\, D^{(2)}(\bm{x}_i, \cdot) \, \bm{1}^T$,
$d^2_{.\, j} = \frac{1}{n}\, D^{(2)}(\cdot, \bm{x}_j)^T \, \bm{1}^T$
and $d^2_{. .} = \frac{1}{n^2}\,\bm{1}\, D^{(2)}\, \bm{1}^T$, and after 
some mathematical operations, $B$ is found to be:
\begin{equation}
 B = -\frac{1}{2} J \, D^{(2)} \, J, \quad \mbox{where}\quad
 J = I - \frac{1}{n} \, \bm{1}^T \bm{1}
\label{Bis}
\end{equation}
Using Formula (\ref{exp0}), alternatively, $D^{(2)}$ can be expressed as:
\begin{equation}
D^{(2)} = \bm{b}\, \bm{1}^T + \bm{1}\, \bm{b}^T - 2\, B,
\label{D2is}
\end{equation}
where $\bm{b}$ is a vector of the diagonal elements of the matrix $B$.

Note that precisely the same reasoning follows for a pseudo-Euclidean 
space ${\cal{R}}^{(p,q)}$, since the relation between square
distances and inner products in both spaces is the same.
Therefore, Formula (\ref{exp3}) holds for a pseudo-Euclidean space with
$\langle\cdot,\cdot\rangle$ being the inner product defined 
by (\ref{phiis}). As a consequence, $B$, the matrix of inner 
products, should now be presented with respect to the 
pseudo-Euclidean space, i.e.:
$$
B = X\, \begin{bmatrix} M & \\ & 0 \end{bmatrix} \, X^T.
$$
This leads to the fact that Formulas (\ref{Bis}) and (\ref{D2is}) 
remain true for a pseudo-Euclidean space, as well.


\subsection{Adding new points to an embedded space}\label{sec:app_adding}
Let $X \in {\cal{R}}^{n \times k}$ be a configuration found in 
a (pseudo-)Euclidean space (for a pseudo-Euclidean space $k\!=\!p\!+\!q$) 
and $D_n^{(2)} \in {\cal{R}}^{s \times n}$ be the square 
distance matrix between $s$ new objects $v_1,v_2,\ldots,v_s$ 
and $n$ objects of the representation set $R$. Let $\bm{y}_i$ be the 
vector representation of a novel object projected onto space ${\cal{R}}^k$.
It follows from Formula (\ref{exp1}) that the inner product between 
a new vector and the original points is given by:
$$
 \langle\bm{y}_i, \bm{x}_j \rangle \, = {\displaystyle -\frac{1}{2} \left [ 
 d_n^2 (\bm{y}_i, \bm{x}_j) - d_n^2(\bm{y}_i,\bm{0}) - d^2(\bm{x}_j,\bm{0}) \right ]}.
$$
Making use of Formula (\ref{exp2}) and the fact that the centroid coincides 
with the origin, the inner product becomes:
\begin{equation}
\begin{array}{rcl}
\langle\bm{y}_i, \bm{x}_j \rangle \!&\! = \!&\!
-\frac{1}{2} \left [
d_n^2 (\bm{y}_i, \bm{x}_j) - \frac{1}{n} \sum_{s=1}^n d_n^2(\bm{y}_i,\bm{x}_s) 
- \frac{1}{n} \sum_{s=1}^n d^2(\bm{x}_s,\bm{x}_j) + 
\frac{1}{n^2} \sum_{p,s=1}^n d^2(\bm{x}_p, \bm{x}_s) 
\right ] \\[3mm]
\!&\! =\! &\! -\frac{1}{2}\,(d_n^2(\bm{y}_i, \bm{x}_j) - (d^2_n)_{i\,.} - 
d^2_{.\, j} + d^2_{..}\,),
\label{Xnew}
\end{array}
\end{equation}
where $(d^2_n)_{i\,.}$ stands for the mean computed on each row of the 
dissimilarity matrix $D^{(2)}_n$.
Let $B_n \in {\cal{R}}^{s \times n}$ be the matrix of inner products 
between $s$ new vectors and $n$ original ones. Using elementary 
matrix operations, (\ref{Xnew}) can be rewritten as:
\begin{equation*}
 B_n = -\frac{1}{2} ( D_n^{(2)} \, J - U \, D^{(2)} \, J ), 
\end{equation*}
where $J = I - \frac{1}{s} \, \bm{1}^T \bm{1}$ and 
$U = \frac{1}{s} \, \bm{1}^T \bm{1} \in {\cal{R}}^{s \times n}$.
The configuration $X_n$ is found as a solution to $X_n \, M \, X^T = B_n$, 
yielding
$$
 X_n = B_n \, X |\Lambda|^{-1} M.
$$


\subsection{Generalized variability}\label{sec:app_variability}
In an embedded Euclidean or a pseudo-Euclidean space $\cal{E}$, the generalized 
variability of the configuration $X$ with the vectors
$\{\bm{x}_1,\bm{x}_2,\ldots,\bm{x}_{n}\}$ can be defined as the trace of 
the covariance matrix of $X$, i.e. the sum of variances, as follows:
\begin{equation}
V_d (X) = \frac{1}{n} \, \sum_{j=1}^n ||\bm{x}_j||^2 - ||\overline{\bm{x}}||^2
\label{Vdcov}
\end{equation}
Note that $||.||$ is a norm defined in a space ${\cal E}$. 
Since $X$ reflects the same geometry as imposed by the distance matrix 
$D^{(2)}(R,R)$, based on the representation set $R = \{p_1,p_2,\ldots,p_{n}\}$, 
it is possible to express $V_d (X)$ only in terms of those distances as follows:
\begin{equation}
V_d (R)  = \frac{1}{2n^2} \, \sum_{j=1}^n \, \sum_{k=1}^n \, d^2 (p_j,p_k)
\label{Vdis2}
\end{equation}
We will show now the equivalence of (\ref{Vdis2}) and (\ref{Vdcov}) by using 
equivalent transformations. Making use of Formula (\ref{D2is}) and
the facts that $\bm{1}^T \bm{1} = n$, and 
$\bm{1}^T \bm{b} = \mbox{tr} (B) = \sum_{j=1}^n || \bm{x}_j ||^2 $, one gets:
\begin{equation*}
\begin{array}{rcl}
V_d (R) 
& = & \frac{1}{2n^2} \, \sum_{j=1}^n \, \sum_{k=1}^n \, d^2 (p_j,p_k)\\[0.25cm]
& = & \frac{1}{2n^2} \, \bm{1}^T  D^{(2)} \, \bm{1} \\[0.25cm]
& = & \frac{1}{2n^2} \, [ \bm{1}^T  \bm{b}\, \bm{1}^T \bm{1} + 
      \bm{1}^T \bm{1}\, \bm{b}^T \bm{1} - 2\, \bm{1}^T B \, \bm{1} ]\\[0.25cm]
& = & \frac{1}{2n} \,\bm{1}^T \bm{b}  + \frac{1}{2n} \bm{b}^T \bm{1}  
      - \frac{1}{n^2} \,\bm{1}^T  B \, \bm{1} \\[0.25cm]
& = & \frac{1}{n} \,\bm{1}^T  \bm{b} -  \frac{1}{n^2} \bm{1}^T \, B \, \bm{1} \\[0.25cm]
& = & \frac{1}{n} \,\sum_{j=1}^n || \bm{x}_j ||^2  - ||\overline{\bm{x}}||^2,
\end{array}
\end{equation*}
since $\bm{1}^T \, B \, \bm{1} = \bm{1}^T \, X \, M \, X^T \, \bm{1} = 
\overline{\bm{x}} \, M \, \overline{\bm{x}} = ||\overline{\bm{x}}||^2$,
for $M$ being the matrix of inner products in the space $\cal{E}$ ($M=I$
in case of a Euclidean space).
Note that by following the reasoning given in (\ref{exp2}) for an arbitrary 
point $\bm{z}_{x}$, the equivalence between (\ref{fomega}) and the second 
equation of (\ref{vd_omega}) for the proximity function can be proven. 


\bibliography{ref}


\end{document}