\begin{thebibliography}{48}
% Fallback for \natexlab: when the control sequence is still undefined
% (\csname natexlab\endcsname then compares equal to \relax), define it to
% return its argument unchanged.
\expandafter\ifx\csname natexlab\endcsname\relax\def\natexlab#1{#1}\fi
% Fallback for \url, guarded the same way: typeset the argument in \tt.
\expandafter\ifx\csname url\endcsname\relax
  \def\url#1{{\tt #1}}\fi


\bibitem[M.~A.~Aizerman and Rozonoer(1964)]{abr-tfpf-64}
M.~A.~Aizerman, E.~M.~Braverman and L.~I. Rozonoer.
\newblock Theoretical foundations of the potential function method in pattern
  recognition learning.
\newblock {\em Automation and Remote Control}, 25:\penalty0 821--837, 1964.

\bibitem[E.~L.~Allwein and Singer(2000)]{ass-unify-00}
E.~L.~Allwein, R.~E.~Schapire and Y.~Singer.
\newblock Reducing multiclass to binary: a unifying approach for margin
  classifiers.
\newblock {\em Journal of Machine Learning Research}, 1:\penalty0 113--141,
  2000.

\bibitem[Angluin(1988)]{A88}
D.~Angluin.
\newblock Queries and concept learning.
\newblock {\em Machine Learning}, 2\penalty0 (4):\penalty0 319--342, 1988.

\bibitem[Anthony and Bartlett(1999)]{ab-book-99}
M.~Anthony and P.~Bartlett.
\newblock {\em Neural Network Learning: Theoretical Foundations}.
\newblock Cambridge University Press, 1999.

\bibitem[P.~Auer and Gentile()]{acg-ascolla-00}
P.~Auer, N.~Cesa~Bianchi and C.~Gentile.
\newblock Adaptive and self-confident on-line learning algorithms.
\newblock {\em Journal of Computer and System Sciences},
\newblock forthcoming. Preliminary version in 
{\em Proceedings of 13th Annu. Conf. on Comput. Learning Theory},
pages 107--117, Palo Alto, CA, 2000.

\bibitem[C.~Blake and Merz(1998)]{uci}
C.~Blake, E.~Keogh and C.~Merz.
\newblock {UCI} repository of machine learning databases.
\newblock Technical report, Dept. of Information and Computer Sciences,
  University of California, Irvine, 1998.
\newblock http://www.ics.uci.edu/$\sim$mlearn/MLRepository.html.

\bibitem[Block(1962)]{b-pmbf-62}
H.~D. Block.
\newblock The perceptron: A model for brain functioning.
\newblock {\em Reviews of Modern Physics}, 34:\penalty0 123--135, 1962.

\bibitem[Cortes and Vapnik(1995)]{cv-svn-95}
C.~Cortes and V.~Vapnik.
\newblock Support-vector networks.
\newblock {\em Machine Learning}, 20\penalty0 (3):\penalty0 273--297, 1995.

\bibitem[Cristianini and Shawe-Taylor(2000)]{cst-book-00}
N.~Cristianini and J.~Shawe-Taylor.
\newblock {\em An introduction to support vector machines and other
  kernel-based learning methods}.
\newblock Cambridge University Press, 2000.

\bibitem[I.~Dagan and Roth(1997)]{dkr-emnlp-97}
I.~Dagan, Y.~Karov and D.~Roth.
\newblock Mistake-driven learning in text categorization.
\newblock In {\em Proceedings of 2nd Conference on Empirical Methods in Natural
  Language Processing}, pages 55--63. Association for Computational
  Linguistics, Somerset, New Jersey, 1997.


\bibitem[Dietterich and Bakiri(1995)]{db-ecoc-95}
T.~G. Dietterich and G.~Bakiri.
\newblock Solving multiclass learning problems via error-correcting output
  codes.
\newblock {\em Journal of Artificial Intelligence Research}, 2:\penalty0
  263--286, 1995.

\bibitem[Duda and Hart(1973)]{dh-pcsa-73}
R.~O. Duda and P.~E. Hart.
\newblock {\em Pattern Classification and Scene Analysis}.
\newblock Wiley, 1973.

\bibitem[Freund and Schapire(1999)]{fs-lmcupa-99}
Y.~Freund and R.~E. Schapire.
\newblock Large margin classification using the perceptron algorithm.
\newblock {\em Machine Learning}, 37\penalty0 (3):\penalty0
  277--296, 1999.

\bibitem[T.-T.~Friess and Campbell(1998)]{fcc-98}
T.-T.~Friess, N.~Cristianini and C.~Campbell.
\newblock The kernel adatron algorithm: a fast and simple learning procedure
  for support vector machines.
\newblock In {\em Proceedings of 15th International Conference on Machine
  Learning}, pages 188--196. Morgan Kaufmann, San Mateo, CA, 1998.

\bibitem[Gentile and Littlestone(1999)]{gl-99}
C.~Gentile and N.~Littlestone.
\newblock The robustness of the $p$-norm algorithms.
\newblock In {\em Proc. 12th Annu. Conf. on Comput. Learning Theory}, pages
  1--11. ACM, 1999.

\bibitem[Gentile and Warmuth(2001)]{gw-lhlam-98}
C.~Gentile and M.~K. Warmuth.
\newblock Linear hinge loss and average margin.
\newblock Unpublished, 2001. Preliminary version in {\em Proc. Advances in Neural
  Information Processing Systems 11}, pages 225--231, MIT Press, Cambridge, MA,
  1999.

\bibitem[Gentile(2001)]{g-alma-00}
C.~Gentile.
\newblock A new approximate maximal margin classification algorithm.
\newblock In T.~K. Leen, T.~G. Dietterich, and V.~Tresp, editors, {\em Advances
  in Neural Information Processing Systems 13}, pages 500--506. MIT Press,
  Cambridge, MA, 2001.

\bibitem[Golding and Roth(1996)]{gr-spell-96}
A.~R. Golding and D.~Roth.
\newblock Applying {W}innow to context-sensitive spelling correction.
\newblock In {\em Proceedings of 13th International Conference on Machine
  Learning}, pages 182--190. Morgan Kaufmann, San Mateo, CA, 1996.

\bibitem[A.~J.~Grove and Schuurmans(2001)]{gls-gcrfldu-97}
A.~J.~Grove, N.~Littlestone and D.~Schuurmans.
\newblock General convergence results for linear discriminant updates.
\newblock {\em Machine Learning}, 43\penalty0 (3):\penalty0
  173--210, 2001.

\bibitem[Helmbold and Warmuth(1995)]{hw-wl-95}
D.~P. Helmbold and M.~K. Warmuth.
\newblock On weak learning.
\newblock {\em Journal of Computer and System Sciences}, 50\penalty0
  (3):\penalty0 551--573, 1995.

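% NOTE(review): the label year (1998) and the year printed in the entry (2000)
% disagree -- confirm against the source before changing either.
\bibitem[Joachims(1998)]{j-svml-98}
T.~Joachims.
\newblock Making large-scale support vector machines learning practical.
\newblock In B.~Sch{\"o}lkopf, C.~Burges and A.~Smola, editors,
{\em Advances in kernel methods: support vector machines}.
MIT Press, Cambridge, MA, 2000.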

\bibitem[S.~S.~Keerthi and Murthy(1999)]{ksbm-99}
S.~S.~Keerthi, S.~K. Shevade, C.~Bhattacharyya and K.~R.~K. Murthy.
\newblock A fast iterative nearest point algorithm for support vector machine
  classifier design.
\newblock Technical report, Indian Institute of Science, ISL-99-03, 1999.

\bibitem[Kivinen and Warmuth(1997)]{kw-avegulp-97}
J.~Kivinen and M.~K. Warmuth.
\newblock Additive versus exponentiated gradient updates for linear prediction.
\newblock {\em Information and Computation}, 132\penalty0 (1):\penalty0 1--64,
  1997.

\bibitem[Kivinen and Warmuth()]{kw-rlbmrp-97}
J.~Kivinen and M.~K. Warmuth.
\newblock Relative loss bounds for multidimensional regression problems.
\newblock {\em Machine Learning}.
\newblock Forthcoming. Preliminary version in {\em Proc. Advances in Neural
  Information Processing Systems 10}, pages 287--293, MIT Press, Cambridge, MA,
  1998.

\bibitem[J.~Kivinen and Auer(1997)]{kwa-paw-98}
J.~Kivinen, M.~K.~Warmuth and P.~Auer.
\newblock The perceptron algorithm vs. {W}innow: linear vs. logarithmic mistake
  bounds when few input variables are relevant.
\newblock {\em Artificial Intelligence}, 97:\penalty0 325--343, 1997.

\bibitem[Kowalczyk(1999)]{k-lmp-98}
A.~Kowalczyk.
\newblock {\em Maximal margin perceptron}.
\newblock MIT Press, Cambridge, MA, 1999.

\bibitem[Y.~Le~Cun and Jackel(1989)]{lc-bahzcr-89}
Y.~Le~Cun, B.~Boser, J.~S. Denker, D.~Henderson, R.~E. Howard, W.~Hubbard
and L.~J. Jackel.
\newblock Backpropagation applied to handwritten zip code recognition.
\newblock {\em Neural Computation}, 1:\penalty0 541--551, 1989.

\bibitem[Y.~Le~Cun and Vapnik(1995)]{lc-mnist-95}
Y.~Le~Cun, L.~J. Jackel, L.~Bottou, A.~Brunot, C.~Cortes,
J.~S. Denker, H.~Drucker, I.~Guyon, U.~M{\"u}ller,
E.~S{\"a}ckinger, P.~Simard and V.~Vapnik.
\newblock Comparison of learning algorithms for handwritten digit recognition.
\newblock In {\em Proceedings of ICANN 1995}, pages 53--60, 1995.

\bibitem[Li(2000)]{l-thesis-00}
Y.~Li.
\newblock {\em From support vector machines to large margin classifiers}.
\newblock PhD thesis, School of Computing, National University of
  Singapore, 2000.

\bibitem[Li and Long()]{ll-romma-99}
Y.~Li and P.~Long.
\newblock The relaxed online maximum margin algorithm.
\newblock {\em Machine Learning}.
\newblock Forthcoming. Preliminary version in S.~A. Solla, T.~K. Leen and
  K.-R. M{\"u}ller, editors, {\em Advances in Neural Information Processing
  Systems 12}, pages 498--504, MIT Press, Cambridge, MA, 2000.

\bibitem[Littlestone(1988)]{l-liaanla-88}
N.~Littlestone.
\newblock Learning quickly when irrelevant attributes abound: {A} new
  linear-threshold algorithm.
\newblock {\em Machine Learning}, 2:\penalty0 285--318, 1988.

\bibitem[Littlestone and Warmuth(1994)]{lw-wma-94}
N.~Littlestone and M.~K. Warmuth.
\newblock The weighted majority algorithm.
\newblock {\em Information and Computation}, 108\penalty0 (2):\penalty0
  212--261, 1994.

\bibitem[Mangasarian(1968)]{m-msmps-68}
O.~Mangasarian.
\newblock Multi-surface method of pattern separation.
\newblock {\em IEEE Trans. on Information Theory}, 14:\penalty0 801--807, 1968.

\bibitem[Mangasarian(1997)]{m-mpdm-97}
O.~Mangasarian.
\newblock Mathematical programming in data mining.
\newblock {\em Data Mining and Knowledge Discovery}, 1\penalty0 (2):\penalty0
  183--201, 1997.

\bibitem[P.~Nachbar and Strobl(1993)]{nns-93}
P.~Nachbar, J.~A.~Nossek and J.~Strobl.
\newblock The generalized adatron algorithm.
\newblock In {\em Proceedings of 1993 IEEE ISCAS}, pages 2152--2155, 1993.

\bibitem[Novikov(1962)]{n-cpp-62}
A.~B.~J. Novikov.
\newblock On convergence proofs on perceptrons.
\newblock In {\em Proc. of the Symposium on the Mathematical Theory of
  Automata, vol. XII}, pages 615--622, 1962.

\bibitem[E.~Osuna and Girosi(1997)]{ofg-97}
E.~Osuna, R.~Freund and F.~Girosi.
\newblock An improved training algorithm for support vector machines.
\newblock In {\em Proceedings of IEEE NNSP'97}, 1997.

\bibitem[Platt(1998)]{p-98}
J.~C. Platt.
\newblock {\em Fast training of support vector machines using sequential
  minimal optimization}.
\newblock MIT Press, Cambridge, MA, 1998.

\bibitem[J.~C.~Platt and Shawe-Taylor(1999)]{pcst-dags-99}
J.~C.~Platt, N.~Cristianini and J.~Shawe-Taylor.
\newblock Large margin {DAGs} for multiclass classification.
\newblock In S.~A. Solla, T.~K. Leen and K.-R. M{\"u}ller, editors, {\em Advances in
  Neural Information Processing Systems 12}, pages 547--553. MIT Press,
  Cambridge, MA, 1999.

\bibitem[Rosenblatt(1962)]{r-pn-62}
F.~Rosenblatt.
\newblock {\em Principles of neurodynamics: Perceptrons and the theory of brain
  mechanisms}.
\newblock Spartan Books, Washington, D.C., 1962.


\bibitem[R.~E.~Schapire and Lee(1998)]{sfbs-bm-98}
R.~E.~Schapire, P.~Bartlett, Y.~Freund and W.~S. Lee.
\newblock Boosting the margin: A new explanation for the effectiveness of
  voting methods.
\newblock {\em The Annals of Statistics}, 26\penalty0 (5):\penalty0 1651--1686,
  1998.

\bibitem[B.~Scholkopf and Smola(1999)]{smbkmrs-ivf-99}
B.~Sch{\"o}lkopf, S.~Mika, C.~J.~C. Burges, P.~Knirsch, K.-R. M{\"u}ller, G.~R{\"a}tsch
and A.~Smola.
\newblock Input space vs. feature space in kernel-based methods.
\newblock {\em IEEE Trans. on Neural Networks}, 10\penalty0 (5):\penalty0
  1000--1017, 1999.

\bibitem[B.~Scholkopf and Vapnik(1997)]{ssbgnpv-svmcomp-97}
B.~Sch{\"o}lkopf, K.~Sung, C.~J.~C. Burges, F.~Girosi, P.~Niyogi, T.~Poggio and
  V.~Vapnik.
\newblock Comparing support vector machines with {G}aussian kernels to radial
  basis function classifiers.
\newblock {\em IEEE Trans. on Signal Processing}, 45:\penalty0 2758--2765,
  1997.

\bibitem[Schwenk and Bengio(2000)]{sb-bnn-00}
H.~Schwenk and Y.~Bengio.
\newblock Boosting neural networks.
\newblock {\em Neural Computation}, 12\penalty0 (8):\penalty0 1869--1887, 2000.

\bibitem[Servedio(1999)]{s-pluw-99}
R.~A. Servedio.
\newblock On {PAC} learning using {W}innow, perceptron, and a perceptron-like
  algorithm.
\newblock In {\em Proc. 12th Annu. Conf. on Comput. Learning Theory}, pages
  296--307. ACM, 1999.

\bibitem[J.~Shawe-Taylor and Anthony(1998)]{stbwa-srmoddh-98}
J.~Shawe-Taylor, P.~Bartlett, R.~Williamson  and M.~Anthony.
\newblock Structural risk minimization over data-dependent hierarchies.
\newblock {\em IEEE Trans. on Information Theory}, 44\penalty0 (5):\penalty0
  1926--1940, 1998.

\bibitem[P.~Simard and Denker(1993)]{sld-tr-93}
P.~Simard, Y.~LeCun and J.~Denker.
\newblock Efficient pattern recognition using a new transformation distance.
\newblock In S.~Hanson, J.~Cowan, and L.~Giles, editors, {\em Advances in
  Neural Information Processing Systems, volume 5}. Morgan Kaufmann, 1993.

\bibitem[Vapnik(1998)]{v-book-98}
V.~Vapnik.
\newblock {\em Statistical learning theory}.
\newblock J. Wiley \& Sons, New York, 1998.


\end{thebibliography}