\begin{thebibliography}{38}
\expandafter\ifx\csname natexlab\endcsname\relax\def\natexlab#1{#1}\fi
\expandafter\ifx\csname url\endcsname\relax
  \def\url#1{{\tt #1}}\fi

\bibitem[Alon et~al.(1997)Alon, Ben-David, Cesa-Bianchi, and Haussler]{ABCH97}
N.~Alon, S.~Ben-David, N.~Cesa-Bianchi, and D.~Haussler.
\newblock Scale-sensitive dimensions, uniform convergence, and learnability.
\newblock {\em Journal of the ACM}, 44\penalty0 (4):\penalty0 615--631, 1997.

\bibitem[Anthony and Bartlett(1999)]{AnBar99}
Martin Anthony and Peter~L. Bartlett.
\newblock {\em Neural Network Learning: Theoretical Foundations}.
\newblock Cambridge University Press, 1999.

\bibitem[Barron(1993)]{Bar93}
A.R. Barron.
\newblock Universal approximation bounds for superpositions of a sigmoidal
  function.
\newblock {\em IEEE Transactions on Information Theory}, 39\penalty0
  (3):\penalty0 930--945, 1993.

\bibitem[Bartlett and Shawe-Taylor(1999)]{BarSha99}
Peter Bartlett and John Shawe-Taylor.
\newblock Generalization performance of support vector machines and other
  pattern classifiers.
\newblock In Bernhard Sch\"{o}lkopf, Christopher J.~C. Burges, and Alexander~J.
  Smola, editors, {\em Advances in Kernel Methods : Support Vector Learning},
  pages 43--54. The MIT press, 1999.

\bibitem[Bartlett(1998)]{Bar98}
P.L. Bartlett.
\newblock The sample complexity of pattern classification with neural networks:
  the size of the weights is more important than the size of the network.
\newblock {\em IEEE Transactions on Information Theory}, 44\penalty0
  (2):\penalty0 525--536, 1998.

\bibitem[Cristianini and Shawe-Taylor(2000)]{CriST00}
Nello Cristianini and John Shawe-Taylor.
\newblock {\em An Introduction to Support Vector Machines and other
  Kernel-based Learning Methods}.
\newblock Cambridge University Press, 2000.

\bibitem[Devroye et~al.(1996)Devroye, Gy{\"o}rfi, and Lugosi]{DGL96}
L.~Devroye, L.~Gy{\"o}rfi, and G.~Lugosi.
\newblock {\em A probabilistic theory of pattern recognition}.
\newblock Springer-Verlag, New York, 1996.
\newblock ISBN 0-387-94618-7.

\bibitem[Dudley(1984)]{Dud84}
R.M. Dudley.
\newblock {\em A course on empirical processes}, volume 1097 of {\em Lecture
  Notes in Mathematics}.
\newblock 1984.

\bibitem[Freund and Schapire(1997)]{FS97}
Y.~Freund and R.E. Schapire.
\newblock A decision-theoretic generalization of on-line learning and an
  application to boosting.
\newblock {\em J. Comput. Syst. Sci.}, 55\penalty0 (1):\penalty0 119--139,
  1997.

\bibitem[Gentile and Warmuth(1998)]{GW98}
C.~Gentile and M.~K. Warmuth.
\newblock Linear hinge loss and average margin.
\newblock In {\em Proc. NIPS'98}, 1998.

\bibitem[Grove et~al.(2001)Grove, Littlestone, and Schuurmans]{GLS01}
A.J. Grove, N.~Littlestone, and D.~Schuurmans.
\newblock General convergence results for linear discriminant updates.
\newblock {\em Machine Learning}, 43:\penalty0 173--210, 2001.

\bibitem[Guo et~al.(1999)Guo, Bartlett, Shawe-Taylor, and Williamson]{GBSW99}
Ying Guo, Peter~L. Bartlett, John Shawe-Taylor, and Robert~C. Williamson.
\newblock Covering numbers for support vector machines.
\newblock In {\em COLT'99}, pages 267--277, 1999.

\bibitem[Gurvits(1997)]{Gurvits97}
L.~Gurvits.
\newblock A note on a scale-sensitive dimension of linear bounded functionals
  in banach spaces.
\newblock In {\em Proceedings of Algorithmic Learning Theory}, pages 352--363,
  1997.

\bibitem[Haussler(1989)]{Hau89}
D.~Haussler.
\newblock Generalizing the {PAC} model: sample size bounds from metric
  dimension-based uniform convergence results.
\newblock In {\em Proc. 30th IEEE Symposium on Foundations of Computer
  Science}, pages 40--45, 1989.

\bibitem[Hoeffding(1963)]{Hoe63}
W.~Hoeffding.
\newblock Probability inequalities for sums of bounded random variables.
\newblock {\em Journal of the American Statistical Association}, 58\penalty0
  (301):\penalty0 13--30, March 1963.

\bibitem[Jaakkola et~al.(2000)Jaakkola, Meila, and Jebara]{JaaMeiJeb00}
Tommi Jaakkola, Marina Meila, and Tony Jebara.
\newblock Maximum entropy discrimination.
\newblock In S.A. Solla, T.K. Leen, and K.-R. M{\"u}ller, editors, {\em
  Advances in Neural Information Processing Systems 12}, pages 470--476. MIT
  Press, 2000.

\bibitem[Jones(1992)]{Jon92}
Lee~K. Jones.
\newblock A simple lemma on greedy approximation in {H}ilbert space and
  convergence rates for projection pursuit regression and neural network
  training.
\newblock {\em Ann. Statist.}, 20\penalty0 (1):\penalty0 608--613, 1992.
\newblock ISSN 0090-5364.

\bibitem[Kivinen and Warmuth(1997)]{KW97}
J.~Kivinen and M.K. Warmuth.
\newblock Additive versus exponentiated gradient updates for linear prediction.
\newblock {\em Journal of Information and Computation}, 132:\penalty0 1--64,
  1997.

\bibitem[Kolmogorov(1956)]{Kol56}
A.N. Kolmogorov.
\newblock Asymptotic characteristics of some completely bounded metric spaces.
\newblock {\em Dokl. Akad. Nauk. SSSR}, 108:\penalty0 585--589, 1956.

\bibitem[Kolmogorov and Tihomirov(1961)]{KT61}
A.N. Kolmogorov and V.M. Tihomirov.
\newblock $\epsilon$-entropy and $\epsilon$-capacity of sets in functional
  spaces.
\newblock {\em Amer. Math. Soc. Transl.}, 17\penalty0 (2):\penalty0 277--364,
  1961.

\bibitem[Langford and Seeger(2001)]{LaSe01}
J.~Langford and M.~Seeger.
\newblock Bounds for averaging classifiers.
\newblock Technical Report CMU-CS-01-102, Carnegie Mellon University, 2001.

\bibitem[Ledoux and Talagrand(1991)]{LeTa91}
Michel Ledoux and Michel Talagrand.
\newblock {\em Probability in {B}anach spaces}.
\newblock Springer-Verlag, Berlin, 1991.
\newblock ISBN 3-540-52013-9.
\newblock Isoperimetry and processes.

\bibitem[Lee et~al.(1996)Lee, Bartlett, and Williamson]{LBW96}
Wee~Sun Lee, P.L. Bartlett, and R.C. Williamson.
\newblock Efficient agnostic learning of neural networks with bounded fan-in.
\newblock {\em IEEE Transactions on Information Theory}, 42\penalty0
  (6):\penalty0 2118--2132, 1996.

\bibitem[Littlestone(1988)]{Lit88}
N.~Littlestone.
\newblock Learning quickly when irrelevant attributes abound: a new
  linear-threshold algorithm.
\newblock {\em Machine Learning}, 2:\penalty0 285--318, 1988.

\bibitem[McAllester(1999)]{Mca99}
David McAllester.
\newblock {PAC}-{B}ayesian model averaging.
\newblock In {\em COLT'99}, pages 164--170, 1999.

\bibitem[Mendelson(2001)]{Mend01-geom}
Shahar Mendelson.
\newblock Geometric methods in the analysis of {G}livenko-{C}antelli classes.
\newblock In {\em COLT 01}, pages 256--272, 2001.

\bibitem[Pollard(1984)]{Pol84}
D.~Pollard.
\newblock {\em Convergence of stochastic processes}.
\newblock Springer-Verlag, New York, 1984.
\newblock ISBN 0-387-90990-7.

\bibitem[Pontriagin and Schnirelmann(1932)]{PonSch32}
L.S. Pontriagin and L.G. Schnirelmann.
\newblock Sur une propri\'{e}t\'{e} m\'{e}trique de la dimension.
\newblock {\em Annals of Mathematics}, 33:\penalty0 156--162, 1932.

\bibitem[Sauer(1972)]{Sau72}
N.~Sauer.
\newblock On the density of families of sets.
\newblock {\em Journal of Combinatorial Theory (Series A)}, 13:\penalty0
  145--147, 1972.

\bibitem[Schapire et~al.(1998)Schapire, Freund, Bartlett, and Lee]{SFBL98}
Robert~E. Schapire, Yoav Freund, Peter Bartlett, and Wee~Sun Lee.
\newblock Boosting the margin: a new explanation for the effectiveness of
  voting methods.
\newblock {\em Ann. Statist.}, 26\penalty0 (5):\penalty0 1651--1686, 1998.
\newblock ISSN 0090-5364.

\bibitem[Shawe-Taylor et~al.(1998)Shawe-Taylor, Bartlett, Williamson, and
  Anthony]{TBWA96}
J.~Shawe-Taylor, P.L. Bartlett, R.C. Williamson, and M.~Anthony.
\newblock Structural risk minimization over data-dependent hierarchies.
\newblock {\em IEEE Trans. Inf. Theory}, 44\penalty0 (5):\penalty0 1926--1940,
  1998.

\bibitem[Todd(1997)]{Todd97}
Michael~J. Todd.
\newblock Potential-reduction methods in mathematical programming.
\newblock {\em Math. Programming}, 76\penalty0 (1, Ser. B):\penalty0 3--45,
  1997.
\newblock ISSN 0025-5610.
\newblock Interior point methods in theory and practice (Iowa City, IA, 1994).

\bibitem[Vapnik(1998)]{Vap98}
V.N. Vapnik.
\newblock {\em Statistical learning theory}.
\newblock John Wiley \& Sons, New York, 1998.

\bibitem[Vapnik and Chervonenkis(1971)]{VC71}
V.N. Vapnik and A.J. Chervonenkis.
\newblock On the uniform convergence of relative frequencies of events to their
  probabilities.
\newblock {\em Theory of Probability and Applications}, 16:\penalty0 264--280,
  1971.

\bibitem[Williamson et~al.(1999)Williamson, Smola, and Sch\"{o}lkopf]{WSS99}
Robert~C. Williamson, Alexander~J. Smola, and Bernhard Sch\"{o}lkopf.
\newblock Entropy numbers, operators and support vector kernels.
\newblock In Bernhard Sch\"{o}lkopf, Christopher J.~C. Burges, and Alexander~J.
  Smola, editors, {\em Advances in Kernel Methods : Support Vector Learning},
  chapter~9. The MIT press, 1999.

\bibitem[Williamson et~al.(2000)Williamson, Smola, and Sch\"{o}lkopf]{WSS00}
Robert~C. Williamson, Alexander~J. Smola, and Bernhard Sch\"{o}lkopf.
\newblock Entropy numbers of linear function classes.
\newblock In {\em COLT'00}, pages 309--319, 2000.

\bibitem[Zhang(1999)]{Zhang99-colt}
Tong Zhang.
\newblock Theoretical analysis of a class of randomized regularization methods.
\newblock In {\em COLT 99}, pages 156--163, 1999.

\bibitem[Zhang(2002)]{Zhang00-dualth}
Tong Zhang.
\newblock On the dual formulation of regularized linear systems.
\newblock {\em Machine Learning}, 46:\penalty0 91--129, 2002.

\end{thebibliography}
