%% bib file on cross validation, loo, stability, bagging


%% One of the first papers on cross-validation (see also Lachenbruch, and Luntz and Brailovsky)
@article{Sto74,
  author  = {M. Stone},
  title   = {Cross-validatory choice and assessment of statistical predictions (with discussion)},
  journal = {Journal of the Royal Statistical Society B},
  volume  = {36},
  pages   = {111--147},
  year    = {1974},
}
%% Contains a discussion on leave one out error

%% Some results about the consistency of leave one out cross validation criteria
@article{Sto77,
  author  = {M. Stone},
  title   = {Asymptotics for and against cross-validation},
  journal = {Biometrika},
  volume  = {64},
  number  = {1},
  pages   = {29--35},
  year    = {1977},
}
%%

%% vc bounds on leave one out cross validation
@inproceedings{Hol96,
  author    = {S.B. Holden},
  title     = {{PAC-like} Upper Bounds for the Sample Complexity of Leave-One-Out Cross-Validation},
  booktitle = {Ninth Annual Conference on Computational Learning Theory},
  pages     = {41--50},
  year      = {1996},
  publisher = {ACM Press},
}
%%

%% Tech report about partial stability and AdaBoost
@techreport{Kut02,
  author      = {S. Kutin and P. Niyogi},
  title       = {Almost-everywhere algorithmic stability and generalization error},
  institution = {University of Chicago},
  number      = {TR-2002-03},
  year        = {2002},
}


%% Very good survey about pattern recognition
@article{Ku98,
  author  = {S. Kulkarni and G. Lugosi and S. Venkatesh},
  title   = {Learning Pattern Classification---A Survey},
  journal = {1948--1998 Special Commemorative Issue of IEEE Transactions on Information Theory},
  volume  = {44},
  pages   = {2178--2206},
  year    = {1998},
}
%

%% The no free lunch paper
@techreport{Wol95,
  author      = {D.H. Wolpert and W.G. Macready},
  title       = {No Free Lunch Theorems for Search},
  institution = {Santa Fe Institute},
  number      = {SFI-TR-95-02-010},
  address     = {Santa Fe, NM},
  year        = {1995},
}


%%Estimating the error rate of prediction (classification).
%%Comparing cross-validation, jackknife, bootstrap and
%%improvements of basic bs. Theoretical considerations and
%%simulations, showing that c-v has higher variability than bs,
%%and both are beaten by improved bs estimates. [The differences
%%may be smaller for continuous responses.]
 @article{Efr83,
  author  = {B. Efron},
  title   = {Estimating the error rate of a prediction rule: Improvement on cross-validation},
  journal = {Journal of the American Statistical Association},
  volume  = {78},
  pages   = {316--331},
  year    = {1983},
}
%%


%% Shows the inconsistency of model selection based on  leave one
%% out estimator (for linear models)
 @article{Sha93,
  author  = {J. Shao},
  title   = {Linear model selection by cross-validation},
  journal = {Journal of the American Statistical Association},
  volume  = {88},
  pages   = {486--494},
  year    = {1993},
}


%% One of the first papers, if not the first, on the jackknife

@article{Que49,
  author  = {M.H. Quenouille},
  title   = {Approximate tests of correlation in time-series},
  journal = {Journal of the Royal Statistical Society B},
  volume  = {11},
  pages   = {68--84},
  year    = {1949},
}
%%

%% Paper about the bias of Leave one out
@article{Lun69,
  author  = {A. Luntz and V. Brailovsky},
  title   = {On estimation of characters obtained in statistical procedure of recognition (in Russian)},
  journal = {Technicheskaya Kibernetica},
  volume  = {3},
  year    = {1969},
}
%%

%% Paper about the Leave one out: one of the first
@article{Lac67,
  author  = {P.A. Lachenbruch},
  title   = {An almost unbiased method for the probability of misclassification in discriminant analysis},
  journal = {Biometrics},
  volume  = {23},
  pages   = {639--645},
  year    = {1967},
}
%%

%% Paper which is supposed to talk about leave one out
@incollection{Cov69,
  author    = {T.M. Cover},
  title     = {Learning in pattern recognition},
  booktitle = {Methodologies of Pattern Recognition},
  editor    = {S. Watanabe},
  pages     = {111--132},
  publisher = {Academic Press},
  year      = {1969},
}
%%

%% Paper where the authors show that loo is biased for small m
@article{Riv99,
  author  = {I. Rivals and L. Personnaz},
  title   = {On Cross-Validation for Model Selection},
  journal = {Neural Computation},
  volume  = {11},
  number  = {4},
  pages   = {863--870},
  year    = {1999},
}
%%

%% Paper by Kohavi et al. about cross validation and observations about variance and perf.
%% rule of thumb -> use the 10 fold CV
@inproceedings{Koh95,
  author    = {R. Kohavi},
  title     = {A Study of Cross-Validation and Bootstrap for Accuracy Estimation and Model Selection},
  booktitle = {{IJCAI}},
  pages     = {1137--1145},
  year      = {1995},
}

%% Paper where Devroye and Wagner derive exponential bounds for the k-NN
%% the bound depends on the dimension of the input space
%% Note: the bounds are actually valid for any k-local rule
@article{Dev79,
  author  = {L.P. Devroye and T.J. Wagner},
  title   = {Distribution-Free Inequalities for the Deleted and Holdout Error Estimates},
  journal = {IEEE Transactions on Information Theory},
  volume  = {25},
  number  = {2},
  pages   = {202--207},
  year    = {1979},
}
%%

%% Paper where Devroye and Wagner derive bounds (exponential) for the k-NN but
%% for the empirical error (resubstitution error estimate)
%% valid for linear discrimination
%% for nearest neighbor
%% for histogram
%% Note for the k-NN: k needs to be large in order to get interesting bounds
%% this paper uses results from Vapnik and Chervonenkis
% NOTE(review): standard styles ignore the journal field on an inproceedings
% entry; confirm whether this reference is the conference or the journal version.
@inproceedings{Dev77,
  author    = {L.P. Devroye and T.J. Wagner},
  title     = {Distribution-free performance bounds with the resubstitution error estimates},
  booktitle = {Computer Society Conference on Pattern Recognition and Image Processing},
  journal   = {IEEE Transactions on Information Theory},
  pages     = {323--326},
  year      = {1977},
}
%%


%% Tech report where the authors show a bound on the variance of the leave one out error
%% for the k-NN.
@techreport{Dev76,
  author      = {L.P. Devroye and T.J. Wagner},
  title       = {Nonparametric discrimination and density estimation},
  institution = {Information Systems Research Laboratory},
  number      = {183},
  address     = {University of Texas, Austin},
  year        = {1976},
}
%%

%% Same work as the previous tech report, but with a different author list (Devroye is no longer an author).
%% Rogers and Wagner: the bound on the variance is made more precise compared
%% to the previous tech report
@article{Rog78,
  author  = {W.H. Rogers and T.J. Wagner},
  title   = {A finite sample distribution-free performance bound for local discrimination rules},
  journal = {Annals of Statistics},
  volume  = {6},
  pages   = {506--514},
  year    = {1978},
}
%%

%% Paper where Devroye and Wagner derive polynomial bounds for potential
%% rules
@article{Dev79b,
  author  = {L.P. Devroye and T.J. Wagner},
  title   = {Distribution-Free Performance Bounds for Potential Function Rules},
  journal = {IEEE Transactions on Information Theory},
  volume  = {25},
  number  = {5},
  pages   = {601--604},
  year    = {1979},
}
%%


%% Paper where the authors prove sanity check bounds for the leave one out
%% errors. They derived bounds on the leave one out error in terms of VC
%% dimension. The bounds are based on error stability
@article{Kea99,
  author  = {M. Kearns and D. Ron},
  title   = {Algorithmic stability and sanity-check bounds for leave-one-out cross-validation},
  journal = {Neural Computation},
  volume  = {11},
  number  = {6},
  pages   = {1427--1453},
  year    = {1999},
}
%%

%% Paper on stability. Definition of uniform stability.
%% general exponential and polynomial bounds for stable learning algorithm
@article{Bou02,
  author  = {O. Bousquet and A. Elisseeff},
  title   = {Stability and generalization},
  journal = {Journal of Machine Learning Research},
  volume  = {2},
  pages   = {499--526},
  year    = {2002},
}
%%


%% The paper on SVM
@inproceedings{Bos92,
  author    = {B. Boser and I. Guyon and V. Vapnik},
  title     = {A training algorithm for optimal margin classifiers},
  booktitle = {Fifth Annual Workshop on Computational Learning Theory},
  pages     = {144--152},
  year      = {1992},
  publisher = {ACM},
  address   = {Pittsburgh},
}
%%

%% The paper about Regularization Networks
@article{Pog90,
  author  = {T. Poggio and F. Girosi},
  title   = {Regularization algorithms for learning that are equivalent to multilayer networks},
  journal = {Science},
  volume  = {247},
  pages   = {978--982},
  year    = {1990},
}
%%


%% Paper by Tukey about the jackknife, which was first introduced in this paper
%% but takes ideas from the paper of Quenouille (1949)
@article{Tuk58,
  author  = {J.W. Tukey},
  title   = {Bias and confidence in not-quite large samples},
  journal = {Annals of Mathematical Statistics},
  volume  = {29},
  pages   = {614},
  year    = {1958},
}
%%

%% paper about the leave one out error that shows that previous attempts
%% to discredit leave one out error are not valid.
@article{Gou97,
  author  = {Cyril Goutte},
  title   = {Note on Free Lunches and Cross-validation},
  journal = {Neural Computation},
  volume  = {9},
  number  = {6},
  pages   = {1245--1249},
  year    = {1997},
}


%% Book by Ripley: p. 74 uses the jackknife to compute the bias of the training error
@book{Rip96,
  author    = {B.D. Ripley},
  title     = {Pattern Recognition and Neural Networks},
  publisher = {Cambridge University Press},
  year      = {1996},
}
%%

%% Paper where AIC is introduced
@inproceedings{Aka73,
  author    = {H. Akaike},
  title     = {Information theory and an extension of the maximum likelihood principle},
  booktitle = {2nd International Symposium on Information Theory},
  editor    = {B.N. Petrov and F. Csaki},
  pages     = {267--281},
  year      = {1973},
  publisher = {Akademia Kiado},
  address   = {Budapest},
}
%%

%% Paper that discusses Mallows' C_P technique
@article{Mal73,
  author  = {C.L. Mallows},
  title   = {Some comments on {$C_P$}},
  journal = {Technometrics},
  volume  = {15},
  number  = {4},
  pages   = {661--675},
  year    = {1973},
}
%%

%% Paper on model selection with a bound on the leave one out.
@inproceedings{Cha00,
  author    = {O. Chapelle and V. Vapnik},
  title     = {Model selection for support vector machines},
  booktitle = {Advances in Neural Information Processing Systems},
  editor    = {S.A. Solla and T.K. Leen and K.-R. M{\"u}ller},
  year      = {2000},
  publisher = {MIT Press},
}
%%

%% MacKay's thesis on Bayesian model selection
@phdthesis{Mck92,
  author = {D. MacKay},
  title  = {{Bayesian} Methods for Adaptive Models},
  school = {California Institute of Technology},
  year   = {1992},
}
%%

%% Presents the metric-based model selection strategy
@inproceedings{Sch97,
  author    = {D. Schuurmans},
  title     = {A New Metric-Based Approach to Model Selection},
  booktitle = {9th Innovative Applications of Artificial Intelligence Conference},
  pages     = {552--558},
  year      = {1997},
}
%%


%% Paper on R2W2
@article{Cha02,
  author    = {O. Chapelle and V. Vapnik and O. Bousquet and S. Mukherjee},
  title     = {Choosing Multiple Parameters for Support Vector Machines},
  journal   = {Machine Learning},
  volume    = {46},
  number    = {1-3},
  pages     = {131--159},
  year      = {2002},
  publisher = {Kluwer Academic Publishers, Boston},
}
%%

%% First Paper on MDL
@article{Ris78,
  author  = {J. Rissanen},
  title   = {Modeling by shortest data description},
  journal = {Automatica},
  volume  = {14},
  pages   = {465--471},
  year    = {1978},
}
%%


%%%%%%%%%%%%%%%%%%%%%%%
%%%% BIB FROM THE PAPER
%%%%%%%%%%%%%%%%%%%%%%%
%%%% bib from the original paper


@inproceedings{All00,
  author    = {E. L. Allwein and R. E. Schapire and Y. Singer},
  title     = {Reducing Multiclass to Binary: {A} Unifying Approach for Margin Classifiers},
  booktitle = {Proc. 17th International Conf. on Machine Learning},
  pages     = {9--16},
  year      = {2000},
  publisher = {Morgan Kaufmann, San Francisco, CA},
}

@article{Bar98,
  author  = {P.L. Bartlett},
  title   = {The sample complexity of pattern classification with neural networks: the size of the weights is more important than the size of the network},
  journal = {IEEE Transactions on Information Theory},
  volume  = {44},
  pages   = {525--536},
  year    = {1998},
}

@article{Bau99,
  author  = {E. Bauer and R. Kohavi},
  title   = {An Empirical Comparison of Voting Classification Algorithms: Bagging, Boosting, and Variants},
  journal = {Machine Learning},
  volume  = {36},
  number  = {1-2},
  pages   = {105--139},
  year    = {1999},
}

@article{Bou00,
  author  = {S. Boucheron and G. Lugosi and P. Massart},
  title   = {A sharp concentration inequality with applications},
  journal = {Random Structures and Algorithms},
  volume  = {16},
  number  = {3},
  pages   = {277--292},
  year    = {2000},
}

@article{Bre96,
  author  = {L. Breiman},
  title   = {Bagging Predictors},
  journal = {Machine Learning},
  volume  = {24},
  number  = {2},
  pages   = {123--140},
  year    = {1996},
}

@article{Bre96b,
  author  = {L. Breiman},
  title   = {Heuristics of instability and stabilization in model selection},
  journal = {Annals of Statistics},
  volume  = {24},
  number  = {6},
  pages   = {2350--2383},
  year    = {1996},
}


@article{Bre98,
  author  = {L. Breiman},
  title   = {Arcing classifiers},
  journal = {Annals of Statistics},
  volume  = {26},
  number  = {3},
  pages   = {801--849},
  year    = {1998},
}


@techreport{Bre00,
  author      = {L. Breiman},
  title       = {Some Infinite Theory for Predictor Ensemble},
  institution = {University of California, Berkeley},
  number      = {579},
  year        = {2000},
}

@misc{Bhu00,
  author       = {P. B{\"u}hlmann and B. Yu},
  title        = {Analyzing bagging},
  howpublished = {to appear in Annals of Statistics, available at www.stat.Berkeley.EDU/users/binyu/publications.html},
  year         = {2000},
}


@techreport{Cap90,
  author      = {B. Caprile and F. Girosi},
  title       = {A nondeterministic minimization algorithm},
  institution = {MIT AI Lab memo},
  number      = {1254},
  year        = {1990},
}

@inproceedings{Cra00,
  author    = {K. Crammer and Y. Singer},
  title     = {On the Learnability and Design of Output Codes for Multiclass Problems},
  booktitle = {Computational Learning Theory},
  pages     = {35--46},
  year      = {2000},
}


@book{Nel00,
  author    = "Nello Cristianini and John Shawe-Taylor",
  title     = "Introduction to Support Vector Machines",
  publisher = "Cambridge University Press",
  year      = "2000",
}

@article{Cuc02,
  author  = {F. Cucker and S. Smale},
  title   = {On the mathematical foundations of learning},
  journal = {Bulletin (New Series) of the American Mathematical Society},
  volume  = {39},
  number  = {1},
  pages   = {1--49},
  year    = {2002},
}


@incollection{Dev91,
  author    = "L. Devroye",
  title     = "Exponential inequalities in nonparametric estimation",
  booktitle = "Nonparametric Functional Estimation and Related Topics",
  editor    = "G. Roussas",
  series    = "NATO ASI Series",
  pages     = "31--44",
  publisher = "Kluwer Academic Publishers",
  address   = "Dordrecht",
  year      = 1991,
}


@article{Die_Bak95,
  author  = {T.G. Dietterich and G. Bakiri},
  title   = {Solving multiclass learning problems via error-correcting output codes},
  journal = {Journal of Artificial Intelligence Research},
  volume  = {2},
  pages   = {263--286},
  year    = {1995},
}

@article{evgeniou00regularization,
  author  = {T. Evgeniou and M. Pontil and T. Poggio},
  title   = {Regularization Networks and Support Vector Machines},
  journal = {Advances in Computational Mathematics},
  volume  = {13},
  number  = {1},
  pages   = {1--50},
  year    = {2000},
  url     = {citeseer.nj.nec.com/evgeniou00regularization.html},
}

@misc{freund-discussion,
  author = {Yoav Freund and Robert E. Schapire},
  title  = {Discussion of the paper {``Arcing Classifiers''} by {Leo Breiman}},
  url    = {citeseer.nj.nec.com/freund97discussion.html},
}

@article{Fre_Sch96,
  author  = {Y. Freund and R.E. Schapire},
  title   = {A decision-theoretic generalization of on-line learning and an application to boosting},
  journal = {Journal of Computer and System Sciences},
  volume  = {55},
  number  = {1},
  pages   = {119--139},
  year    = {1997},
}

@techreport{Fri98,
  author      = {J.~Friedman and T.~Hastie and R.~Tibshirani},
  title       = {Additive logistic regression: a statistical view of boosting},
  institution = {Department of Statistics},
  address     = {Stanford University},
  year        = {1998},
}


@misc{Fri00,
  author = {J. Friedman and P. Hall},
  title  = {On bagging and nonlinear estimation},
  text   = {J.H. Friedman and P. Hall(2000). On bagging and nonlinear estimation.
    Preprint.},
  year   = {2000},
  url    = {citeseer.nj.nec.com/friedman99bagging.html},
}


@inproceedings{Jaa98,
  author    = {T. Jaakkola and D. Haussler},
  title     = {Probabilistic kernel regression models},
  booktitle = {Neural Information Processing Systems},
  year      = {1998},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
}

@inproceedings{Koh96,
  author    = {R. Kohavi and D.H. Wolpert},
  title     = {Bias Plus Variance Decomposition for Zero-One Loss Functions},
  booktitle = {Machine Learning: Proceedings of the Thirteenth International Conference},
  editor    = {Lorenza Saitta},
  pages     = {275--283},
  year      = {1996},
  publisher = {Morgan Kaufmann},
}

@article{Lug98,
  author  = {G.~Lugosi and M.~Pawlak},
  title   = {On the posterior-probability estimate of the error of nonparametric classification rules},
  journal = {IEEE Transactions on Information Theory},
  volume  = {40},
  number  = {2},
  pages   = {475--481},
  year    = {1994},
}

@misc{Moo91,
  author       = {A.W. Moore},
  title        = {An introductory tutorial on kd-trees},
  howpublished = {Carnegie Mellon University},
  year         = {1991},
  url          = {citeseer.nj.nec.com/140157.html},
}

@incollection{McD89,
  author    = "C. McDiarmid",
  title     = "On the method of bounded differences",
  booktitle = "Surveys in Combinatorics",
  pages     = "148--188",
  publisher = "Cambridge University Press",
  address   = "Cambridge",
  year      = "1989",
}


@book{Pre90,
  author    = {W.H. Press and B.P. Flannery and S.A. Teukolsky and W.T. Vetterling},
  title     = {Numerical Recipes in {C}},
  publisher = {Cambridge University Press},
  address   = {Cambridge, U.K.},
  year      = {1990},
}


@inproceedings{Qui96,
  author    = {J.R. Quinlan},
  title     = {Bagging, boosting and {C4.5}},
  booktitle = {Fourteenth National Conference on Artificial Intelligence},
  pages     = {725--730},
  year      = {1996},
}


@article{Sch98,
  author  = {R.E. Schapire and Y. Freund and P. Bartlett and W.S. Lee},
  title   = {Boosting the Margin: A New Explanation for the Effectiveness of Voting Methods},
  journal = {The Annals of Statistics},
  volume  = {26},
  number  = {5},
  pages   = {1651--1686},
  year    = {1998},
}

@book{Sch98b,
  author    = {B.~Sch{\"o}lkopf and C.~Burges and A.~Smola},
  title     = {Advances in kernel methods -- support vector learning},
  publisher = {MIT Press},
  year      = {1998},
}

@article{Vap71,
  author  = {V.N. Vapnik and A.Y. Chervonenkis},
  title   = {On the uniform convergence of relative frequencies of events to their probabilities},
  journal = {Theory Probab. Appl.},
  volume  = {16},
  pages   = {264--280},
  year    = {1971},
}

@book{Vap98,
  author    = {V. Vapnik},
  title     = {Statistical Learning Theory},
  publisher = {John Wiley \& Sons},
  address   = {N.Y.},
  year      = {1998},
}

@book{Wah90,
  author    = {G.~Wahba},
  title     = {Spline Models for Observational Data},
  series    = {CBMS-NSF Regional Conference Series in Applied Mathematics},
  volume    = {59},
  publisher = {SIAM},
  address   = {Philadelphia},
  year      = {1990},
}

@article{AIJ97,
  editor    = {R.~Greiner},
  title     = {Special Issue on Relevance},
  journal   = {Artificial Intelligence},
  volume    = {97},
  number    = {1-2},
  month     = dec,
  year      = {1997},
  publisher = {Elsevier},
}

@article{Blu97,
  author    = {A.~Blum and P.~Langley},
  title     = {Selection of relevant features and examples in machine learning},
  journal   = {Artificial Intelligence},
  volume    = {97},
  number    = {1-2},
  pages     = {245--271},
  month     = dec,
  year      = {1997},
  publisher = {Elsevier},
}

    %Special issue of Artificial Intelligence on "Relevance" -- 97(1-2), Dec 1997
%http://www.elsevier.nl/locate/inca/505601
%Artificial Intelligence, Volume 97, Issue 1-2, 18 December 1997 Relevance

@article{dhillon,
  author    = {I. Dhillon and S. Mallela and R. Kumar},
  title     = {A divisive information-theoretic feature clustering algorithm for text classification},
  journal   = {Journal of Machine Learning Research},
  volume    = {3},
  pages     = {1265--1287},
  year      = {2003},
  publisher = {MIT Press},
  note      = {(this issue)},
}

@article{weston,
  author    = {J. Weston and A. Elisseeff and B. Schoelkopf and M. Tipping},
  title     = {Use of the zero norm with linear models and kernel methods},
  journal   = {Journal of Machine Learning Research},
  volume    = {3},
  pages     = {1439--1461},
  year      = {2003},
  publisher = {MIT Press},
  note      = {(this issue)},
}

@article{bennett,
  author    = {J. Bi and K. Bennett and M. Embrechts and C. Breneman and M. Song},
  title     = {Dimensionality reduction via sparse support vector machines},
  journal   = {Journal of Machine Learning Research},
  volume    = {3},
  pages     = {1229--1243},
  year      = {2003},
  publisher = {MIT Press},
  note      = {(this issue)},
}

@article{rivals,
  author    = {I. Rivals and L. Personnaz},
  title     = {{MLP}s (mono-layer polynomials and multi-layer perceptrons) for non-linear modeling},
  journal   = {Journal of Machine Learning Research},
  volume    = {3},
  pages     = {1383--1398},
  year      = {2003},
  publisher = {MIT Press},
  note      = {(this issue)},
}

@article{bekkerman,
  author    = {R. Bekkerman and R. El-Yaniv and N. Tishby and Y. Winter},
  title     = {Distributional word clusters vs. words for text categorization},
  journal   = {Journal of Machine Learning Research},
  volume    = {3},
  pages     = {1183--1208},
  year      = {2003},
  publisher = {MIT Press},
  note      = {(this issue)},
}

@article{reunanen,
  author    = {J. Reunanen},
  title     = {Overfitting in making comparisons between variable selection methods},
  journal   = {Journal of Machine Learning Research},
  volume    = {3},
  pages     = {1371--1382},
  year      = {2003},
  publisher = {MIT Press},
  note      = {(this issue)},
}

@article{stoppiglia,
  author    = {H. Stoppiglia and G. Dreyfus and R. Dubois and Y. Oussar},
  title     = {Ranking a random feature for variable and feature selection},
  journal   = {Journal of Machine Learning Research},
  volume    = {3},
  pages     = {1399--1414},
  year      = {2003},
  publisher = {MIT Press},
  note      = {(this issue)},
}

@article{torkkola,
  author    = {K. Torkkola},
  title     = {Feature extraction by non-parametric mutual information maximization},
  journal   = {Journal of Machine Learning Research},
  volume    = {3},
  pages     = {1415--1438},
  year      = {2003},
  publisher = {MIT Press},
  note      = {(this issue)},
}

@article{alain,
  author    = {A. Rakotomamonjy},
  title     = {Variable selection using {SVM}-based criteria},
  journal   = {Journal of Machine Learning Research},
  volume    = {3},
  pages     = {1357--1370},
  year      = {2003},
  publisher = {MIT Press},
  note      = {(this issue)},
}

@article{bengio,
  author    = {Y. Bengio and N. Chapados},
  title     = {Extensions to metric-based model selection},
  journal   = {Journal of Machine Learning Research},
  volume    = {3},
  pages     = {1209--1227},
  year      = {2003},
  publisher = {MIT Press},
  note      = {(this issue)},
}

@article{perkins,
  author    = {S. Perkins and K. Lacker and J. Theiler},
  title     = {Grafting: Fast incremental feature selection by gradient descent in function space},
  journal   = {Journal of Machine Learning Research},
  volume    = {3},
  pages     = {1333--1356},
  year      = {2003},
  publisher = {MIT Press},
  note      = {(this issue)},
}

@article{forman,
  author    = {G. Forman},
  title     = {An extensive empirical study of feature selection metrics for text classification},
  journal   = {Journal of Machine Learning Research},
  volume    = {3},
  pages     = {1289--1306},
  year      = {2003},
  publisher = {MIT Press},
  note      = {(this issue)},
}

@article{caruana,
  author    = {R. Caruana and V. de Sa},
  title     = {Benefitting from the variables that variable selection discards},
  journal   = {Journal of Machine Learning Research},
  volume    = {3},
  pages     = {1245--1264},
  year      = {2003},
  publisher = {MIT Press},
  note      = {(this issue)},
}

@article{globerson,
  author    = {A. Globerson and N. Tishby},
  title     = {Sufficient dimensionality reduction},
  journal   = {Journal of Machine Learning Research},
  volume    = {3},
  pages     = {1307--1331},
  year      = {2003},
  publisher = {MIT Press},
  note      = {(this issue)},
}

@book{DHS01,
  author    = {R. O. Duda and P. E. Hart and D. G. Stork},
  title     = {Pattern Classification},
  edition   = {Second},
  publisher = {John Wiley \& Sons},
  address   = {USA},
  year      = {2001},
}

@article{almuallim94learning,
  author  = {Hussein Almuallim and Thomas G. Dietterich},
  title   = {Learning Boolean Concepts in the Presence of Many Irrelevant Features},
  journal = {Artificial Intelligence},
  volume  = {69},
  number  = {1-2},
  pages   = {279--305},
  year    = {1994},
}
% url = "citeseer.nj.nec.com/almuallim94learning.html"

@inproceedings{ng98feature,
  author    = {A. Y. Ng},
  title     = {On feature selection: learning with exponentially many irrelevant features as training examples},
  booktitle = {15th International Conference on Machine Learning},
  pages     = {404--412},
  year      = {1998},
  publisher = {Morgan Kaufmann, San Francisco, CA},
}
% url = "citeseer.nj.nec.com/ng98feature.html"

@inproceedings{langley94selection,
  author    = {P. Langley},
  title     = {Selection of relevant features in machine learning},
  booktitle = {AAAI Fall Symposium on Relevance},
  pages     = {140--144},
  year      = {1994},
}
% url = "citeseer.nj.nec.com/langley94selection.html"

@article{Tus01,
  author  = {V. G. Tusher and R. Tibshirani and G. Chu},
  title   = {Significance analysis of microarrays applied to the ionizing radiation response},
  journal = {PNAS},
  volume  = {98},
  pages   = {5116--5121},
  month   = apr,
  year    = {2001},
}
% url = {www-stat.stanford.edu/~tibs/SAM/pnassam.pdf}

@book{Has01,
  author    = {T.~Hastie and R.~Tibshirani and J.~Friedman},
  title     = {The Elements of Statistical Learning},
  series    = {Springer series in statistics},
  publisher = {Springer},
  address   = {New York},
  year      = {2001},
}

@article{Fur00,
  author  = {Furey, T. and Cristianini, N. and Duffy, N. and Bednarski, D. and Schummer, M. and Haussler, D.},
  title   = {Support Vector Machine Classification and Validation of Cancer Tissue Samples Using Microarray Expression Data},
  journal = {Bioinformatics},
  volume  = {16},
  pages   = {906--914},
  year    = {2000},
}

@book{Vap82,
  author    = "V. Vapnik",
  title     = "Estimation of dependencies based on empirical data",
  series    = "Springer series in statistics",
  publisher = "Springer",
  year      = "1982",
}

@article{Gol99,
  author  = {T. R. Golub and others},
  title   = {Molecular Classification of Cancer: Class Discovery and Class Prediction by Gene Expression Monitoring},
  journal = {Science},
  volume  = {286},
  pages   = {531--537},
  year    = {1999},
}
% URL = "www.genome.wi.mit.edu/MPR/data_set_ALL_AML.html"

@article{Koh97,
  author    = {R.~Kohavi and G.~John},
  title     = {Wrappers for Feature Selection},
  journal   = {Artificial Intelligence},
  volume    = {97},
  number    = {1-2},
  pages     = {273--324},
  month     = dec,
  year      = {1997},
  publisher = {Elsevier},
}


@article{Aml98,
  author  = {E. Amaldi and V. Kann},
  title   = {On the approximation of minimizing non zero variables or unsatisfied relations in linear systems},
  journal = {Theoretical Computer Science},
  volume  = {209},
  pages   = {237--260},
  year    = {1998},
}

@inproceedings{Ng01,
  author    = {A. Y. Ng and M. Jordan},
  title     = {Convergence rates of the Voting {G}ibbs classifier, with application to {B}ayesian feature selection},
  booktitle = {18th International Conference on Machine Learning},
  year      = {2001},
}

@book{Sch02,
  author    = "B. Schoelkopf and A. Smola",
  title     = "Learning with Kernels",
  publisher = "MIT Press",
  address   = "Cambridge MA",
  year      = "2002",
}

@inproceedings{Jeb00,
  author    = {T. Jebara and T. Jaakkola},
  title     = {Feature selection and dualities in maximum entropy discrimination},
  booktitle = {16th Annual Conference on Uncertainty in Artificial Intelligence},
  year      = {2000},
}

@techreport{Tib94,
  author      = {Tibshirani, R.},
  title       = {Regression selection and shrinkage via the lasso},
  institution = {Stanford University},
  address     = {Palo Alto, CA},
  month       = jun,
  year        = {1994},
}

@inproceedings{Wes00,
  author    = {J. Weston and S. Mukherjee and O. Chapelle and M. Pontil and T. Poggio and V. Vapnik},
  title     = {Feature Selection for {SVM}s},
  booktitle = {NIPS 13},
  year      = {2000},
}

@inproceedings{Kol96,
  author    = {D. Koller and M. Sahami},
  title     = {Toward optimal feature selection},
  booktitle = {13th International Conference on Machine Learning},
  pages     = {284--292},
  month     = jul,
  year      = {1996},
}

@inproceedings{Per93,
  author    = {F. Pereira and N. Tishby and L. Lee},
  title     = {Distributional Clustering of {E}nglish Words},
  booktitle = {Proc. Meeting of the Association for Computational Linguistics},
  pages     = {183--190},
  year      = {1993},
}
% url = "citeseer.nj.nec.com/pereira93distributional.html"

@inproceedings{Tis99,
  author    = {N. Tishby and F. C. Pereira and W. Bialek},
  title     = {The information bottleneck method},
  booktitle = {Proc. of the 37th Annual Allerton Conference on Communication, Control and Computing},
  pages     = {368--377},
  year      = {1999},
}
% url = "citeseer.nj.nec.com/tishby99information.html"

@book{Bre84,
  author    = "L. Breiman and J. H. Friedman and R. A. Olshen and C. J. Stone",
  title     = "Classification and Regression Trees",
  publisher = "Wadsworth and Brooks",
  year      = "1984",
}

@article{Guy02,
  author    = {I. Guyon and J. Weston and S. Barnhill and V. Vapnik},
  title     = {Gene Selection for Cancer Classification using Support Vector Machines},
  journal   = {Machine Learning},
  volume    = {46},
  number    = {1-3},
  pages     = {389--422},
  year      = {2002},
  publisher = {Kluwer Academic Publishers, Boston},
}
% url = "citeseer.nj.nec.com/guyon00gene.html"

@article{Die98,
  author  = {Dietterich, T. G.},
  title   = {Approximate Statistical Test For Comparing Supervised Classification Learning Algorithms},
  journal = {Neural Computation},
  volume  = {10},
  number  = {7},
  pages   = {1895--1924},
  year    = {1998},
}

@article{Nad01,
  author  = {C. Nadeau and Y. Bengio},
  title   = {Inference for the Generalization Error},
  journal = {Machine Learning},
  year    = {2001},
  note    = {To appear},
}

@techreport{Veh02,
  author      = {A. Vehtari and J. Lampinen},
  title       = {{Bayesian} input variable selection using posterior probabilities and expected utilities},
  institution = {Laboratory of Computational Engineering, Helsinki University of Technology},
  number      = {B31},
  year        = {2002},
}

@inproceedings{Kir92,
  author    = {K. Kira and L. Rendell},
  title     = {A practical approach to feature selection},
  booktitle = {International Conference on Machine Learning},
  editor    = {D. Sleeman and P. Edwards},
  pages     = {249--256},
  month     = jul,
  year      = {1992},
  publisher = {Morgan Kaufmann},
  address   = {Aberdeen},
}

% Duplicate entry for key Blu97 (A. Blum and P. Langley, Artificial Intelligence
% 97(1-2):245-271, 1997) removed here: the key is already defined earlier in
% this file, and a repeated key is an error in BibTeX.
% url = "citeseer.nj.nec.com/blum97selection.html"

@book{Per00,
  author    = "J. Pearl",
  title     = "Causality",
  publisher = "Cambridge University Press",
  year      = "2000",
}

@inproceedings{Lec90,
  author    = {Y. LeCun and J. Denker and S. Solla and R.~E. Howard and L.~D. Jackel},
  title     = {Optimal Brain Damage},
  booktitle = {Advances in Neural Information Processing Systems {II}},
  editor    = {D.~S. Touretzky},
  year      = {1990},
  publisher = {Morgan Kaufmann},
  address   = {San Mateo, CA},
}
% url = "citeseer.nj.nec.com/lecun90optimal.html"

@inproceedings{Xin01,
  author    = {E.P. Xing and R.M. Karp},
  title     = {{CLIFF}: Clustering of High-Dimensional Microarray Data via Iterative Feature Filtering Using Normalized Cuts},
  booktitle = {9th International Conference on Intelligence Systems for Molecular Biology},
  year      = {2001},
}

% NOTE(review): no publication year was recorded for this entry; add one once confirmed.
@inproceedings{Ben01,
  author    = {A. Ben-Hur and I. Guyon},
  title     = {Detecting Stable Clusters Using Principal Component Analysis},
  booktitle = {Methods In Molecular Biology Series},
  publisher = {Humana Press},
  note      = {In press},
}

@inproceedings{Yan97,
  author    = {Y. Yang and J. O. Pedersen},
  title     = {A comparative study on feature selection in text categorization},
  booktitle = {14th International Conference on Machine Learning},
  editor    = {Douglas H. Fisher},
  pages     = {412--420},
  year      = {1997},
  publisher = {Morgan Kaufmann Publishers, San Francisco, US},
  address   = {Nashville, US},
}
% url = "citeseer.nj.nec.com/yang97comparative.html"

@article{Mon00,
  author  = {G. Monari and G. Dreyfus},
  title   = {Withdrawing an example from the training set: an analytic estimation of its effect on a nonlinear parameterized model},
  journal = {Neurocomputing Letters},
  volume  = {35},
  pages   = {195--201},
  year    = {2000},
}

@inproceedings{Gra02,
  author    = {Y. Grandvalet and S. Canu},
  title     = {Adaptive Scaling for Feature Selection in {SVM}s},
  booktitle = {NIPS 15},
  year      = {2002},
}