\documentclass[twoside,11pt]{article}
\usepackage{jmlr2e}

\usepackage{makeidx}  % allows for indexgeneration
\usepackage{hhline}
\usepackage{amsmath}
\usepackage{latexsym,setspace}
\usepackage{psfig}
%
\newtheorem{alg_env}{Algorithm}
\newenvironment{algorithm}[1][\  ] %
{
\vspace{0.25in}
\begin{alg_env}{#1}
\rm
\begin{tabbing}
....\=...\=...\=...\=...\=  \+ \kill
} %
{\end{tabbing}
  \end{alg_env}
  \vspace{0.25in}
 }
%

\jmlrheading{2}{2002}{313-334}{5/01}{2/02}{Tong Zhang and Vijay S. Iyengar}
\ShortHeadings{Recommender Systems Using Linear Classifiers}{Zhang and Iyengar}
\firstpageno{313}

\begin{document}

\title{Recommender Systems Using Linear Classifiers}


\author{\name Tong Zhang  \email tzhang@watson.ibm.com \\
  \name Vijay S. Iyengar \email vsi@us.ibm.com \\
  \addr   IBM Research Division, T. J. Watson Research Center  \\
  P.O. Box 218, Yorktown Heights, NY 10598, U.S.A. \\
}

\editor{Leslie Pack Kaelbling}

\date{}

\maketitle

\begin{abstract}%
Recommender systems
use historical data on user preferences and other available
data on users (for example, demographics) and items (for example, taxonomy)
to predict items a new user might like.  Applications of these methods include
recommending items for purchase and personalizing the
browsing experience on a web-site.
Collaborative filtering methods have focused on using just the
history of user preferences to make the recommendations.
These methods have been categorized as \textit{memory-based} if they operate
over the entire data to make predictions and as \textit{model-based} if
they use the data to build a model which is then used for predictions.
In this paper, we propose the use of linear classifiers in a
model-based recommender system.
We compare our method with another model-based method using
decision trees and with
memory-based methods using data from various domains.
Our experimental results indicate that these linear models
are well suited for this application.
They outperform a commonly proposed
memory-based method in accuracy and also
have a better tradeoff between off-line and on-line computational requirements.
\end{abstract}

\begin{keywords}
  Recommender Systems, Collaborative Filtering, Decision Trees, Linear Models,
  Unbalanced Data
\end{keywords}

\section{Introduction}

Recommender systems use historical data on user preferences and purchases
and other available data on users and items
to recommend items that might be interesting to a new user.
One of the earliest techniques developed for recommendations
was based on \textit{nearest-neighbor collaborative filtering} algorithms
\citep{resnick94,shardanand95}
that used just the history of user preferences as input.
Sometimes in the literature
the term \textit{collaborative filtering} is used
to refer to just these methods.
However, we will follow the taxonomy introduced by
\citet{breese98} in which collaborative filtering (\textit{CF})
refers to a broader set of methods that use prior
preferences to predict new ones.
In this taxonomy,
nearest-neighbor collaborative filtering algorithms
are categorized as being memory-based \textit{CF}.
Nearest-neighbor methods use some notion of similarity between
the user for whom predictions are being generated and
users in the database.
Variations on this notion of similarity and other aspects
of memory-based algorithms are discussed by
\citet{breese98}.
Scalability is an issue with nearest-neighbor methods.
The use of dimension reduction techniques like latent
semantic indexing has been proposed to address this issue
\citep{sarwar00}.

In contrast, model-based \textit{CF} methods use
the historical data
to build models which are then used for predicting new
preferences.
A model-based approach using
Bayesian networks was found to be comparable to the
memory-based approach of \citet{breese98}.
More recently, models based on a newer graphical representation called
dependency networks \citep{hofmann97} have been applied
to this problem
\citep{heckerman00}.
For this task,
dependency network models seem
to have slightly poorer accuracy
but require significantly less computation
when compared to Bayesian network models \citep{heckerman00}.
Another model-based method uses
clustering to group users based on their past preferences.
The parameters for this clustering model can be
estimated by methods like Gibbs sampling and EM
\citep{ungar98a,ungar98b,breese98}.
The clustering model explored by \citet{breese98} was
outperformed by the model-based approach using Bayesian networks
and by the memory-based approach CR+ described by \citet{breese98}.

In this paper, we explore the use of various linear classifiers
in a model-based approach to the recommendation task.
Linear classifiers have been quite successful in
the text classification domain \citep{Joa98,YC94,ZhaOle01}.
Some of the characteristics
shared between the text and CF domains include the
high dimensionality and sparseness of the data in these domains.
The main computational cost of using linear classifiers is
in the model building phase, which is an off-line activity.
The application of the models is very straightforward
especially with sparse data.

Our empirical study will use two data sets that reflect
users' browsing behavior and one data set that captures
their purchases.
Because of its wider applicability,
we focus on data that is implicitly gathered, e.g., a boolean flag
for each web page representing whether or not it was browsed as in
the anonymous-msweb dataset \citep{uci98}.
This is in contrast with explicitly collected data such as
ratings explicitly gotten for movies \citep{eachmovie}.
Section~\ref{sec:experiments}  presents results achieved on these
datasets by various model-based approaches using linear classifiers.
For comparison we also include results achieved by
our implementation of the memory-based algorithm
CR+ described by \citet{breese98} and a model-based approach
using decision trees.
The linear models studied in this paper
are described in the next section.

This paper expands on some early experimental results reported
by \citet{iyengar01}.
We have derived the algorithms more rigorously and added the naive Bayes model
in addition to the regularized linear models considered by
\citet{iyengar01}.  We have also investigated the differences between
the recommendation domain and the text categorization domain. This leads to a
study on the impact of loss function on the accuracy of the
recommendations. The result of this study  gives us some insights on the
failure of some linear models in the recommendation domain even
though they perform very well in the text categorization domain.

\section{Model-Based Approaches}

The problem of predicting whether a user (or a customer) will accept a specific
recommendation can be modeled as a binary classification problem.
However, in a recommender system, we are also interested in the
likelihood that a customer will accept a recommendation.
This information can be used  to rank all of the potential choices according
to their likelihoods, so that we can select the top choices to
present to the customer.
It is thus necessary that the classifier we use returns a score (or a
confidence level), where a higher score corresponds to a higher
possibility that the customer will accept the recommendation.

Recommender systems also have characteristics that are similar to those of
text categorization. For example, the standard document representation
in text categorization is the ``bag of words'' vector space model. In this
model, a text document is represented by a vector of word occurrences in
the document. Each vector component represents a word feature,
and its value is the number of occurrences of the word in the document.
This representation leads to a very large feature space consisting of all possible
words in a language. However, since each document to be categorized (such as
an e-mail message) is usually relatively short, a vector that represents a
document is highly sparse. Therefore text categorization algorithms have to be
suitable to problems with large but sparse features.

This characteristic of text categorization is also shared by
recommender systems. Typically the set
of all possible available merchandises (corresponding to all possible words in text
categorization) in a recommender system is very large.
However, a customer (corresponding to a document in text categorization) only
buys a small number of items.
Items that have already been bought by the customer correspond
to words appearing in a document.
This correspondence leads to a sparse feature representation of the custom
profile that is very similar to the vector space model in text categorization.
Consequently, it is reasonable to start with the assumption that
algorithms that perform well in text categorization may also perform well
in the CF domain.

On the other hand, there are also differences between text categorization and
CF. There are typically many fewer categories in a text
categorization problem than in a recommender system.
A text categorization system usually determines whether
a document belongs to a certain class or not, while a recommender system
provides a ranked list of recommendations indicating the likelihoods of
buying certain items. As a result, the evaluation criteria for text categorization
and recommender systems are different. In addition, categories in text
categorization systems are more likely to be mutually exclusive (that is, a text
document usually focuses on a single or very few topics), while a customer
may be interested in a relatively large number of different items.
Also in a recommender system, the features (items bought) are inter-related to
the categories (what item to buy). This is not true in text categorization.
A related difference is that when we observe that a person has not bought
an item, it can either be the case that the person is not interested in
the item, or the case that although the person is interested in the item,
(s)he has not yet bought it. Clearly it is difficult to distinguish these
two situations --- in fact, the purpose  of a recommender system is simply to
distinguish the two situations. This ambiguity shows that the input to a
recommender system is noisy, which makes the CF problem more difficult.
On the other hand, this type of noise does not occur in text categorization.


All these differences between the CF problem and the text categorization problem
suggest that we may need to modify
algorithms used in text categorization. This also suggests that
not all algorithms that do well in text categorization will automatically
do well in the CF domain.
Based on the above discussion, in the following,
we shall motivate algorithms considered in this paper from text categorization.
However, we also discuss issues specifically related to the CF application
as well as necessary modifications of the algorithms.

\subsection{Base-line Systems}

As a baseline system, we have implemented a version of the nearest
neighbor style algorithm, CR+, described by \citet{breese98} and included it
in our study.
As suggested by \citet{breese98},
inverse user frequency, case amplification and default
voting heuristics are used in our implementation of CR+.
CR+ achieves very good performance in the recommender system application.
As a comparison, it is also known that nearest neighbor algorithms achieve
very good performance in text categorization \citep{Yang99}. However, the major
disadvantage of this method is the large computational and memory complexity.
Consequently, simplifying heuristics have to be used
to overcome this problem in practical applications.
Such simplifications were not addressed in our baseline implementation.

%VSI
In the recommender system application,
interpretability of the model used is a desirable characteristic
to be considered in addition to the
accuracy achieved and the computational requirements.
For example, in cross-sell applications, interpretability allows
marketing analysts to review and possibly modify the generated
models.
%VSI

The interpretability property can be satisfied by using a rule-based
system, such as rules obtained from a decision tree.
We shall thus include a decision-tree-based recommender system
in this empirical study as an example of using an interpretable
model.
In this decision-tree package, the splitting criterion during
tree growth is a modified version of entropy and the tree pruning is done
using a Bayesian model combination approach originated
from data compression \citep{WST,Zhang98DCC}.
A similar approach has been suggested by \citet{KM98}.
One useful aspect of our decision tree is that we take advantage of the
sparse data representation. The algorithm has low memory and
time complexity for sparse data. A standard decision-tree package such as
the C4.5 program \citep{Quinlan95} that does not take advantage of sparsity
in the data will not be able to handle our problems.

Although a decision-tree rule-based system is efficient and interpretable,
it does not provide the best performance in text categorization.
Similarly, experiments in this paper show that our decision tree does not
provide the best performance as a recommender system.
One remedy is to use the so-called ``boosting'' procedure,
which votes on a large number of decision trees. 
In fact, the best text categorization
result on the standard Reuters evaluation dataset is achieved using boosted
decision trees \citep{Weiss99}.
Unfortunately, this approach requires  a large number
of trees \citep[one hundred in][]{Weiss99}. The resulting system not
only loses the interpretability of a single decision tree, 
but is also computationally
too expensive to be practically interesting for this application.
We thus do not consider this method in this paper.

It is known that in text categorization, the same level of performance achieved by
boosted decision trees can be achieved by computationally
more efficient linear classification methods \citep{DPHS98,Joa98,ZhaOle01}.
It is thus natural to consider linear classification in the context of
CF.

\subsection{Linear Models}

We formally define a two-class categorization problem as one to determine a
label $y \in \{-1,1\}$ associated with a vector $x$ of input
variables.  A useful method for solving this problem is by
linear discriminant functions, which consist of linear combinations of
the input variables.  Various techniques have been proposed for
determining the weight values for linear discriminant classifiers from
a training set of labeled data $(x_1,y_1), \ldots, (x_n, y_n)$.
Specifically, we seek a weight vector $w$ and a threshold $\theta$
such that $w^T x < \theta $ if its label $y=-1$ and $w^T x \geq \theta$ if its
label $y=1$.
%VSI
A score of value $w^T x - \theta$ can be assigned to each data point
as a surrogate for the likelihood of $x$ to be in class.
%VSI

The problem just described may readily be converted into one in which
the threshold $\theta$ is taken to be zero.
One does this by converting a data point $x$ in the original
space into $\tilde{x}=[x,1]$ in the enlarged space.
Each hyperplane $w$ in the original space with threshold $\theta$
can then be converted into $[w, -\theta]$ that passes through the origin
in the enlarged space.
Instead of searching for both an
$d$-dimensional weight vector along with a threshold $\theta$, we can
search for an $(d+1)$-dimensional weight vector along with
an anticipated threshold of zero.
In the following, unless otherwise indicated, we assume that
the vectors of input variables have been suitably transformed so that we
may take $\theta = 0$.
We also assume that $x$ and $w$ are $d$-dimensional vectors.

\subsubsection{Regularized Discriminative Models}

Many algorithms have been proposed for linear classification.
We start our discussion with the least squares algorithm, which is based on
the following formulation to compute a linear separator $\hat{w}$:
\begin{equation}
  \hat{w} = \arg\min_{w} \frac{1}{n} \sum_{i=1}^n (w^T x_i - y_i)^2.
  \label{eq:llsf}
\end{equation}
The least squares method is extensively used in engineering and
statistics.
Although the method has mainly been associated with regression problems,
it can also be used in classification.
Examples include use
in text categorization
\citep{YC94} and uses in combination with neural networks \citep{Ripley96}.

The solution of (\ref{eq:llsf}) is given by
\[
\hat{w} = \left(\sum_{i=1}^n x_i x_i^T\right)^{-1} 
\left(\sum_{i=1}^n x_i y_i\right).
\]
One problem with the above formulation is that the matrix
$\sum_{i=1}^n x_i x_i^T$ may be singular or ill-conditioned.
This occurs, for example, when $n$ is less than the dimension of $x$.
Note that in this case, for any $\hat{w}$, there exist infinitely
many solutions $\tilde{w}$ of $\tilde{w}^T x_i = \hat{w}^T x_i$ for
$i=1,\ldots,n$.
This implies that (\ref{eq:llsf}) has infinitely many possible
solutions $\hat{w}$.

A remedy of this problem is to use a pseudo-inverse \citep{YC94}.
However, one problem of the pseudo-inverse approach is its computational
complexity. In order to handle large sparse systems, we need to use iterative
algorithms which do not rely on matrix factorization
techniques. Therefore in this paper, we use the standard ridge
regression method \citep{HoKe70} that adds a regularization term to (\ref{eq:llsf}):
\begin{equation}
  \hat{w} = \arg\min_{w}\left[\frac{1}{n} \sum_{i=1}^n (w^T x_i y_i -1)^2 +
  \lambda w^2\right],
  \label{eq:llsf_reg}
\end{equation}
where $\lambda$ is an appropriately chosen regularization parameter.
The solution is given by
\[
\hat{w} = \left(\sum_{i=1}^n x_i x_i^T+\lambda n I\right)^{-1} 
\left(\sum_{i=1}^n x_i y_i\right),
\]
where $I$ denotes the identity matrix. Note that
$\sum_{i=1}^n x_i x_i^T+\lambda n I$ will always be non-singular, which solves
the ill-condition problem.
The regularized least squares formulation (\ref{eq:llsf_reg}) can be solved
by Algorithm~\ref{alg:CLS} in Appendix~\ref{apx:primal}.
In order to avoid cluttering the main text,
we leave the algorithms and their justifications to the appendices.

Another popular linear classification method is the support vector machine,
which is a method originally proposed by \citet{CV95} that has
nice properties from the sample complexity theory.
Slightly different from our approach of forcing threshold $\theta=0$,
and then compensating by appending 1
to each data vector, the standard linear
support vector machine \citep{Vap98} explicitly includes
$\theta$ in a quadratic formulation as follows:
\begin{align}
  (\hat{w},\hat{\theta}) &=
  \arg\inf_{w,\theta} \left[\frac{1}{n}\sum_{i=1}^n \xi_i + \lambda w^2\right],
  \label{eq:svm_quad} \\
  \mathrm{s.t.} \quad & y_i (w^T x_i - \theta) \geq 1 - \xi_i, \nonumber
  \quad \xi_i \geq 0, \qquad i=1,\ldots,n.
\end{align}

By eliminating $\xi_i$, the above formula is equivalent to the following
formulation:
\begin{equation}
  (\hat{w},\hat{\theta}) =
  \arg\inf_{w,\theta} \frac{1}{n} \left[\sum_{i=1}^n g( y_i (w^T x_i - \theta) )
  + \lambda w^2\right],
  \label{eq:svm}
\end{equation}
where
\begin{equation}
g(z) =
\left\{
\begin{array}{ll}
1-z & \mbox{if $z\leq 1,$} \\
0 & \mbox{if $z > 1.$}
\end{array}
\right.
\end{equation}

The support vector machine method (\ref{eq:svm_quad}) has been applied
to text categorization \citep{Joa98,DPHS98}. It achieves a performance
comparable to the much more complex boosted decision trees of \citet{Weiss99}.

It is interesting to compare the least squares approach and the support
vector machine approach.
In the least squares formulation, the loss function $(z-1)^2$ implies that
we try to find a weight $w$ such that $w^T x \approx 1$ for an in-class
data point $x$, and $w^T x \approx -1$ for an out-of-class data point $x$.
Although this means that the formulation attempts to separate
the in-class data from the out-of-class data, it also penalizes a
well behaved data point $x$ such that $w^T x y >1$.
The support vector machine approach remedies this
problem by choosing a loss function that does not penalize a well-behaved
data point such that $w^T x y >1$.


Although a support vector machine in (\ref{eq:svm_quad}) is conceptually
appealing for classification problems, it performs poorly for CF.
This is rather surprising since SVMs perform very well in text categorization.
To understand what causes this failure, we have studied histograms of the
linear prediction score $w^T x$ from support vector machines
(See Section~\ref{sec:experiments}). This study revealed that the problem
is caused by  extremely unbalanced class distributions typically observed in CF.
That is, for many items, there are usually only a small percentage
of buyers who will be interested in each of them. From the classification point of view,
we can achieve a high confidence by saying that no buyer is interested in the
item, although this forfeits the purpose of using a recommender system.

We observe that a major reason that contributes to support vector machines'
susceptibility to unbalanced class distribution is the shape of $g(z)$ in
(\ref{eq:svm}). Specifically it does not penalize an error ($w^T x y <0$)
significant enough. Consider the initial point at
$\tilde{w}=0$ and $\tilde{\theta}=-1$.
In this case, all in-class data contribute $g(y_i(w^T x_i - \theta))=0$
in (\ref{eq:svm}), and all out-of-class data contribute
$g(y_i(w^T x_i - \theta))=2$ in (\ref{eq:svm}).
We now consider a change $\Delta w$ of $\tilde{w}$ that decreases the
objective function in (\ref{eq:svm}). For an in-class data point
$(x_i,y_i)$ such that $\Delta w^T x_i <0$,
$g(y_i(w^T x_i - \theta))$ is increased by $-\Delta w^T x_i$.
For an out-of-class data point $(x_i,y_i)$, $g(y_i(w^T x_i - \theta))$ is
decreased when $\Delta w^T x_i <0$, but no more than $\Delta w^T x_i$
(which may also be an increase).
Clearly if there are significantly more in-class data than out-of-class data,
it is difficult to find $\Delta w$ that can result in a net effect of decreasing
the right hand side of (\ref{eq:svm}).

The above analysis shows why the standard support vector machine formulation
(\ref{eq:svm_quad}) is susceptible to unbalanced class distribution. The trouble
is clearly caused by the fact that $g(z)$ has a constant gradient for $z <1$.
This also suggests a simple remedy: we replace the loss term $g$ by a
smooth non-increasing function $h$ so that the gradient magnitude $|h'(z)|$
of $h(z)$ is
large when $z<0$ and small when $z>0$. In this paper, we consider the function
$h(z)=g^2(z)$. Since $h'(1)=0$, our above analysis suggests that at
$\tilde{w}=0$ and $\tilde{\theta}=-1$, a small change $\Delta w$ leads to an
increase of the objective function in the order of $O(\Delta w^2)$ for
an in-class data point, and a potential decrease of the objective function in
the order of $O(\Delta w)$ for an out-of-class data point.
Therefore with the modified loss function $h(z)$, unbalanced class distribution
causes significantly fewer problems:
\begin{equation}
  \hat{w} = \arg\min_{w} \left[\frac{1}{n} \sum_{i=1}^n h(w^T x_i y_i) + 
    \lambda w^2\right],
  \label{eq:mls_reg}
\end{equation}
where
\begin{equation}
h(z) =
\left\{
\begin{array}{ll}
(1-z)^2 & \mbox{if $z\leq 1,$} \\
0 & \mbox{if $z > 1.$}
\end{array}
\right.
\end{equation}

Another interpretation of (\ref{eq:mls_reg}) is that it is a
modification of the least squares algorithm so that the method does not penalize a
data point with $w^T x y >1$.
In this sense, it is a mixture of the least squares method and a standard
SVM.\footnote{
It also belongs to an extended family of support vector machines, although such
an extension is recommended against by Vapnik.
However, in our case, it performs significantly better
than a standard SVM for CF problems due to its ability to handle unbalanced
class distribution.
}
We thus call it modified least squares.
In addition, it has a loss function that is
more smooth than that of an SVM, which makes it numerically simpler to solve.
A direct numerical optimization of (\ref{eq:mls_reg}) can be performed
relatively efficiently. Similar to (\ref{eq:llsf_reg}),
Algorithm~\ref{alg:PLS} in Appendix~\ref{apx:primal}
solves (\ref{eq:mls_reg}).

We would like to mention that there are other ways to deal with unbalanced
data. One well-known idea is to over-sample the minority class.\footnote{
Equivalently, one may also consider using different loss functions for in-class
and out-of-class data so as to penalize errors for the minority class more.}
Although this
method works well for binary classification problems, we find for CF, it is
difficult to perform this trick in a consistent way for all items without
affecting the relative ranking.
We should note that the class distribution itself does contain
important information since even the most naive ``Popular'' method
performs reasonably well despite its simplicity
(see Section~\ref{sec:experiments}). In this sense, the scheme of using
(\ref{eq:mls_reg}) is a much better way to handle the unbalanced problem.

To be consistent with other methods described in this paper,
we set $\theta=0$ in (\ref{eq:svm}).
As we have mentioned earlier, the effect of $\theta$ will be compensated by
appending a constant 1 to each data vector. This modification, given below,
does not affect the classification performance:
\begin{equation}
  \hat{w} =
  \arg\inf_{w} \left[\frac{1}{n} \sum_{i=1}^n g( w^T x_i y_i)
  + \lambda w^2\right].
  \label{eq:mod_svm}
\end{equation}

Due to the non-smoothness of $g$ (it has a discontinuous derivative),
it is numerically difficult to solve the optimization problem
(\ref{eq:mod_svm}) directly in the primal form.
We shall introduce an equivalent dual-form
of (\ref{eq:mod_svm}) and develop an algorithm to solve the resulting system in
Appendix~\ref{apx:dual}. Similarly, a dual form can also be obtained for
(\ref{eq:mls_reg}), which can be solved by
Algorithm~\ref{alg:DLS} in Appendix~\ref{apx:dual}.


\subsubsection{Naive Bayes Generative Model}

Another very popular linear classification algorithm is naive Bayes.
It has been applied to text categorization with reasonable performance,
although the performance is significantly worse than that achieved by
regularized linear classifiers such as support
vector machines. Still it is interesting to apply this method for CF due
to its simplicity. It is also very suitable for online updating, which
could be important in practice.
Our experiments indicate that the naive Bayes model outperforms
the decision tree model, but is poorer than the regularized
linear classification methods.

In this paper, we adopt the multinomial model described by \citet{MN98}.
Instead of taking $\theta=0$ by embedding the original data vectors into
a space of one larger dimension, in the naive Bayes approach, we
explicitly compute $\theta$ without doing the data transformation.
The linear weight $w$ is given by $w = w^1 - w^{-1}$, and
$\theta = \theta^1 - \theta^{-1}$.
Denote by $x_{i,j}$ the $j$-th component of the data vector $x_{i}$,
then the $j$-th component $w_{j}^c$ of $w^c$ ($c= \pm 1$) is given by
\[
w_j^c = \log \frac{\lambda + \sum_{i:y_i = c} x_{i,j}}
{\lambda d + \sum_{j=1}^d  \sum_{i:y_i = c} x_{i,j}},
\]
and $\theta^c$ ($c= \pm 1$) is given by
$\theta^c = -\log \frac{ |\{ i: y_i = c\}|}{n}$.

The parameter $\lambda>0$ in the above formulation is a smoothing
(regularization) parameter.
\citet{MN98} fixed $\lambda$ to be $1$, which corresponds to the Laplacian
smoothing. As we shall see from Section~\ref{sec:experiments}, the choice
of regularization $\lambda$ can significantly affect the performance.

There is another difference between the naive Bayes method described above and
the standard naive Bayes method used in text categorization
\citep{MN98}. A standard naive Bayes method would have
computed $w_j$ as $w_j^1$ and $\theta$ as $\theta^1$. That is, it only uses
the in-class data. Although this approach is reasonable in text categorization
(in that the quantity $\exp((w^{1})^{T} x_i - \theta^1)$ is proportional to the
probability of the data belonging to the category), it completely fails in CF.
The reason is that text categories tend to be more mutually exclusive, while
CF recommendations are not---after all, we want to find associations among
different items. Note that  $\exp(w^{1T} x_i - \theta^1)$ does not correspond
to the likelihood that we buy a particular item, which causes the failure
of the standard naive Bayes. The reason for using
$w=w^1-w^{-1}$ and $\theta=\theta^1-\theta^{-1}$ is now clear:
$1/(1+\exp(-(w^T x_i - \theta)))$ is the conditional probability of buying the
current item under the observation $x_i$.

Due to the above mentioned differences, we call the naive Bayes method
used in our study modified naive Bayes.

\section{Experiments}
\label{sec:experiments}
The true value of a recommender system can only be measured
by controlled experiments with actual users.  Such an experiment
could measure the improvement achieved by a specific
recommendation algorithm when compared to, say, recommending
the most popular item. Experiments with historical data have
been used to estimate the value of recommender algorithms
in the absence of controlled live experiments \citep{breese98,sarwar00}.
In this paper we will follow experimental procedures similar to those
introduced by \citet{breese98}.

\subsection{Data Sets}

\begin{table*}[htb]
\begin{center}
\begin{tabular}{||c|c|c|c||} \hline
Characteristics & \multicolumn{3}{|c|}{Dataset} \\ \cline{2-4}
& \textit{msweb} & \textit{pageweb} & \textit{wine}  \\ \hline
Training cases & 32711 & 9195 & 13103  \\ \hline
Total test cases & 5000 & 1804 & 2610  \\ \hline
Test cases with at least 2 items (\textit{All But 1}) & 3453 & 1243 & 1770  \\ \hline
Test cases with at least 3 items (\textit{Given 2})& 2213 &  932 & 1280  \\ \hline
Test cases with at least 6 items (\textit{Given 5})&  657 &  455 &  624  \\ \hline
Test cases with at least 11 items(\textit{Given 10}) &  102 &  168 &  268  \\ \hline
Total items & 294 & 5781 & 663  \\ \hline
Mean items per case & 3.02 & 4.36 & 4.60  \\
in training set & & & \\ \hline
\end{tabular}
\end{center}
\caption{Description of the data sets}
\label{dataset-description}
\end{table*}

Characteristics of the data sets used in our experiments are
given in Table~\ref{dataset-description}.
The first dataset \textit{msweb} was introduced by
\citet{breese98} and added to the UCI repository under the name
\textit{anonymous-msweb}.
As described by \citet{breese98}, this dataset contains
for each user the web page groups (called vroots) that were visited in
a fixed time period.
The total number of items is relatively small (around 300)
for this dataset and this can be attributed to
the fact that an item refers to a group of web pages.

The second dataset \textit{pageweb} also captures visits by
users to a different web site but at the individual page level
(with about 6000 total items).
Intuitively, one might expect the task of recommending specific
pages to be more difficult than that of recommending page groups.
But the other factor to be considered is that we also have
more fine-grained information at the individual page level
about user preferences that can be used by the models.
This dataset will be useful in evaluating how the various
algorithms handle recommending from a large number of items.

The third dataset \textit{wine} represents wine purchases
made by customers of a leading supermarket chain store
within a specified period.
The dataset captures for each customer the wines purchased
in this period as a binary value (purchased versus not purchased).

We have chosen to use a binary representation of the item/page
variables in all the experiments.
An alternative representation would be use more information like
the number of visits to a web page or the time spent viewing
a web page or the quantity of wine purchased.

\subsection{Experimental Setup}

Following the experimental setup introduced by \citet{breese98}, the
datasets are split into two disjoint sets (training and test)
of users.
The entire set of visits (or purchases) for users in the training set
is available for the model building process.
The known visits (purchases) for users
in the test set are split into two disjoint sets: given and hidden.
The given set is used by the recommender methods to rank all the
remaining items in the order of predicted preference to the user.
This ranked list is evaluated by using the hidden set as the reference
indicating what should have been predicted.

The evaluation metric R, proposed by \citet{breese98}, is based
on the assumption that each successive item in a list is less
likely to be interesting to the user
with an exponential decay.
This metric uses a
parameter, $\alpha$, which is the position in the ranked list
which has a 50-50 chance of being considered by the user.
Following \citet{breese98}, we will set $\alpha$ such that
the fifth position has a 50-50 chance of being considered.

The exponential decay in interest, which forms the basis for the $R$-metric,
is a plausible behavior model for consumers.  However, this metric
is not easy to interpret.
Also, the
number of allowed recommendations can also be constrained
by the environment.  For example, the form factor of a hand-held
device might restrict the number of recommendations on it to a small number.
Hence, we will also report results using a simpler metric which measures
the fraction of users for whom at least one valid recommendation
(according to the hidden set as reference)
was given in the top $K$ items of the ranked list.
In particular, we will report this metric for $K$ values 1, 3 and 10.

The split of the test set data into the given and hidden sets
is done as suggested by \citet{breese98}.
Three of these splits are denoted as \textit{Given 2}, \textit{Given 5},
and \textit{Given 10}.  These have 2, 5, and 10 items
chosen into the given sets, respectively.
The fourth split is denoted as \textit{All But 1} because one
item for each test user is randomly chosen to be hidden in this scenario.
These scenarios can be used to assess how each recommender system handles
different amounts of information
being known and to be hidden (predicted) for each test user.
Table~\ref{dataset-description} provides for each dataset the
number of test users that are included in each of these scenarios.

Each scenario will be run five times with different random choices
for the split between given and hidden subsets in the test data.
Mean values and standard deviations are computed over these
five experiments.
We have adopted this approach to be compatible with the prior
literature with regard to the training/test splits.
A more traditional approach would have been to use $n$-fold
cross validation where both training and test sets are different
in the experiments.
However, given the compatibility constraints,
performing multiple experiments with the given/hidden splits
provides some information on the experimental variability.

\subsection{Results}

The results achieved for all the four scenarios
(\textit{Given 2, Given 5, Given 10, All But 1})
are given in Tables~\ref{msweb-results}, \ref{psgweb-results}
and \ref{wine-results} for the datasets
\textit{msweb}, \textit{pageweb} and \textit{wine},
respectively.
The format in which these results are provided in these tables for each
combination of algorithm and scenario is
explained in
Table~\ref{table-description}.
The mean and the standard deviation for the $R$-metric
(expressed as percentage) are given on top.
The three numbers below indicate the percentage of
test users that had at least one successful recommendation
in the top 1, 3 and 10 positions of the ranked list.
The least squares and the modified least squares (primal and dual)
linear models were generated using 25 iterations
with the regularization parameter $\lambda$ set at 0.001.
The naive Bayes model used the value of 1 for the smoothing
parameter $\lambda$ \citep[as in][]{MN98}.


\begin{table}[bht]
\begin{center}
\begin{tabular}{||c|c|c||} \hline \hline
\multicolumn{3}{||c||}{$R$-metric $\ \pm \ $ std. dev.} \\ \hline
Success & Success & Success \\
within top  & within top & within top \\
1   & 3  & 10  \\ \hline \hline
\end{tabular}
\end{center}
\caption{Explanation of the entries in the Tables~\ref{msweb-results}, \ref{psgweb-results} and \ref{wine-results}}
\label{table-description}
\end{table}


The baseline approach of recommending
popular items does significantly poorer when compared
to the other algorithms on datasets with more items.
The decision tree model
also exhibits this pattern of not performing as
well on datasets with more items.

As mentioned earlier, one advantage of model-based
methods is that the model building is done off-line.
The model building times for the dataset \textit{msweb}
were around 500 seconds for linear least squares
and modified linear (primal) models and around 200 seconds
for modified linear (dual) model.
These times were recorded using our prototype implementation
on an IBM RISC System/6000, Model 43P-140 using
a 332 MHz PowerPC 604e processor.
These model build times can be compared to those
reported by \citet{heckerman00} for Bayesian networks (144.65 seconds)
and for dependency networks (98.31 seconds)
on a 300 MHz Pentium system.
%VSI
Applying our linear models to compute the scores is comparable in
simplicity to using models like decision trees and rule systems.
Hence, recommendations can be generated quickly in our system using these
computed scores.
%VSI


\begin{table*}[htb]
\begin{center}
\begin{tabular}{||c||c|c|c||c|c|c||c|c|c||c|c|c||} \hline \hline
algorithm &
\multicolumn{3}{c||}{Given2} &
\multicolumn{3}{c||}{Given5} &
\multicolumn{3}{c||}{Given10} &
\multicolumn{3}{c||}{AllBut1} \\ \hline \hline
Popular &   \multicolumn{3}{c||}{46.5$ \ \pm \ 0.2$} & \multicolumn{3}{c||}{43.7$ \ \pm \ 0.5$} &
            \multicolumn{3}{c||}{41.6$ \ \pm \ 2.0$} & \multicolumn{3}{c||}{46.5$ \ \pm \ 0.6$}  \\ \cline{2-13}
             & 33.9 & 55.8 & 82.0 & 29.0 & 55.6 & 80.6 &
               32.2 & 57.1 & 79.8 & 22.7 & 38.3 & 63.8 \\ \hline \hline
CR+     &   \multicolumn{3}{c||}{56.7$ \ \pm \ 0.1$} & \multicolumn{3}{c||}{54.2$ \ \pm \ 0.6$} &
            \multicolumn{3}{c||}{51.5$ \ \pm \ 1.9$} & \multicolumn{3}{c||}{60.8$ \ \pm \ 0.6$}  \\ \cline{2-13}
             & 45.0 & 70.5 & 88.7 & 39.9 & 68.3 & 88.1 &
               43.7 & 67.7 & 87.8 & 34.6 & 54.8 & 76.2 \\ \hline \hline
Decision &  \multicolumn{3}{c||}{53.4$ \ \pm \ 0.3$} & \multicolumn{3}{c||}{54.3$ \ \pm \ 0.7$} &
            \multicolumn{3}{c||}{53.0$ \ \pm \ 1.0$} & \multicolumn{3}{c||}{62.3$ \ \pm \ 0.5$}  \\ \cline{2-13}
Tree         & 46.6 & 71.3 & 87.4 & 48.0 & 73.9 & 88.5 &
               51.6 & 72.9 & 87.8 & 38.4 & 58.4 & 74.9 \\ \hline \hline
Least   &   \multicolumn{3}{c||}{55.7$ \ \pm \ 0.3$} & \multicolumn{3}{c||}{57.5$ \ \pm \ 0.7$} &
            \multicolumn{3}{c||}{57.0$ \ \pm \ 1.5$} & \multicolumn{3}{c||}{64.1$ \ \pm \ 0.5$}  \\ \cline{2-13}
Squares      & 46.9 & 72.4 & 89.6 & 49.9 & 75.0 & 90.9 &
               55.5 & 77.1 & 91.0 & 38.5 & 58.8 & 79.2 \\ \hline \hline
Mod LS  &   \multicolumn{3}{c||}{55.6$ \ \pm \ 0.3$} & \multicolumn{3}{c||}{57.7$ \ \pm \ 0.8$} &
            \multicolumn{3}{c||}{56.9$ \ \pm \ 1.4$} & \multicolumn{3}{c||}{64.4$ \ \pm \ 0.5$}  \\ \cline{2-13}
Primal       & 46.9 & 72.6 & 89.8 & 50.3 & 75.2 & 91.0 &
               56.1 & 76.9 & 91.8 & 38.9 & 59.1 & 79.6 \\ \hline \hline
Mod LS  &   \multicolumn{3}{c||}{55.2$ \ \pm \ 0.2$} & \multicolumn{3}{c||}{57.5$ \ \pm \ 0.9$} &
            \multicolumn{3}{c||}{56.7$ \ \pm \ 1.4$} & \multicolumn{3}{c||}{64.4$ \ \pm \ 0.6$}  \\ \cline{2-13}
Dual        & 46.5 & 72.9 & 89.7 & 50.5 & 75.6 & 90.5 &
              57.1 & 77.3 & 91.8 & 39.0 & 59.0 & 79.4 \\ \hline \hline
Modified &  \multicolumn{3}{c||}{57.8$ \ \pm \ 0.3$} & \multicolumn{3}{c||}{57.0$ \ \pm \ 0.9$} &
            \multicolumn{3}{c||}{52.1$ \ \pm \ 0.8$} & \multicolumn{3}{c||}{62.5$ \ \pm \ 0.5$}  \\ \cline{2-13}
Naive Bayes  & 48.4 & 72.0 & 89.4 & 48.7 & 71.8 & 89.5 &
               49.4 & 71.8 & 88.2 & 37.5 & 57.1 & 77.0 \\ \hline \hline
\end{tabular}
\end{center}
\caption{Results on dataset \textit{msweb}. For explanation of entries refer to Table~\ref{table-description}.}
\label{msweb-results}
\end{table*}



\begin{table*}[htb]
\begin{center}
\begin{tabular}{||c||c|c|c||c|c|c||c|c|c||c|c|c||} \hline \hline
algorithm &
\multicolumn{3}{c||}{Given2} &
\multicolumn{3}{c||}{Given5} &
\multicolumn{3}{c||}{Given10} &
\multicolumn{3}{c||}{AllBut1} \\ \hline \hline
Popular &   \multicolumn{3}{c||}{ 8.3$ \ \pm \ 0.3$} & \multicolumn{3}{c||}{ 7.0$ \ \pm \ 0.3$} &
            \multicolumn{3}{c||}{ 6.2$ \ \pm \ 0.7$} & \multicolumn{3}{c||}{ 7.6$ \ \pm \ 0.5$}  \\ \cline{2-13}
             &  7.5 & 14.5 & 29.5 &  6.0 & 12.4 & 27.9 &
                6.0 & 11.6 & 29.5 &  2.3 &  5.1 & 11.1 \\ \hline \hline
CR+     &   \multicolumn{3}{c||}{29.3$ \ \pm \ 0.8$} & \multicolumn{3}{c||}{31.9$ \ \pm \ 1.1$} &
            \multicolumn{3}{c||}{32.5$ \ \pm \ 1.4$} & \multicolumn{3}{c||}{33.3$ \ \pm \ 0.3$}  \\ \cline{2-13}
             & 28.8 & 47.6 & 67.2 & 30.2 & 50.1 & 68.7 &
               33.5 & 55.7 & 74.2 & 15.2 & 27.5 & 44.9 \\ \hline \hline
Decision &  \multicolumn{3}{c||}{16.2$ \ \pm \ 0.1$} & \multicolumn{3}{c||}{19.9$ \ \pm \ 0.6$} &
            \multicolumn{3}{c||}{23.3$ \ \pm \ 1.2$} & \multicolumn{3}{c||}{22.7$ \ \pm \ 0.7$}  \\ \cline{2-13}
Tree         & 23.3 & 33.5 & 47.0 & 28.9 & 41.6 & 52.7 &
               36.9 & 50.7 & 63.2 & 13.5 & 20.2 & 28.4 \\ \hline \hline
Least   &   \multicolumn{3}{c||}{27.7$ \ \pm \ 0.3$} & \multicolumn{3}{c||}{32.5$ \ \pm \ 0.9$} &
            \multicolumn{3}{c||}{35.4$ \ \pm \ 0.8$} & \multicolumn{3}{c||}{34.9$ \ \pm \ 0.6$}  \\ \cline{2-13}
Squares      & 30.1 & 48.4 & 66.8 & 33.7 & 53.2 & 71.6 &
               41.7 & 62.9 & 77.6 & 17.1 & 29.8 & 46.6 \\ \hline \hline
Mod LS  &   \multicolumn{3}{c||}{28.3$ \ \pm \ 0.3$} & \multicolumn{3}{c||}{33.0$ \ \pm \ 0.9$} &
            \multicolumn{3}{c||}{35.7$ \ \pm \ 1.1$} & \multicolumn{3}{c||}{35.5$ \ \pm \ 0.5$}  \\ \cline{2-13}
Primal       & 30.3 & 48.7 & 67.7 & 33.8 & 53.3 & 73.4 &
               41.1 & 61.4 & 77.9 & 17.2 & 30.2 & 47.4 \\ \hline \hline
Mod LS  &   \multicolumn{3}{c||}{27.8$ \ \pm \ 0.3$} & \multicolumn{3}{c||}{32.9$\ \pm \ 0.9$} &
            \multicolumn{3}{c||}{35.5$ \ \pm \ 1.2$} & \multicolumn{3}{c||}{35.2$\ \pm \ 0.5$}  \\ \cline{2-13}
Dual        & 29.9 & 48.7 & 67.1 & 34.6 & 53.4 & 73.1 &
              41.8 & 62.3 & 77.6 & 17.4 & 30.0 & 46.7 \\ \hline \hline
Modified&   \multicolumn{3}{c||}{26.1$ \ \pm \ 0.4$} & \multicolumn{3}{c||}{29.1$ \ \pm \ 0.8$} &
            \multicolumn{3}{c||}{30.2$ \ \pm \ 1.3$} & \multicolumn{3}{c||}{27.7$ \ \pm \ 0.4$}  \\ \cline{2-13}
Naive Bayes  & 25.9 & 43.4 & 60.9 & 28.8 & 47.4 & 63.9 &
               32.5 & 52.5 & 69.9 & 12.3 & 22.3 & 37.6 \\ \hline \hline
\end{tabular}
\end{center}
\caption{Results on dataset \textit{pageweb}. For explanation of entries refer to Table~\ref{table-description}.}
\label{psgweb-results}
\end{table*}

\begin{table*}[h]
\begin{center}
\begin{tabular}{||c||c|c|c||c|c|c||c|c|c||c|c|c||} \hline \hline
algorithm &
\multicolumn{3}{c||}{Given2} &
\multicolumn{3}{c||}{Given5} &
\multicolumn{3}{c||}{Given10} &
\multicolumn{3}{c||}{AllBut1} \\ \hline \hline
Popular &   \multicolumn{3}{c||}{15.3$\ \pm \ 0.2$} & \multicolumn{3}{c||}{14.5$\ \pm \ 0.2$} &
            \multicolumn{3}{c||}{14.2$\ \pm \ 0.4$} & \multicolumn{3}{c||}{13.6$\ \pm \ 0.8$}  \\ \cline{2-13}
             & 10.1 & 21.7 & 47.6 &  6.8 & 20.4 & 50.0 &
                5.4 & 19.9 & 51.2 &  5.3 &  9.2 & 19.8 \\ \hline \hline
CR+     &   \multicolumn{3}{c||}{23.7$\ \pm \ 0.3$} & \multicolumn{3}{c||}{24.6$\ \pm \ 0.4$} &
            \multicolumn{3}{c||}{26.7$\ \pm \ 0.8$} & \multicolumn{3}{c||}{21.4$\ \pm \ 0.5$}  \\ \cline{2-13}
             & 20.3 & 37.3 & 60.3 & 21.6 & 38.7 & 61.6 &
               25.5 & 42.5 & 65.3 &  8.5 & 16.7 & 29.8 \\ \hline \hline
Decision &  \multicolumn{3}{c||}{16.9$\ \pm \ 0.4$} & \multicolumn{3}{c||}{18.6$\ \pm \ 0.4$} &
            \multicolumn{3}{c||}{22.1$\ \pm \ 0.5$} & \multicolumn{3}{c||}{17.9$\ \pm \ 0.4$}  \\ \cline{2-13}
Tree         & 16.8 & 27.8 & 49.0 & 18.9 & 33.9 & 52.8 &
               21.4 & 41.3 & 60.1 &  7.6 & 14.3 & 24.1 \\ \hline \hline
Least   &   \multicolumn{3}{c||}{21.1$\ \pm \ 0.3$} & \multicolumn{3}{c||}{24.7$\ \pm \ 0.4$} &
            \multicolumn{3}{c||}{28.1$\ \pm \ 0.3$} & \multicolumn{3}{c||}{22.2$\ \pm \ 0.5$}  \\ \cline{2-13}
Squares      & 19.4 & 35.8 & 57.1 & 23.5 & 43.2 & 63.8 &
               25.9 & 46.3 & 68.1 &  8.8 & 17.5 & 31.3 \\ \hline \hline
Mod LS  &   \multicolumn{3}{c||}{20.8$\ \pm \ 0.2$} & \multicolumn{3}{c||}{24.4$\ \pm \ 0.4$} &
            \multicolumn{3}{c||}{27.8$\ \pm \ 0.3$} & \multicolumn{3}{c||}{22.3$\ \pm \ 0.4$}  \\ \cline{2-13}
Primal       & 19.2 & 35.7 & 56.9 & 23.9 & 42.8 & 63.6 &
               25.7 & 46.9 & 67.2 &  8.7 & 17.5 & 31.2 \\ \hline \hline
Mod LS  &   \multicolumn{3}{c||}{19.7$\ \pm \ 0.2$} & \multicolumn{3}{c||}{23.4$\ \pm \ 0.4$} &
            \multicolumn{3}{c||}{27.1$\ \pm \ 0.5$} & \multicolumn{3}{c||}{21.8$\ \pm \ 0.4$}  \\ \cline{2-13}
Dual        & 18.0 & 34.0 & 55.6 & 23.1 & 41.8 & 62.9 &
              26.6 & 46.2 & 66.9 &  8.6 & 17.3 & 30.2 \\ \hline \hline
Modified &  \multicolumn{3}{c||}{24.0$\ \pm \ 0.3$} & \multicolumn{3}{c||}{26.7$\ \pm \ 0.5$} &
            \multicolumn{3}{c||}{29.0$\ \pm \ 0.6$} & \multicolumn{3}{c||}{22.0$\ \pm \ 0.4$}  \\ \cline{2-13}
Naive Bayes  & 20.1 & 37.6 & 59.9 & 23.7 & 42.4 & 65.1 &
               25.5 & 45.8 & 68.2 &  8.3 & 16.4 & 31.4 \\ \hline \hline
\end{tabular}
\end{center}
\caption{Results on dataset \textit{wine}. For explanation of entries refer to Table~\ref{table-description}.}
\label{wine-results}
\end{table*}

The accuracy achieved on the public dataset \textit{msweb} cannot
be directly compared with the results of \citet{breese98} and
\citet{heckerman00} because of random choices made in the
given/hidden sets.
The results in Tables~\ref{msweb-results}, \ref{psgweb-results}
and \ref{wine-results} suggest that linear least squares
and the primal and dual forms of the modified version
fare well in comparison with our implementation
of CR+. For example, the linear (dual) model is more
accurate than CR+ in 11 out of the 12 experimental setups
(3 datasets with 4 scenarios in each)
using the success in the top 3 metric.
If we use the $R$-metric the linear (dual) model beats
CR+ in 8 out of the 12 experimental setups.


The impact of the regularization parameter $\lambda$
on the accuracy of three of the models is
shown in Figures~\ref{llsq-lambda}, ~\ref{mlsq-lambda} and ~\ref{nbayes-lambda}.
The choice of $\lambda$ makes a non-negligible
difference for all algorithms.
The value for this parameter could be chosen
using cross-validation experiments with the training data, though this
was not done in our study.
The figures also suggest that in practice, one may choose a fixed $\lambda$
with reasonable performance across a number of datasets, without any
cross-validation $\lambda$ selection.
We would like to mention that for all algorithms, the value
of $\lambda$ should be same for every potential recommendation item.
Otherwise, the computed scores $w^T x$ will not be comparable for different
items. A side effect is that we only have a single $\lambda$
to determine for each algorithm. Therefore a cross-validation procedure can
be used to determine this value stably.

\begin{figure}[htb]
\setlength{\unitlength}{1.0in}
\begin{center}
\begin{picture}(4.6,2.1)
\put(0,0.2){\psfig{figure=./msweb.llsq.lambda.eps,height=1.75in,width=1.5in}}
\put(0.5,0){\textit{msweb}}
\put(1.6,0.2){\psfig{figure=./psgweb.llsq.lambda.eps,height=1.75in,width=1.5in}}
\put(2.2,0){\textit{pageweb}}
\put(3.2,0.2){\psfig{figure=./wine.llsq.lambda.eps,height=1.75in,width=1.5in}}
\put(3.8,0){\textit{wine}}
\end{picture}
\end{center}
\caption{Linear least squares classifier accuracy vs. regularization parameter $\lambda$}
\label{llsq-lambda}
\end{figure}

\begin{figure}[htb]
\setlength{\unitlength}{1.0in}
\begin{center}
\begin{picture}(4.6,2.1)
\put(0,0.2){\psfig{figure=./msweb.linear.lambda.eps,height=1.75in,width=1.5in}}
\put(0.5,0){\textit{msweb}}
\put(1.6,0.2){\psfig{figure=./psgweb.linear.lambda.eps,height=1.75in,width=1.5in}}
\put(2.2,0){\textit{pageweb}}
\put(3.2,0.2){\psfig{figure=./wine.linear.lambda.eps,height=1.75in,width=1.5in}}
\put(3.8,0){\textit{wine}}
\end{picture}
\end{center}
\caption{Modified linear least squares (primal) classifier accuracy vs. regularization parameter $\lambda$}
\label{mlsq-lambda}
\end{figure}


\begin{figure}[htb]
\setlength{\unitlength}{1.0in}
\begin{center}
\begin{picture}(4.6,2.1)
\put(0,0.2){\psfig{figure=./msweb.nbayes.lambda.eps,height=1.75in,width=1.5in}}
\put(0.5,0){\textit{msweb}}
\put(1.6,0.2){\psfig{figure=./psgweb.nbayes.lambda.eps,height=1.75in,width=1.5in}}
\put(2.2,0){\textit{pageweb}}
\put(3.2,0.2){\psfig{figure=./wine.nbayes.lambda.eps,height=1.75in,width=1.5in}}
\put(3.8,0){\textit{wine}}
\end{picture}
\end{center}
\caption{Modified naive Bayes classifier accuracy vs. regularization parameter $\lambda$}
\label{nbayes-lambda}
\end{figure}

\subsection{Impact Of Loss Function}

To study the impact of the loss functions we will consider
the classifier performance for one particular item in the {\em msweb}
dataset.
The chosen item has an in-class probability of 0.0205.
The impact of loss functions on the classifier for this item
is shown in Figure~\ref{fig:loss}. Each plot
is a histogram of the projected data of $w^T x - \theta$, either for in-class,
or for out-of-class, or for combined in-class and out-of-class data.
The top row employs formulation (\ref{eq:mls_reg})
with loss function $h(z)$. The middle row employs formulation (\ref{eq:mod_svm})
with loss function $g(z)$. The bottom row employs a balancing technique in
(\ref{eq:mod_svm}), where we duplicate each in-class data point five times, which
gives an effective $10\%$ in-class population.

Clearly, from the results, we note that without balancing
(i.e., over sampling the minority in-class data), the SVM formulation
(\ref{eq:mod_svm}) performs very poorly since it does not separate the in-class
data from the out-of-class data at all. However, with balancing, it correctly
classifies some in-class data, but also misclassified some out-of-class data.
In this experiment, we over-sample the in-class data five times. However,
it is not clearly what is the best trade-off. We can also see that the modified
least squares formulation (\ref{eq:mls_reg}) manages to partially separate
in-class data from
out-of-class data even without balancing. Although the binary classification
error is still poor since the majority of in-class data are centered around
$w^T x - \theta \approx -0.5$, the resulting partial class separation is sufficient
to yield useful ranking information when we compare different items.
Table~\ref{msweb-svm-results} compares the performance of the SVM
and the modified least squares (dual) formulations on the
{\em msweb} dataset.



\begin{table*}[bht]
\begin{center}
\begin{tabular}{||c||c|c|c||c|c|c||c|c|c||c|c|c||} \hline \hline
algorithm &
\multicolumn{3}{c||}{Given2} &
\multicolumn{3}{c||}{Given5} &
\multicolumn{3}{c||}{Given10} &
\multicolumn{3}{c||}{AllBut1} \\ \hline \hline
Mod LS  &   \multicolumn{3}{c||}{55.2$ \ \pm \ 0.2$} & \multicolumn{3}{c||}{57.5$ \ \pm \ 0.9$} &
            \multicolumn{3}{c||}{56.7$ \ \pm \ 1.4$} & \multicolumn{3}{c||}{64.4$ \ \pm \ 0.6$}  \\ \cline{2-13}
Dual        & 46.5 & 72.9 & 89.7 & 50.5 & 75.6 & 90.5 &
              57.1 & 77.3 & 91.8 & 39.0 & 59.0 & 79.4 \\ \hline \hline
SVM      &  \multicolumn{3}{c||}{33.6$ \ \pm \ 0.4$} & \multicolumn{3}{c||}{42.1$ \ \pm \ 0.9$} &
            \multicolumn{3}{c||}{43.6$ \ \pm \ 0.8$} & \multicolumn{3}{c||}{46.2$ \ \pm \ 0.3$}  \\ \cline{2-13}
             & 41.3 & 53.9 & 66.0 & 44.6 & 65.0 & 76.8 &
               48.8 & 66.3 & 81.0 & 32.7 & 42.5 & 53.7 \\ \hline \hline
\end{tabular}
\end{center}
\caption{Comparing modified least squares (dual) and SVM on dataset \textit{msweb}. For explanation of entries refer to Table~\ref{table-description}.}
\label{msweb-svm-results}
\end{table*}

In addition, the histogram plots can also be used to partially explain why in
this particular application, the standard least squares  method does as well
as the more complicated modified least squares method.
We simply notice that histograms of projections resulted from (\ref{eq:mls_reg})
are concentrated in the interval $[-1,1]$. Consequently there is virtually no
difference between the standard least squares loss and the modified least squares
loss. However, for other applications such as text categorization, the data
projection is typically not contained in $[-1,1]$.
Consequently, the modified least squares
method performs better than the standard least squares method in those applications.

\begin{figure}[phtb]
  \begin{center}
    \setlength{\unitlength}{1.0in}
    \begin{picture}(6.2,5.7)
      \put(0,3.9){\psfig{file=L2in.eps,width=2in}}
      \put(0.4,3.8){loss=$h(z)$ (in-class)}
      \put(2.1,3.9){\psfig{file=L2out.eps,width=2in}}
      \put(2.4,3.8){loss=$h(z)$ (out-of-class)}
      \put(4.2,3.9){\psfig{file=L2comb.eps,width=2in}}
      \put(4.6,3.8){loss=$h(z)$ (combined)}

      \put(0,2.0){\psfig{file=L1s1in.eps,width=2in}}
      \put(0.3,1.9){loss=$g(z)$ (in-class)}
      \put(2.1,2.0){\psfig{file=L1s1out.eps,width=2in}}
      \put(2.3,1.9){loss=$g(z)$ (out-of-class)}
      \put(4.2,2.0){\psfig{file=L1s1comb.eps,width=2in}}
      \put(4.5,1.9){loss=$g(z)$ (combined)}

      \put(0,0.1){\psfig{file=L1s5in.eps,width=2in}}
      \put(0.3,0){balanced $g(z)$  (in-class)}
      \put(2.1,0.1){\psfig{file=L1s5out.eps,width=2in}}
      \put(2.3,0){balanced $g(z)$ (out-of-class)}
      \put(4.2,0.1){\psfig{file=L1s5comb.eps,width=2in}}
      \put(4.5,0){balanced $g(z)$ (combined)}
    \end{picture}
  \end{center}
  \caption{Impact of different loss functions ($2\%$ in-class population)}
  \label{fig:loss}
\end{figure}


\section{Conclusion}

This paper presents a model-based approach to recommender systems
using linear classification models.
We have explored
various linear formulations and the corresponding
algorithms for building the models.
Experiments are performed with three datasets
and recommendation accuracies compared using two
different metrics.
The experiments indicate that
the linear models are more accurate than
a memory-based
collaborative filtering approach reported earlier.
This improved accuracy in combination with
the better computational characteristics
makes these linear models very attractive
for this application.

\acks{We would like to thank Murray Campbell, Richard Lawrence and George Almasi
for their help during this work.}

\appendix
\section{Primal Algorithms}
\label{apx:primal}

Algorithms considered here are modified from those of
\citet{ZhaOle01} for text categorization. We include them here for completeness.
We consider a more general formulation
\begin{equation}
  \hat{w} = \arg\inf_{w} \left[\frac{1}{n} \sum_{i=1}^n f(w^T x_i y_i) + 
    \lambda w^2\right],
  \label{eq:primal}
\end{equation}
where $f$ is a relatively smooth function which has a continuous first order
derivative and a non-negative piecewise continuous second order derivative.
In this case, the formulation in (\ref{eq:primal}) is convex, thus
it has a unique local minimum which is also the global minimum.
Methods investigated
in this section are based on the following generic relaxation algorithm
\citep[see][]{GV96}:

\begin{minipage}{6in}
\begin{algorithm}[(Primal Gauss-Seidel)]
  let $w=0$ and $r_i= w^T x_i$ \\
  {\bf for} $k=1, 2 , \ldots $ \+ \\
  {\bf for} $j=1,\ldots, d$ \+ \\
  find $\Delta w_j$ by approximately minimizing \+\+ \\
  $\frac{1}{n}\sum_i f(r_i+\Delta w_j x_{ij} y_i) + \lambda (w_j+\Delta w_j)^2
  \quad \quad (*)$
  \-\-  \\
  update $r$: $r_i = r_i + \Delta w_j x_{ij} y_i \qquad \qquad (i=1,\ldots,n)$ \\
  update $w$: $w_j = w_j + \Delta w_j$ \- \\
  {\bf end} \- \\
  {\bf end}
  \label{alg:GS}
\end{algorithm}
\end{minipage}

The above relaxation method  is often called
the Gauss-Seidel procedure in numerical optimization. The algorithm
cycles through components of $w$, and optimizes one component at a time
(while keeping others fixed). For our problems, this method converges under 
quite moderate
conditions as long as we reduce the objective function value in $(*)$ at
each step.

We can now apply the above procedure to the regularized linear least squares
fit formulation (\ref{eq:llsf_reg}), and obtain Algorithm~\ref{alg:CLS}
where $(*)$ is solved exactly.

\begin{minipage}{4.65in}
\begin{algorithm}[(Least Squares Primal)]
  let $w=0$ and $r_i=0$ \\
  {\bf for} $k=1, 2 , \ldots $ \+ \\
  {\bf for} $j=1,\ldots, d$ \+ \\
  $\Delta w_j =-\left(\sum_i (r_i-1)x_{ij} y_i + \lambda n w_j\right)/
  \left(\sum_i x_{ij}^2 + \lambda n\right)$ \\
  update $r$: $r_i = r_i + \Delta w_j x_{ij} y_i \qquad \qquad (i=1,\ldots,n)$ \\
  update $w$: $w_j = w_j + \Delta w_j$ \- \\
  {\bf end} \- \\
  {\bf end}
  \label{alg:CLS}
\end{algorithm}
\end{minipage}

To apply Algorithm~\ref{alg:GS} to (\ref{eq:mls_reg}), it is helpful to
further enhance the smoothness of $h$  by introducing a
continuation parameter $c \in [0,1]$, so that
$h(x)=h_0(x)$ and the smoothness of $h_c''(x)$ decreases as $c$ decreases:
\[
  h_c(x) =
  \begin{cases}
    (x-1)^2 & x \leq 1    \\
    c(x-1)^2 & x>1.
  \end{cases}
\]
Algorithm~\ref{alg:GS} should then be modified so that at each step $k$, a
different $c_k$ is chosen (so that $1=c_1 \geq c_2 \geq \cdots \geq c_K =0$), and
the function $f$ shall be replaced by $f_{c_k}$.
Note that this introduction of a continuation parameter is not required in order
for Algorithm~\ref{alg:GS} to converge. However, in our experience, this
simple modification accelerates the rate of convergence. In the following,
we compute a Newton update to approximately solve $(*)$. However, to make the
method more robust, we use a step size $\Delta w_j$ in $(*)$ that is
half of the computed Newton update.

\begin{minipage}{4.5in}
\begin{algorithm}[(Modified Least Squares Primal)]
   let $w=0$ and $r_i= 0$ \\
   pick a decreasing sequence of $1=c_1 \geq  c_2 \geq \cdots \geq c_K=0$ \\
  {\bf for} $k=1, 2 , \ldots, K$ \+ \\
  define function $C_k(r_i) = 1$ if $r_i \leq 1$ and
  $C_k(r_i)=c_k$ otherwise \\
  {\bf for} $j=1,\ldots, d$ \+ \\
  $\Delta w_j = - 0.5\left[\sum_{i} C_k(r_i) (r_i-1) x_{ij} y_i + 
    \lambda n w_j\right]/
  \left[\sum_{i} C_k(r_i) x_{ij}^2 + \lambda n\right]$ \\
  update $r$: $r_i = r_i + \Delta w_j x_{ij} y_i \qquad \qquad (i=1,\ldots,n)$ \\
  update $w$: $w_j = w_j + \Delta w_j$  \- \\
  {\bf end} \- \\
  {\bf end}
  \label{alg:PLS}
\end{algorithm}
\end{minipage}

\section{Dual Algorithms}
\label{apx:dual}

As we have mentioned, it is inappropriate to solve
(\ref{eq:mod_svm}) directly using the primal Gauss-Seidel method in
Appendix~\ref{apx:primal}, due to the non-smoothness of $h$.
We need to introduce a dual form of (\ref{eq:primal}) and use a
dual form of the Gauss-Seidel method to solve the resulting system.

To obtain a dual form of (\ref{eq:primal}), we consider an auxiliary
variable $\zeta_i$ for each data point $x_i$:
\[
  (\hat{w},\hat{\zeta}) = \arg\inf_{w} \sup_{\zeta}
  \left[\frac{1}{n} \sum_{i=1}^n [-k(\zeta_i) + \zeta_i w^T x_i y_i] + 
    \lambda w^2\right],
\]
where $k(\cdot)$ is the Legendre transform of $f(\cdot)$:
$k(v) = \sup_{u} (uv - f(u))$. It is well known that $k$ is convex.
By switching the order of $\inf_{w}$ and $\sup_{\zeta}$, which is valid
for the above minimax convex programming problem
%(a proof is given in Appendix~\ref{apx:duality}), we obtain
\citep[see][for proof]{ZhaOle01}, we obtain
\[
\hat{\zeta} = \arg\sup_{\zeta}
\left[\frac{1}{n} \sum_{i=1}^n [-k(\zeta_i) + \zeta_i w^T x_i y_i] + \lambda w^2
\right],
\]
where $w$ is minimized at $w=-\frac{1}{2\lambda n} \sum_i \zeta_i x_i y_i$.
Substituting into the above equation, we obtain
\begin{equation}
\hat{\zeta} = \arg\inf_{\zeta} \left[\sum_{i=1}^n k(\zeta_i) + \frac{1}{4\lambda n}
(\sum_{i=1}^n \zeta_i x_i y_i)^2\right],
\label{eq:dual}
\end{equation}
and
\[
\hat{w} = -\frac{1}{2\lambda n} \sum_i \hat{\zeta}_i x_i y_i.
\]

Similar to Algorithm~\ref{alg:GS} which solves the primal problem
(\ref{eq:primal}), the following generic
relaxation algorithm solves the dual problem (\ref{eq:dual}):

\begin{minipage}{6in}
\begin{algorithm}[(Dual Gauss-Seidel)]
  let $\zeta=0$ and $v_j= 0$ for $j=1,\ldots,d$\\
  {\bf for} $k=1, 2 , \ldots $ \+ \\
  {\bf for} $i=1,\ldots, n$ \+ \\
  find $\Delta \zeta_i$ by approximately minimizing \+\+ \\
  $k(\zeta_i+\Delta \zeta_i) + \frac{1}{4\lambda n}(2 \Delta \zeta_i v^Tx_i y_i
  +\Delta \zeta_i^2 x_i^2)  \quad \quad (**)$
  \-\-  \\
  update $v$: $v_j = v_j + \Delta \zeta_i x_{ij} y_i \qquad \qquad (j=1,\ldots,d)$ \\
  update $\zeta$: $\zeta_i = \zeta_i + \Delta \zeta_i$ \- \\
  {\bf end} \- \\
  {\bf end}  \\
  let $w=-\frac{1}{2\lambda n} v$.
  \label{alg:DGS}
\end{algorithm}
\end{minipage}

An important implementation issue for Algorithm~\ref{alg:DGS}
is that the data ordering can have a significant effect on the rate
of convergence (the feature ordering does not
appear to have a very noticeable effect on Algorithm~\ref{alg:DGS}).
More specifically, if we order the data points such that members
in each class are grouped together, then we may experience a very slow convergence.
On the other hand, the dual algorithm appears to work very well with a
random data ordering.

For the modified SVM formulation in (\ref{eq:mod_svm}), we obtain
\[
k(z) =
  \begin{cases}
    z & -1 \leq z \leq 0,    \\
    +\infty & \text{otherwise}.
  \end{cases}
\]
The $+\infty$ value is effectively a simple constraint on each dual variable:
$-1 \leq z \leq 0$. Algorithm~\ref{alg:DGS} can be applied to this formulation.
The optimization step $(**)$ in Algorithm~\ref{alg:DGS} can be solved exactly,
and this leads to the following method:

\begin{minipage}{4.5in}
\begin{algorithm}[(Modified SVM Dual)]
  let $\zeta=0$ and $v_j= 0$ for $j=1,\ldots,d$\\
  {\bf for} $k=1, 2 , \ldots $ \+ \\
  {\bf for} $i=1,\ldots, n$ \+ \\
  $\Delta \zeta_i =  \min\left(-\zeta_i,\max\left(-1-\zeta_i, -\frac{2\lambda n +
    v^T x_i y_i}{x_i^2}\right)\right)$ \\
  update $v$: $v_j = v_j + \Delta \zeta_i x_{ij} y_i \qquad \qquad (j=1,\ldots,d)$\\
  update $\zeta$: $\zeta_i = \zeta_i + \Delta \zeta_i$ \- \\
  {\bf end} \- \\
  {\bf end}  \\
  let $w=-\frac{1}{2\lambda n} v$.
  \label{alg:DSVM}
\end{algorithm}
\end{minipage}

For the modified least squares formulation (\ref{eq:mls_reg}),
$k(z)$ is defined only for $z \leq 0$ ($k(z)=+\infty$ for $z>0$):
$k(z)= z^2/4+z$. We thus obtain an instance of Algorithm~\ref{alg:DGS}
with $(**)$ solved exactly:

\begin{minipage}{4.5in}
\begin{algorithm}[(Modified Least Squares Dual)]
  let $\zeta=0$ and $v_j= 0$ for $j=1,\ldots,d$\\
  {\bf for} $k=1, 2 , \ldots $ \+ \\
  {\bf for} $i=1,\ldots, n$ \+ \\
  $\Delta \zeta_i =\min\left(-\zeta_i,
  -\frac{(2+\zeta_i)\lambda n+ v^T x_i y_i}{\lambda n + x_i^2}\right)$ \\
  update $v$: $v_j = v_j + \Delta \zeta_i x_{ij} y_i \qquad \qquad (j=1,\ldots,d)$\\
  update $\zeta$: $\zeta_i = \zeta_i + \Delta \zeta_i$ \- \\
  {\bf end} \- \\
  {\bf end}  \\
  let $w=-\frac{1}{2\lambda n} v$.
  \label{alg:DLS}
\end{algorithm}
\end{minipage}


%\section{Validity Of The Dual Formulation}
%\label{apx:duality}

%We would like to show that it is valid to interchange the order of
%$\inf_w$ and $\sup_{\zeta}$ in (\ref{eq:primal-dual}).
%Let
%\[
%L(w,\zeta)=\frac{1}{n} \sum_{i=1}^n [-k(\zeta_i) + \zeta_i (w^T x_i y_i)]
%  + \lambda w^2.
%\]
%Then we need to show that there exists $(\hat{w},\hat{\zeta})$ such that
%\begin{equation}
%L(\hat{w},\hat{\zeta})=\inf_{w} \sup_{\zeta} L(w,\zeta) = \sup_{\zeta} \inf_{w} L(w,\zeta).
%\label{eq:duality}
%\end{equation}

%It is well-known (for example, see \citep{Roc70}) that the duality gap $G$
%is non-negative, where
%\[
%G = \inf_{w} \sup_{\zeta} L(w,\zeta) - \sup_{\zeta} \inf_{w} L(w,\zeta).
%\]
%Furthermore, equation (\ref{eq:duality}) has a solution if $G = 0$.
%We need to find $(\hat{w},\hat{\zeta})$ such that
%\begin{equation}
%  L(\hat{w},\hat{\zeta}) =\inf_{w} L(w,\hat{\zeta}) = \sup_{\zeta} L(\hat{w},\zeta).
%\end{equation}
%In this case, $(\hat{w},\hat{\zeta})$ is called a saddle point.

%To construct a saddle point, we consider $\hat{w}$ that minimizes the
%primal problem, which satisfies the following estimation equation:
%\[
%\frac{1}{n} \sum_{i=1}^n f'(\hat{w}^T x_i y_i) x_i y_i
%+ 2 \lambda \hat{w}=0.
%\]

%Now, let $\hat{\zeta}_i= f' (\hat{w}^T x_i y_i)$, we obtain:
%\[
%\hat{w} = -\frac{1}{2 \lambda n} \sum_{i=1}^n \hat{\zeta}_i x_i y_i.
%\]
%This implies that $\hat{w}$ achieves the minimum of
%\[
%\frac{1}{n} w^T \sum_{i=1}^n \hat{\zeta}_i x_i + \lambda w^2.
%\]
%That is,
%\begin{equation}
%L(\hat{w},\hat{\zeta}) = \inf_{w} L(w,\hat{\zeta}).
%\label{eq:duality-1}
%\end{equation}

%Also by the definition of $\hat{\zeta}_i$ as the derivative of $f(\cdot)$ at
%$\hat{w}^T x_i y_i$, we know that $\hat{\zeta}_i$ achieves the maximum of
%\[
%\zeta_i w^T x_i y_i -k(\zeta_i).
%\]
%That is,
%\begin{equation}
%L(\hat{w},\hat{\zeta}) = \sup_{\zeta} L(\hat{w},\zeta).
%\label{eq:duality-2}
%\end{equation}

%Combining (\ref{eq:duality-1}) and (\ref{eq:duality-2}),
%we obtain (\ref{eq:duality}),
%which implies the interchangeability of the order of
%$\inf_w$ and $\sup_{\zeta}$ in (\ref{eq:primal-dual}).

\bibliography{./alg,./mining}


\end{document}
% LocalWords:  LLNCS DEM Vijay Iyengar Zhang NY CF CR msweb interpretability cf
% LocalWords:  hyperplane Regularized regularization regularized Vapnik SVMs LS
% LocalWords:  SVM nomial UCI vroots pageweb PowerPC Pentium Almasi std dev vs
% LocalWords:  AllBut sparsity unbalancing Kaelbling resnick shardanand breese
% LocalWords:  sarwar hofmann heckerman ungar Joa YC ZhaOle uci eachmovie VSI
% LocalWords:  iyengar WST DCC Quinlan DPHS HoKe CV Vap MN GV Roc
