function newd = addfeatures(data) % function addfeatures(data), % returns the data with 4 extra features: % x1*x1, x1*x2, x2*x2, and log(x1)*log(x2) newd = [data data(:,1).*data(:,1) data(:,2).*data(:,2) data(:,1).*data(:,2) ... log(data(:,1)).*log(data(:,2))]; %---------------------------------------------------------------------------- function [means, sigmas, priors] = fit_gaussians(input, target) % function [means sigmas priors] = fit_gaussians(input, target) % % For each target class fit a multi-dimensional gaussian. % % Assumes the targets are stored one column per class. When the i'th target % column is greater than 0 it identifies that row as being an element of % class i. % % Returns the means, the covariance matrices, and the priors. % The ith row of means is the mean of the i'th class % invsigmas is a 3D array. The third index ranges of the classes. % priors is a vector. numclasses = size(target, 2); for i = 1:numclasses select = target(:,i) > 0; size(select) data = input(select,:); means(i,:) = mean(data); sigmas(:,:,i) = cov(data) priors(i,1) = size(data, 1) / size(input, 1); end %---------------------------------------------------------------------------- function [means, sigmas, priors, ss] = fit_sphere(data, target) % function [means, sigmas, priors, ss] = fit_sphere(data, target) % returns the estimates Gaussians for each class (their mean vectors, % covariance matrices, and priors) with the constraint that the % covariance matrices must be spherical. The ss returned variable % is a list of the scaling of each covariance matrix from the identity % matrix nc = size(target,2); dim = size(data,2); for i= 1:nc, s = target(:,i) > 0; size(s); d = data(s,:); means(i,:) = mean(d); diff = d-repmat(means(i,:),size(d,1),1); ss(i) = sum(sum(diff.*diff))/(size(d,1)*dim); sigmas(:,:,i) = eye(dim)*ss(i); ss(i) = sqrt(ss(i)); priors(i) = size(d,1) / size(data,1); end %---------------------------------------------------------------------------- function F = flagmax(X) % F = flagmax(X) % returns f which is the matrix of all zeros except for one 1 in each % row in the place of the maximum element of that row in X F = zeros(size(X)); [Y,I] = max(X,[],2); for i=1:size(X,1), F(i,I(i)) = 1; end; %---------------------------------------------------------------------------- function [class] = gaussian_classify(DATA, MEANS, SIGS, PRIORS) % % Computes the density of each datapoint under each of the Gaussian classes. % % ---- Preconditions ---- % DATA: rows contain multidimensional datapoints (d columns) % MEANS: the ith row is the mean of the ith model % SIGS: array of covariance matrices sigs(:,:,i) is the cov of ith model % PRIOR: column vector with the prior of the ith model in the ith row. % % ---- Postconditions ---- % class: a column vector where the nth row contains the classes of % the datapoint in the nth row of data. % % ----- Description ----- % This function computes a density from a multigaussian model, where each % gaussian has a different mean, covariance, and prior probability. In the % case of one gaussian, the function will also work, naturally. dens = gaussian_density(DATA, MEANS, SIGS, PRIORS); class = zeros(size(dens)); [val ind] = max(dens'); for i = 1:size(class,1) class(i,ind(i)) = 1; end %---------------------------------------------------------------------------- function [dens] = gaussian_density(DATA, MEANS, SIGS, PRIORS) % % Computes the density of each datapoint under each of the Gaussian classes. % % ---- Preconditions ---- % DATA: rows contain multidimensional datapoints (d columns) % MEANS: the ith row is the mean of the ith model % SIGS: array of covariance matrices sigs(:,:,i) is the cov of ith model % PRIOR: column vector with the prior of the ith model in the ith row. % % ---- Postconditions ---- % dens: a column vector where the nth row contains the density associated % with the datapoint in the nth row of data. % % ----- Description ----- % This function computes a density from a multigaussian model, where each % gaussian has a different mean, covariance, and prior probability. In the % case of one gaussian, the function will also work, naturally. [N,D] = size(DATA); % N <- number of datapoints, D <- dimensionality M = size(MEANS, 1); dens = zeros(N,M); for m=1:M, const = 1 / (sqrt((2*pi)^D) * sqrt(det(SIGS(:,:,m)))); % Tricky.. we want the sqrt of the inverse. invsig = SIGS(:,:,m)^-0.5; % Subtract mean cdata = DATA - ones(N,1)*MEANS(m,:); % Multiply each row by inv. cov. ndata = cdata * invsig; % Take the componentwise product pdata = ndata .* ndata; % Take the row sum of the componentwise products dots = sum(pdata')'; dens(:,m) = const * PRIORS(m) * exp(-0.5 * dots); end; %---------------------------------------------------------------------------- function out = gperceptron(inpts, params) % out = gperceptron(inpts, params) % classifies all of the input vectors (row vectors) stacked in the % matrix inpts by the generalized perceptron (using addfeatures function % to compute the new features -- these will be added by *this* function % and should not be added to inpts). params are the weights to the matrix % (notice that these are one wider than inpts due to the extra bias term and % the extra terms for the extra features). % I have chosen to use the maximum activation energy as my method % for disambiguating the classes out = flagmax(linear(addfeatures(inpts),params)); %---------------------------------------------------------------------------- function out = linear(inpts, params), % out = linear(inpts, params) % This function computes the linear output for a linear unit (it is % not a classification, but rather the outputs which when run through % flagmax become the classification). The params are the weights to % the network out = [ones(size(inpts,1),1) inpts]*params; %---------------------------------------------------------------------------- function out = glinear(inpts, params), % out = glinear(inpts, params) % This function computes the generalized linear output for a generalized linear unit (it is % not a classification, but rather the outputs which when run through % flagmax become the classification). The params are the weights to % the network inpts=addfeatures(inpts); out = [ones(size(inpts,1),1) inpts]*params; %---------------------------------------------------------------------------- function out = mlp(inpts, w, b) % out = mlp(inpts,w,b) % This function returns the output layer's unit's outputs for a multi-layer % perceptron with softmax output function on the last layer. flagmax of % this output will be a hard classification. w is the cell array of % weights at each layer and b is the cell array of bias weights at each % layer. nlayers = size(w,2); z=mlpforward(inpts,w,b); out = z{nlayers}; %---------------------------------------------------------------------------- function delta = mlpbackward(target, w, b, z) % delta = mlpbackward(target,w,b,z) % given the targets, the weights (w), the bias weights (b) and the % forward pass outputs (z) for each unit, this function returns the % delta values for each unit nlayers = size(w,2); % this works for the softmax output layer: delta{nlayers} = target - z{nlayers}; % now propigate backwards: for i=nlayers-(1:nlayers-1), delta{i} = delta{i+1}*w{i+1}' .* (z{i}.*(1-z{i})); end; %---------------------------------------------------------------------------- function z=mlpforward(input, w, b) % z = mlpforward(input, w, b) % given the input, the weights (w) and the bias weights (b), this functino % returns the output for each unit in the network (including the last % layer) nex = size(input,1); nin = size(input,2); nlayers = size(w,2); % set up first layer: zin{1} = input*w{1} + (ones(nex,1)*b{1}); z{1} = sigmoid(zin{1}); for i=2:nlayers, zin{i} = z{i-1}*w{i} + (ones(nex,1)*b{i}); z{i} = sigmoid(zin{i}); end; % for multiple output untis, we will take the softmax approach z{nlayers} = exp(-1 .* zin{nlayers}); outsum = sum(z{nlayers},2) * ones(1, size(z{nlayers},2)); z{nlayers} = z{nlayers} ./ outsum; %---------------------------------------------------------------------------- function [w,b] = mlpupdate(input,w,b,z,delta,eta) % [w,b] = mlpupdate(input,w,b,z,delta,eta) % This function updates the weights and bias weights in the network (w, % and b) using the outputs of the units (z) from the forwar pass, the % delta values from the backward pass and the learning rate (eta) nlayers = size(w,2); % first do first layer: w{1} = w{1} - eta*(input' * delta{1}); b{1} = b{1} - eta*sum(delta{1}); % now the rest, for i=2:nlayers, w{i} = w{i} - eta*(z{i-1}' * delta{i}); b{i} = b{i} - eta*sum(delta{i}); end; %---------------------------------------------------------------------------- function out = perceptron(inpts, params) % out = perceptron(inpts, params) % returns the classification of each of the points in inpts using the % weights in params % I have chosen to use the maximum activation energy as my method % for disambiguating the classes out = linear(inpts,params); if (size(out,2)==1), % special case out = out>0; else, out = flagmax(out); end; %---------------------------------------------------------------------------- function plot_gaussian(mean, sigma) % % function plot_gaussian(mean, sigma) % % ---- Preconditions ---- % MEAN: row % SIGMA: the covariance matrix % % ---- Postconditions ---- % % ----- Description ----- % Plots the eigenvectors of the Covariance matrix. % [u, s, v] = svd(sigma); drawvec1 = [mean; mean+(u(:,1)*sqrt(s(1,1)))'; mean-1*(u(:,1)*sqrt(s(1,1)))']; drawvec2 = [mean; mean+(u(:,2)*sqrt(s(2,2)))'; mean-1*(u(:,2)*sqrt(s(2,2)))']; drawvec = [drawvec1 ; drawvec2]; plot(drawvec(:,1), drawvec(:,2), 'r'); %---------------------------------------------------------------------------- function plot_gaussians(means, invsigmas) % % function plot_gaussians(means, sigmas) % % ---- Preconditions ---- % MEANS: each row is a different mean % INVSIGMAS: 3D matrix, invsigmas(:,:,1) is the first inv. covariance % % ---- Postconditions ---- % % ----- Description ----- % Plots the eigenvectors of the Covariance matrix. % hold on for i = 1:size(means,1) plot_gaussian(means(i,:), invsigmas(:,:,i)); end hold off %---------------------------------------------------------------------------- function plotc(f,npts,varargin) % plotc(f,npts,param1,param2,param3,...) % This function takes in a string with the name of a function that will % classify a matrix of points (each example is one row), f. npts is % the resolution at which to plot and the remaining parameters (any number) % are passed on without modification to the function to be plotted % % This function assumes that a graph has already been plotted and proceedes % to plot the decision boundries inside the axes of the current graph (thus % drawing over the current graph). For this to work, f needs to be a % function which takes a matrix of points (one per row) as the first % argument followed by the parameters given to plotc. f should return % a matrix of classification (one target vector per row) of 0's and 1's. % grab current dimensions of the plot: xr = get(gca,'XLim'); yr = get(gca,'YLim'); maxx = xr(2); minx = xr(1); maxy = yr(2); miny = yr(1); xstep = (maxx-minx)/(npts-1); ystep = (maxy-miny)/(npts-1); xvec = zeros(npts,1); yvec = zeros(npts,1); % evaluated the function f with parameters varargin at each point: i = 0; for y=0:npts-1, yvec(y+1) = ystep*y+miny; end; for x=0:npts-1, xvec(x+1) = xstep*x+minx; for y=0:npts-1, i = i+1; X(i,:) = [xvec(x+1) yvec(y+1)]; end; end; Z = feval(f,X,varargin{:}); if (size(Z,2)==1), Z = Z>0.5; else, Z = flagmax(Z); end; Z = permute(Z,[1 3 2]); Z = reshape(Z,npts,npts,size(Z,3)); % plot all of the contours and color in the regions nbound = size(Z,3); classcolor(1,:) = [0.7 0.3 0.3]; % red (r) classcolor(2,:) = [0.3 0.3 0.7]; % blue (b) classcolor(3,:) = [0.7 0.7 0.7]; % black (k) classcolor(4,:) = [0.3 0.7 0.7]; % cyan (c) classcolor(5,:) = [0.3 0.7 0.3]; % green (g) classcolor(6,:) = [0.7 0.3 0.7]; % magenta (m) classcolor(7,:) = [0.7 0.7 0.3]; % yellow/brown (y) linecolor(1) = 'r'; linecolor(2) = 'b'; linecolor(3) = 'k'; linecolor(4) = 'c'; linecolor(5) = 'g'; linecolor(6) = 'm'; linecolor(7) = 'y'; for j=1:nbound, [c,h]=contourf(xvec,yvec,Z(:,:,j),[0.5 0.5],linecolor(j)); hold on; for i=1:length(h), if (length(get(h(i),'FaceColor')) ~=3 | ... get(h(i),'FaceColor') ~= [1 1 1]), set(h(i),'FaceColor',classcolor(j,:)); end; end; end; % reset axes just to make sure they didn't get scaled set(gca,'XLim',xr); set(gca,'YLim',yr); hold off; %---------------------------------------------------------------------------- function plotlines(w), % function plotlines(w) % for 2D classification, this function plots the lines implied by the % weights in w xr = get(gca,'XLim'); yr = get(gca,'YLim'); px = []; py = []; for x=0:1:6, y = (-w(1,:)-w(2,:)*x)./w(3,:); px = [px x]; py = [py y']; end; plot(px',py(1,:)','r-'); hold on; if (size(py,1)>1), plot(px',py(2,:)','b-'); end; if (size(py,1)>2), plot(px',py(3,:)','k-'); end; set(gca,'XLim',xr); set(gca,'YLim',yr); hold off; % script to run all of ps2 solutions in order: res = 50; %---------------------------------------------------------------------------- function y = sigmoid(x) y = 1./(1+exp(-x)); %---------------------------------------------------------------------------- function w = traingperceptron(input,target,eta,etarate) % w = traingperceptron(input,target, eta, etarate) % returns the weights resulting from training a generalized perceptron % (the added features are produced by addfeatures). input and target % are the training points and eta and etarate define the learning rates % (starting and reduction factor respectively) % The training is carried out with on-line learning input = addfeatures(input); id = size(input,2); td = size(target,2); dd = size(input,1); % start the weights off randomly w = rand(id+1,td)*2 - 1; % on-line learning: lastdelta = w; % last weight change delta = zeros(size(w)); donecount = 0; % to count the number of times the angle of the % movement relative to the last movement is too small while(donecount<10 | eta>0.01), % itterate a few times just so that the cos(theta) gives % a less noisy value totaldelta = 0; for j=1:100, i = ceil(rand(1)*dd); if (i<0.5), i = dd; end; x = [1 input(i,:)]; t = target(i,:); y = (x*w) > 0.5; delta = x'*(t-y); totaldelta = totaldelta + delta; w = w + delta*eta; end; if (id==2), clf; plot_data(input,target); hold on; plotlines(w); hold off; pause(0.01); end; thisdot = sum(sum(totaldelta.*totaldelta)) costheta = sum(sum(lastdelta.*totaldelta))/... sum(sum(lastdelta.*lastdelta))/thisdot if (costheta < 0 | thisdot*eta < 0.01) donecount = donecount+1; else, donecount = 0; end; eta = eta*etarate lastdelta = totaldelta; end; %---------------------------------------------------------------------------- function W = trainlinear(data,target) % W = trainlinear(data,target) % returns the optimal linear discriminant classifier by solving the % least squares problem with the pseudo-inverse d = [ones(size(data,1),1) data]; W = pinv(d)*target; %---------------------------------------------------------------------------- function W = trainglinear(data,target) % W = trainglinear(data,target) % returns the optimal generalized linear discriminant classifier by solving the % least squares problem with the pseudo-inverse d = [ones(size(data,1),1) addfeatures(data)]; W = pinv(d)*target; %---------------------------------------------------------------------------- function [w,b] = trainmlp(input, target, layers, scale), % [w,b] = trainmlp(input,target,layers,scale) % creates and trains a multi-layered perceptron network on the training % data input and target. layers is a row vector indicating the number % of units in each layer other than the input (thus [2 4 5 2] indiciates % a network with 2 hidden units in the first layer, 4 in the next layer, % 5 in the last hidden layer, and 2 in the output layer). The mlp is % trained with softmax outputs (and thus must have one output for each % class). scale is the approximate range in which to set the weights. % % the w and b resulting parameters are the weights and bias weights for % the whole network (stored in a cell array of matrices). % initialize the network: nex = size(input,1); nin = size(input,2); nlayers = size(layers,2); layers = [nin , layers]; for i=1:nlayers, w{i} = scale*randn(layers(i),layers(i+1)); b{i} = scale*randn(1,layers(i+1)); end; % now train: eta = 0.01; for j=1:10, for i=1:100, z = mlpforward(input,w,b); % get all of the outputs delta = mlpbackward(target,w,b,z); [w,b] = mlpupdate(input,w,b,z,delta,eta); end; eta = eta*0.8; error = sum((z{nlayers}-target).^2) end; %---------------------------------------------------------------------------- function [w,b] = trainoptmlp(input,target,n) % [w,b] = trainoptmlp(input, target, n) % performs n-way cross-validation over different mlp topologies % using input and target as the training data. It tries a number of % networks with 1 to 3 hidden layers with 1 to 2*out hidden units each % (where out is the number of output units). nout = size(target,2); % try a number of hidden units per layer each for a number of hidden layers nhidden = 1:floor(nout/2):nout*2; nlayers = 1:1:3; % shuffle data npts = size(input,1); ppn = npts/n; rp = randperm(npts); d = input(rp,:); t = target(rp,:); besterror = -1; for h = nhidden, for l = nlayers, currerror = 0; for k=1:n, in = d([1:(k-1)*ppn k*ppn+1:npts],:); tar = t([1:(k-1)*ppn k*ppn+1:npts],:); [currw,currb] = trainmlp(in,tar,... [ones(1,l)*h nout],0.2); o = mlp(d((k-1)*ppn+1:k*ppn,:),currw,currb); currerror = currerror + ... sum(sum((o-t((k-1)*ppn+1:k*ppn,:)).^2)); end; if (besterror < 0 | besterror > currerror), besterror = currerror; bestnh = h; bestnl = l; end; end; end; % now that we know the "best network topology," we train on all the examples [w,b] = trainmlp(input,target,[ones(1,bestnl)*bestnh nout],0.2); %---------------------------------------------------------------------------- function w = trainperceptron(input,target,eta,etarate) % w = traingperceptron(input, target, eta, etarate) % returns the weights resulting from training a perceptron % input and target % are the training points and eta and etarate define the learning rates % (starting and reduction factor respectively) % The training is carried out with on-line learning id = size(input,2); td = size(target,2); dd = size(input,1); % start the weights off randomly w = rand(id+1,td)*2 - 1; % on-line learning: lastdelta = w; % last weight change delta = zeros(size(w)); donecount = 0; % to count the number of times the angle of the % movement relative to the last movement is too small while(donecount<10 | eta>0.01), % itterate a few times just so that the cos(theta) gives % a less noisy value totaldelta = 0; for j=1:100, i = ceil(rand(1)*dd); if (i<0.5), i = dd; end; x = [1 input(i,:)]; t = target(i,:); y = (x*w) > 0.5; delta = x'*(t-y); totaldelta = totaldelta + delta; w = w + delta*eta; end; if (id==2), clf; plot_data(input,target); hold on; plotlines(w); hold off; pause(0.01); end; thisdot = sum(sum(totaldelta.*totaldelta)) costheta = sum(sum(lastdelta.*totaldelta))/... sum(sum(lastdelta.*lastdelta))/thisdot if (costheta < 0 | thisdot*eta < 0.01) donecount = donecount+1; else, donecount = 0; end; eta = eta*etarate lastdelta = totaldelta; end; % since scaling the weights on a perceptron makes no difference and I % will be using the activation energy to disambiguate, I will normalize % all of the weight vectors. This will insure that comparing the % activation energies will make sense wsum = abs(sum(w,1)); w = w./repmat(wsum,id+1,1); %------------------------------------------------------------------------ %------------------------------------------------------------------------ % script to run all of ps3 solutions in order: res = 50; % first we'll do all of the aritificial dataset problems: load artificial; clf; % Problem #4: w = trainlinear(input,target); pred_p4_d1 = flagmax(linear(test,w)); figure; plot_data(input,target); hold on; plotc('linear',res,w); hold on; plot_data(input,target); title('Problem 4'); hold off; pause(0.01); % Problem #5: figure; s = target(:,3) == 0; easyi = input(s,:); easyt = target(s,1); w = trainperceptron(easyi,easyt,1,0.95); hold off; clf; plot_data(easyi,target(s,1:2)); hold on; plotc('perceptron',res,w); hold on; plot_data(easyi,target(s,1:2)); title('Problem 5 - 1 v 2'); hold off; pause(0.01); figure; s = target(:,2) == 0; easyi = input(s,:); easyt = target(s,1); w = trainperceptron(easyi,easyt,1,0.95); hold off; clf; plot_data(easyi,[target(s,1),target(s,3)]); hold on; plotc('perceptron',res,w); hold on; plot_data(easyi,[target(s,1),target(s,3)]); title('Problem 5 - 1 v 3'); hold off; pause(0.01); figure; s = target(:,1) == 0; easyi = input(s,:); easyt = target(s,2); w = trainperceptron(easyi,easyt,1,0.95); hold off; clf; plot_data(easyi,[target(s,2),target(s,3)]); hold on; plotc('perceptron',res,w); hold on; plot_data(easyi,[target(s,3),target(s,3)]); title('Problem 5 - 2 v 3'); hold off; pause(0.01); figure; w = trainperceptron(input,target,1,0.95); hold off; pred_p5_d1 = flagmax(perceptron(test,w)); clf; plot_data(input,target); hold on; plotc('perceptron',res,w); hold on; plot_data(input,target); title('Problem 5'); hold off; pause(0.01); % Problem #6: w = trainglinear(input,target,1,0.95); pred_p6_d1a = flagmax(glinear(test,w)); figure; plot_data(input,target); hold on; plotc('glinear',res,w); hold on; plot_data(input,target); title('Problem 6 - glinear'); hold off; pause(0.01); w = traingperceptron(input,target,1,0.95); pred_p6_d1b = flagmax(gperceptron(test,w)); figure; plot_data(input,target); hold on; plotc('gperceptron',res,w); hold on; plot_data(input,target); title('Problem 6 - gperceptron'); hold off; pause(0.01); % Problem #7: [w,b] = trainmlp(input,target,[4 3],0.2); pred_p7_d1 = flagmax(mlp(test,w,b)); figure; plot_data(input,target); hold on; plotc('mlp',res,w,b); hold on; plot_data(input,target); title('Problem 7'); hold off; pause(0.01); % Problem #8: [w,b] = trainoptmlp(input,target); pred_p8_d1 = flagmax(mlp(test,w,b)); figure; plot_data(input,target); hold on; plotc('mlp',res,w,b); hold on; plot_data(input,target); title('Problem 8'); xl = sprintf('%d hidden layers of %d units each',size(w,2)-1,size(w{1},2)); xlabel(xl); hold off; pause(0.01); % now the pen dataset load pen w = trainlinear(input,target); pred_p4_d2 = flagmax(linear(test,w)); w = trainperceptron(input,target,1,0.95); pred_p5_d2 = flagmax(perceptron(test,w)); [w,b] = trainmlp(input,target,[4 3],0.2); pred_p7_d2 = flagmax(mlp(test,w,b)); [w,b] = trainoptmlp(input,target); pred_p8_d2 = flagmax(mlp(test,w,b)); % now the mystery dataset load mystery w = trainlinear(input,target); pred_p4_d3 = flagmax(linear(test,w)); w = trainperceptron(input,target,1,0.95); pred_p5_d3 = flagmax(perceptron(test,w)); [w,b] = trainmlp(input,target,[4 3],0.2); pred_p7_d3 = flagmax(mlp(test,w,b)); [w,b] = trainoptmlp(input,target); pred_p8_d3 = flagmax(mlp(test,w,b));