# Prepare training and test samples
# NOTE(review): plotmatrix(), ggpcp(), alpha(), geom_line(), aes() and qplot()
# are used below but no package providing them is attached in this section --
# presumably ggplot2 is loaded earlier; confirm.
olive <- read.csv(file.choose())
head(olive)
dim(olive)

# Keep the southern oils only, with the columns from area through eicosenoic.
olive.south <- subset(olive, region == "South", select = area:eicosenoic)
head(olive.south)
dim(olive.south)
olive.south$area <- factor(
  olive.south$area,
  levels = c("Calabria", "North Apulia", "Sicily", "South Apulia")
)

plotmatrix(olive.south[, 2:9], colour = alpha("black", 0.2))
p <- ggpcp(olive.south, vars = names(olive.south)[c(2, 3, 6, 4, 5, 7, 8, 9)])
p + geom_line(aes(colour = area))
library(rggobi)
ggobi(olive.south)

table(olive.south$area)
table(olive.south$area) * 0.67

# Stratified split: draw about 67% of the rows of each area for training.
# Row indices are grouped by the area factor itself, so the split does not
# depend on the rows being sorted by area or on fixed group sizes; the draw
# sizes are 38/17/24/138 when the area counts are 56/25/36/206.
# sample.int() is used on positions so a one-row area is handled correctly.
# Rows whose area is NA are never drawn and therefore land in the test set.
training.indx <- unlist(
  lapply(
    split(seq_len(nrow(olive.south)), olive.south$area),
    function(rows) rows[sample.int(length(rows), round(0.67 * length(rows)))]
  ),
  use.names = FALSE
)
training.indx <- sort(training.indx)
olive.south.train <- olive.south[training.indx, ]
olive.south.test <- olive.south[-training.indx, ]

# Trees
library(rpart)
?rpart
olive.tree <- rpart(area ~ ., data = olive.south.train)
table(
  olive.south.train$area,
  predict(olive.tree, olive.south.train, type = "class")
)
table(
  olive.south.test$area,
  predict(olive.tree, olive.south.test, type = "class")
)

# Random forests
library(randomForest)
?randomForest
olive.rf <- randomForest(
  area ~ .,
  data = olive.south.train,
  importance = TRUE,
  proximity = TRUE,
  mtry = 2,
  ntree = 1500
)
olive.rf$err.rate[1500, ]
olive.rf$importance
table(olive.south.train$area, olive.rf$predicted)
table(olive.south.test$area, predict(olive.rf, olive.south.test))
qplot(linoleic, palmitoleic, data = olive.south.train, colour = area)

# Cross-tabulate true against predicted classes.
#   true: matrix with one column per class (e.g. from class.ind()).
#   pred: matrix of per-class scores with the same column order.
# Each row is assigned the column holding its maximum; max.col() breaks ties
# at random by default. The local names `true` and `cres` become the
# dimension labels of the returned table, so they are kept as they are.
test.cl <- function(true, pred) {
  true <- max.col(true)
  cres <- max.col(pred)
  table(true, cres)
}

# Neural networks
library(nnet)
?nnet
olive.nn <- nnet(
  area ~ .,
  olive.south.train,
  size = 4,
  linout = TRUE,
  decay = 0.005,
  range = 0.06,
  maxit = 1000
)
# The class indicators must come from `area`, the response of the model;
# olive.south has no `type` column.
targetr <- class.ind(olive.south.train$area)
targets <- class.ind(olive.south.test$area)
test.cl(targetr, predict(olive.nn, olive.south.train))
test.cl(targets, predict(olive.nn, olive.south.test))
olive.nn.keep <- olive.nn

# Your turn
music <- read.csv(file.choose(), row.names = 1)
head(music)
dim(music)
music.train <- music[1:54, -1]
music.test <- music[55:59, -1]
# read.csv() keeps strings as character on R >= 4.0, while the classifiers
# fitted to music.train need a factor response; this is a no-op apart from
# dropping unused levels when type is already a factor.
music.train$type <- factor(music.train$type)
# Random forest on the music training data
music.rf <- randomForest(
  type ~ .,
  data = music.train,
  importance = TRUE,
  proximity = TRUE,
  mtry = 2,
  ntree = 500
)
music.rf$err.rate[500, ]
music.rf$importance
table(music.train$type, music.rf$predicted)
predict(music.rf, music.test)

# Neural network on the music training data
music.nn <- nnet(type ~ ., music.train, size = 4, decay = 0.01, maxit = 1000)
# Tabulate against predicted class labels: the raw prediction is a matrix
# with one column per class when there are more than two classes, which
# table() cannot cross-tabulate against the vector of true labels.
table(music.train$type, predict(music.nn, music.train, type = "class"))
round(predict(music.nn, music.test))

# Model-based clustering
library(mclust)
help(package = "mclust")
data(diabetes)
head(diabetes)
dim(diabetes)
plotmatrix(diabetes[, -1])
ggpcp(diabetes, vars = names(diabetes[, -1])) + geom_line()
diabetes.mc <- mclustBIC(diabetes[, -1])
plot(diabetes.mc)
diabetes.mc.best <- mclustModel(
  diabetes[, -1],
  diabetes.mc,
  G = 3,
  modelNames = "VVV"
)
# NOTE(review): relies on summary() of the mclustModel() result exposing a
# $classification component -- confirm against the installed mclust version.
diabetes$cl <- summary(diabetes.mc.best)$classification
# plotmatrix(diabetes[,-1], aes(colour=factor(cl)))
# NOTE(review): brewer.pal() is not attached in this section -- presumably
# RColorBrewer is available from an earlier library() call; confirm.
clrs <- brewer.pal(3, "Set2")
pairs(diabetes[, 2:4], col = clrs[diabetes$cl], pch = 16)
diabetes.mc.best$parameters

# Self-organizing maps
library(kohonen)
help(package = "kohonen")
diabetes.som <- som(
  scale(diabetes[, 2:4]),
  grid = somgrid(5, 5, "hexagonal")
)
diabetes.som$codes
plot(diabetes.som)
sqrt(sum(diabetes.som$distances^2))

# Show map in data space
# Build one matrix holding the observations and the map units side by side.
#   x.som: a fitted map providing $data, $codes, $unit.classif and $grid$pts.
# Each observation gets the grid coordinates of the unit it is assigned to
# (x.som$unit.classif), jittered, as two extra columns named "Map 1" and
# "Map 2". The units get their codes plus their exact grid coordinates under
# the same column names. Observation rows come first, unit rows after them.
# The result is returned invisibly, as the original trailing assignment did.
# NOTE(review): assumes x.som$data and x.som$codes are plain matrices --
# confirm against the installed kohonen version.
f.ggobi.som <- function(x.som) {
  # drop = FALSE keeps the lookup a matrix when there is a single observation
  grd <- x.som$grid$pts[x.som$unit.classif, , drop = FALSE]
  xmx <- jitter(grd[, 1], factor = 2)
  xmy <- jitter(grd[, 2], factor = 2)
  ncols <- ncol(x.som$data)
  x.ggobi <- cbind(x.som$data, xmx, xmy)
  colnames(x.ggobi)[ncols + 1:2] <- c("Map 1", "Map 2")
  x.grid <- cbind(x.som$codes, x.som$grid$pts)
  colnames(x.grid) <- colnames(x.ggobi)
  invisible(rbind(x.ggobi, x.grid))
}
f.ggobi.som.net<-function(x.som) { x.net<-NULL for (i in 1:x.som$grid$xdim) { for (j in 1:x.som$grid$ydim) { if (j