# Classification-tree walkthrough on the UCI letter-recognition data.
# Go to
# http://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.names
# for more information on the data.
#
# stringsAsFactors = TRUE keeps the class label V1 a factor on R >= 4.0
# (where read.csv() otherwise returns character), so that levels(lrec$V1)
# is available for building the loss matrix further down.
lrec <- read.csv(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data",
  header = FALSE,
  stringsAsFactors = TRUE
)
summary(lrec)

library(rpart)
library(help = rpart)
help(rpart)

# First tree: predict the letter (V1) from all other columns, default settings.
lrec.t1 <- rpart(V1 ~ ., data = lrec, method = "class")
plot(lrec.t1)
text(lrec.t1)

library(ggplot2)

# Distribution of the letters along V12, as counts and as proportions.
qplot(V12, binwidth = 2.5, geom = "histogram", data = lrec, fill = V1)
qplot(V12, binwidth = 2.5, geom = "histogram", position = "fill",
      data = lrec, fill = V1)

# Same idea for V10, restricted to the observations with V12 <= 2.5.
qplot(V10, binwidth = 0.5, geom = "histogram",
      data = subset(lrec, V12 <= 2.5), fill = V1)
qplot(V10, binwidth = 1, geom = "histogram", position = "fill",
      data = subset(lrec, V12 <= 2.5), fill = V1)

Y <- lrec$V1
X <- lrec$V2
qplot(V2, data = lrec)

# Gini impurity of a vector of class labels: sum over classes of p * (1 - p),
# where p is the observed class proportion in Y.
gini <- function(Y) {
  ns <- table(Y)
  probs <- ns / sum(ns)
  sum(probs * (1 - probs))
}

gini(Y)
gini(Y[X <= 1])
gini(Y[X <= 2])
gini(Y[X <= 3])

X <- lrec$V12
gini(Y[X <= 1])
gini(Y[X > 1])
gini(Y[X <= 2])
gini(Y[X <= 3])
gini(Y[X <= 4])

# Entropy-based impurity of a vector of class labels: -2 * sum(p * log(p))
# with natural logarithms, i.e. twice the Shannon entropy in nats.
entropy <- function(Y) {
  ns <- table(Y)
  probs <- ns / sum(ns)
  lprobs <- log(probs)
  # log(0) is -Inf and 0 * -Inf is NaN; empty classes must contribute 0.
  lprobs[probs == 0] <- 0
  -2 * sum(probs * lprobs)
}

# Impurity measure used by the split search below. The second assignment
# wins; swap the two lines to search with entropy instead of Gini.
method <- entropy
method <- gini

library(plyr)

# Exhaustive search over candidate splits: for every predictor column and
# every cut point halfway between consecutive integers in its range, record
# the size (li, ri) and impurity (left, right) of the two sides.
res <- ldply(2:ncol(lrec), function(x) {
  X <- lrec[, x]
  res <- ldply((min(X) + .5):(max(X) - .5), function(i) {
    c(x = i,
      li = sum(X <= i), left = method(Y[X <= i]),
      ri = sum(X > i), right = method(Y[X > i]))
  })
  res$var <- names(lrec)[x]
  res
})

# Only keep splits that leave at least 20 observations on each side.
res <- subset(res, (li >= 20) & (ri >= 20))

# Best split by the plain (unweighted) sum of the two impurities.
# NOTE(review): this does not weight each side by li/ri - confirm that the
# unweighted sum is the intended criterion.
res[which.min(res$left + res$right), ]

qplot(V11, fill = V1, data = lrec)
qplot(x, left + right, data = subset(res, var == "V11"))
qplot(var, left + right, data = res)

##############

# not particularly helpful - list of each split
summary(lrec.t1)

# inspect output object
str(lrec.t1)

# Fitted classes of the first tree, and how they line up with the truth.
lrec$tree <- predict(lrec.t1, type = "class")
qplot(tree, data = lrec)
# NOTE(review): ggfluctuation() may not exist in newer ggplot2 releases -
# confirm against the installed version.
ggfluctuation(xtabs(~V1 + tree, data = lrec))

# Second tree: lower complexity parameter. Only columns 1:17 are used so
# that the prediction column added above is not picked up as a predictor.
lrec.t2 <- rpart(V1 ~ ., data = lrec[, 1:17], method = "class",
                 control = list(cp = 0.005))
plot(lrec.t2)
text(lrec.t2)

lrec$t2 <- predict(lrec.t2, type = "class")
qplot(t2, data = lrec)
ggfluctuation(xtabs(~V1 + t2, data = lrec))

# Third tree: custom misclassification losses, one row/column per class in
# the order of levels(lrec$V1).
classes <- levels(lrec$V1)
loss <- matrix(1, ncol = length(classes), nrow = length(classes))
diag(loss) <- 0 # same as default

# assume it's 5 times as bad to mistake Gs and Os for each other
# (indices are looked up by letter: O is the 15th letter, not the 13th)
g <- match("G", classes)
o <- match("O", classes)
loss[g, o] <- 5
loss[o, g] <- 5

lrec.t3 <- rpart(V1 ~ ., data = lrec[, 1:17], method = "class",
                 parms = list(loss = loss))
plot(lrec.t3)
text(lrec.t3)

lrec$t3 <- predict(lrec.t3, type = "class")
ggfluctuation(xtabs(~V1 + t3, data = lrec))