library(ggplot2) # Generating samples from different distributions rnorm(10) x <- rnorm(1000) qplot(x, geom="histogram", fill=I("black")) x <- rnorm(1000, mean=10, sd=5) qplot(x, geom="histogram", fill=I("black")) rpois(10, 10) x <- rt(500, 2) qplot(x, geom="histogram", fill=I("black")) x <- rcauchy(100, 2) qplot(x, geom="histogram", fill=I("black")) x <- rgamma(1000, 0.5) qplot(x, geom="histogram", fill=I("black")) qplot(x, geom="density", fill=I("black")) x <- rgamma(1000, 0.1, fill=I("black")) qplot(x, geom="density") x <- rgamma(1000, 1) qplot(x, geom="density", fill=I("black")) # Your turn qplot(rnorm(100), geom="histogram", fill=I("black"), binwidth=0.5) qplot(rnorm(100), geom="density", fill=I("black")) qplot(rnorm(50, 10, 5), geom="density", fill=I("black")) qplot(rpois(1000, 50), geom="histogram", fill=I("black"), binwidth=5) qplot(rbeta(100, 0.1, 0.1), geom="density", fill=I("black")) qplot(runif(30), geom="histogram", fill=I("black"), binwidth=0.1) # Repetition replicate(10, mean(rnorm(100))) qplot(replicate(100, mean(rnorm(10))),geom="histogram", fill=I("black"),binwidth=0.2, xlim=c(-1,1)) qplot(replicate(100, mean(rnorm(100))),geom="histogram", fill=I("black"), binwidth=0.05, xlim=c(-1,1)) # Your turn qplot(replicate(1000, mean(runif(2, 0, 10))),geom="histogram", fill=I("black"), binwidth=0.5, xlim=c(0,10)) qplot(replicate(1000, mean(runif(10, 0, 10))),geom="histogram", fill=I("black"), binwidth=0.5, xlim=c(0,10)) qplot(replicate(1000, mean(runif(100, 0, 10))),geom="histogram", fill=I("black"), binwidth=0.2, xlim=c(0,10)) # Central Limit Theorem # Functions clt <- function(n, m) { x <- replicate(n, mean(runif(m))) qplot(x, geom="histogram", fill=I("black")) } clt(1000, 2) clt(1000, 10) clt(1000, 100) # Adding flexibility clt <- function(n, m, r) { x <- replicate(n, mean(r(m))) qplot(x, geom="histogram", fill=I("black")) } clt(1000, 2, runif) clt(1000, 2, rexp) clt(1000, 20, rexp) clt(1000, 1, rpois) qplot(replicate(1000, mean(rpois(2, 1))), geom="histogram", fill=I("black")) clt(1000, 2, function(m) rpois(m, lambda=1)) clt <- function(n, m, r, ...) { f <- function() mean(r(m, ...)) x <- replicate(n, f()) qplot(x, geom="histogram", fill=I("black"), ...) } clt(n=1000, m=2, r=rnorm, mean = 10, sd = 10) clt(n=1000, m=2, r=rpois, lambda=1) clt(n=1000, m=2, r=rpois, lambda=10) clt(n=1000, m=2, r=rpois, lambda=0.5) # Exploring t-tests ?t.test t.test(rnorm(20), rnorm(20)) str(t.test(rnorm(20), rnorm(20))) t.test(rnorm(20), rnorm(20))$p.value tt.check <- function(n) { t.test(rnorm(n), rnorm(n))$p.value } qplot(replicate(5000, tt.check(10)), geom="histogram", fill=I("black"), binwidth=0.1) qplot(replicate(5000, tt.check(100)), geom="histogram", fill=I("black"), binwidth=0.1) qplot(replicate(5000, tt.check(1000)), geom="histogram", fill=I("black"), binwidth=0.1) tt.check <- function(n, mn=1) { t.test(rnorm(n), rnorm(n, mn))$p.value } qplot(replicate(5000, tt.check(10)), geom="histogram", fill=I("black"), binwidth=0.1, xlim=c(0,1)) qplot(replicate(5000, tt.check(20)), geom="histogram", fill=I("black"), binwidth=0.1, xlim=c(0,1)) qplot(replicate(5000, tt.check(30)), geom="histogram", fill=I("black"), binwidth=0.1, xlim=c(0,1)) qplot(replicate(5000, tt.check(10, 0.5)), geom="histogram", fill=I("black"), binwidth=0.1, xlim=c(0,1)) qplot(replicate(5000, tt.check(20, 0.5)), geom="histogram", fill=I("black"), binwidth=0.1, xlim=c(0,1)) qplot(replicate(5000, tt.check(30, 0.5)), geom="histogram", fill=I("black"), binwidth=0.1, xlim=c(0,1)) # Check proportion of times: alpha=0.05 # Both means are equal, but reject the null hypothesis x<-replicate(5000, tt.check(10, 0)) length(x[x<0.05])/5000 x<-replicate(5000, tt.check(20, 0)) length(x[x<0.05])/5000 x<-replicate(5000, tt.check(30, 0)) length(x[x<0.05])/5000 # Both means are different, but fail to reject the null hypothesis x<-replicate(5000, tt.check(10)) length(x[x>0.05])/5000 x<-replicate(5000, tt.check(20)) length(x[x>0.05])/5000 x<-replicate(5000, tt.check(30)) length(x[x>0.05])/5000 x<-replicate(5000, tt.check(10, 0.5)) length(x[x>0.05])/5000 x<-replicate(5000, tt.check(20, 0.5)) length(x[x>0.05])/5000 x<-replicate(5000, tt.check(30, 0.5)) length(x[x>0.05])/5000 # What happens with samples from different different distributions tt.check.dist <- function(n, r1, r2) { t.test(r1(n), r2(n))$p.value } # Uniform x<-replicate(5000,tt.check.dist(10, r1=runif, r2=runif)) length(x[x<0.05])/5000 x<-replicate(5000,tt.check.dist(20, r1=runif, r2=runif)) length(x[x<0.05])/5000 x<-replicate(5000,tt.check.dist(30, r1=runif, r2=runif)) length(x[x<0.05])/5000 qplot(x, geom="histogram", fill=I("black"), binwidth=0.1, xlim=c(0,1)) # Exponential x<-replicate(5000,tt.check.dist(10, r1=rexp, r2=rexp)) length(x[x<0.05])/5000 x<-replicate(5000,tt.check.dist(20, r1=rexp, r2=rexp)) length(x[x<0.05])/5000 x<-replicate(5000,tt.check.dist(30, r1=rexp, r2=rexp)) length(x[x<0.05])/5000 qplot(x, geom="histogram", fill=I("black"), binwidth=0.1, xlim=c(0,1)) # t, df=2, df=30 x<-replicate(5000, tt.check.dist(10, r1=function(n) rt(n,2), r2=function(n) rt(n,2))) length(x[x<0.05])/5000 x<-replicate(5000, tt.check.dist(20, r1=function(n) rt(n,2), r2=function(n) rt(n,2))) length(x[x<0.05])/5000 x<-replicate(5000, tt.check.dist(30, r1=function(n) rt(n,2), r2=function(n) rt(n,2))) length(x[x<0.05])/5000 qplot(x, geom="histogram", fill=I("black"), binwidth=0.1, xlim=c(0,1)) x<-replicate(5000, tt.check.dist(10, r1=function(n) rt(n,30), r2=function(n) rt(n,30))) length(x[x<0.05])/5000 x<-replicate(5000, tt.check.dist(20, r1=function(n) rt(n,30), r2=function(n) rt(n,30))) length(x[x<0.05])/5000 x<-replicate(5000, tt.check.dist(30, r1=function(n) rt(n,30), r2=function(n) rt(n,30))) length(x[x<0.05])/5000 qplot(x, geom="histogram", fill=I("black"), binwidth=0.1, xlim=c(0,1)) # Bootstrap ?galaxies summary(galaxies) qplot(galaxies, geom="density", fill=I("black")) median(sample(galaxies, replace=T)) x<-replicate(5000, median(sample(galaxies, replace=T))) mean(x) sd(x) qplot(x, geom="density", fill=I("black"), xlim=range(galaxies)) qplot(x, geom="histogram", fill=I("black")) # Your turn qplot(diamonds$price, geom="density", fill=I("black")) x<-replicate(1000, var(sample(diamonds$price, replace=T))) mean(x) sd(x) qplot(x, geom="density", fill=I("black")) x<-replicate(1000, var(sample(diamonds$price, 10, replace=T))) mean(x) sd(x) qplot(x, geom="density", fill=I("black")) # Permutation x<-cbind(rnorm(10), rnorm(10)) t.test(x[,1],x[,2])$statistic y<-as.vector(x) lb<-rep(c(1,2), rep(nrow(x),2)) y<-cbind(y,sample(lb)) colnames(y)<-c("x","label") t.test(y[lb==1],y[lb==2])$p.value qplot(x,data=data.frame(y), geom="histogram", facets=label~.) tt.check.permute <- function(x) { y<-as.vector(x) lb<-rep(c(1,2), rep(nrow(x),2)) y<-cbind(y,sample(lb)) t.test(y[lb==1],y[lb==2])$statistic } x<-cbind(rnorm(10), rnorm(10)) y<-as.vector(x) lb<-rep(c(1,2), rep(nrow(x),2)) y<-cbind(y,lb) colnames(y)<-c("x","label") qplot(x,data=data.frame(y), geom="histogram", fill=I("black"), facets=label~.) z<-replicate(1000, tt.check.permute(x)) qplot(z, geom="histogram", fill=I("black")) t<-abs(t.test(x[,1],x[,2])$statistic) length(z[z<(-t) | z>t])/1000 x<-cbind(rnorm(100), rnorm(100)) y<-as.vector(x) lb<-rep(c(1,2), rep(nrow(x),2)) y<-cbind(y,lb) colnames(y)<-c("x","label") qplot(x,data=data.frame(y), geom="histogram", fill=I("black"), facets=label~.) z<-replicate(1000, tt.check.permute(x)) qplot(z, geom="histogram", fill=I("black")) t<-abs(t.test(x[,1],x[,2])$statistic) length(z[z<(-t) | z>t])/1000 x<-cbind(rnorm(100), rnorm(100, 0.5)) y<-as.vector(x) lb<-rep(c(1,2), rep(nrow(x),2)) y<-cbind(y,lb) colnames(y)<-c("x","label") qplot(x,data=data.frame(y), geom="histogram", fill=I("black"), facets=label~.) z<-replicate(1000, tt.check.permute(x)) qplot(z, geom="histogram", fill=I("black")) t<-abs(t.test(x[,1],x[,2])$statistic) length(z[z<(-t) | z>t])/1000 # Graphically qplot(carat, price, data=diamonds, geom=c("point","smooth"), method="lm") for (i in 1:5) { p <- qplot(carat, sample(price), data=diamonds, geom=c("point","smooth"), method="lm") print(p) readline("Ready to continue?") }