####1####
#
#Smith et al. (2004, Plant Physiology, 136, 2687-2699) were
#interested in studying gene expression changes during a
#24-hour period divided into a 12-hour dark period and
#a 12-hour light period. In each of two independent replications,
#plants were sampled at 0, 1, 2, 4, 8, 12, 13, 14, 16, 20, and
#24 hours, where 0 hours represents the transition from light to
#dark, 12 hours represents the transition from dark to light,
#and 24 hours is again the transition from light to dark. One
#Affymetrix GeneChip was used to measure expression in a sample
#of leaves collected for each time point and replication.
#The Affymetrix signal measurements from a total of 22
#microarray slides are available on the web in a .csv file.
#Read the data using the following command.
#
########
d=read.csv("http://www.public.iastate.edu/~dnett/microarray/diurnaldata.csv")
####2####
#
#Check the dimensions of d.
#
#########
dim(d)
####3####
#
#Examine the first five rows and first five columns of the data.
#
########
d[1:5,1:5]
####4####
#
#Make side-by-side boxplots of the signal data.
#
#########
boxplot(d[,-1])
####5####
#
#Make side-by-side boxplots of the log signal data.
#
########
boxplot(log(d[,-1]))
####6####
#
#You should have received some warnings because
#some of the signal values are 0 so taking
#the log is not permitted. Add 1 to each signal value,
#log the results, and make side-byside
#boxplots of this data.
#
########
boxplot(log(d[,-1]+1))
####7####
#
#Examine some scatterplots of log(signal+1) for one chip vs. log(signal+1)
#for another.
#
#########
plot(log(d[,2]+1),log(d[,3]+1))
lines(c(-99,99),c(-99,99),lwd=2,col=2)
#Change the column numbers (2 and 3) to see other chips
#For example to see time 0 vs. time 24 in rep 1...
plot(log(d[,2]+1),log(d[,11]+1))
lines(c(-99,99),c(-99,99),lwd=2,col=2)
#There appear to be some intensity-dependent chip effects in
#some cases, but let's proceed assuming that the log(signal+1)
#data is suitable for analysis without further normalization.
logd=data.frame(d[,1],log(d[,-1]+1))
####8####
#
#Load some R functions for microarray data analysis from a file,
#and check to see what new objects have been added.
#
#########
source("http://www.public.iastate.edu/~dnett/microarray/rfunctions.txt")
####9####
#
#Conduct one test for each gene to identify genes that exhibit
#some significant change in expression across the 11 time points
#surveyed. Collect one p-value for each gene. Also estimate
#time-specific means for each gene so that the pattern of
#expression over time can be monitored.
#
##########
rcbd.results=rcbd.analysis(logd,ntrt=11)
pvals=rcbd.results$pvals
means=rcbd.results$means
stderrs=rcbd.results$stderrs
#Take a look at the first few rows of means.
means[1:5,]
####10####
#
#Plot means + and - 1 SE for a given gene.
#
##########
plot.means(99)
#Re-plot with more appropriate spacing and labels.
plot.means(99, x=c(0,1,2,4,8,12,13,14,16,20,24),xlab="Time (hours)")
####11####
#
#Make a histogram of the p-values.
#
##########
hist(pvals)
#It looks like a lot of genes change expression.
#Perhaps that is not surprising given the plant processes going on
#over a 24-hour period with a dark and light transition.
####12####
#
#Compute a q-value for each gene using the method of Benjamini
#and Hochberg (1995).
#
#########
qvals=bh.fdr(pvals)
####13####
#
#How many genes will you declare significant if you wish to keep
#the false discovery rate no larger than 1%?
#
########
sum(qvals<=0.01)
####14####
#
#How many genes will you declare significant if you wish to keep
#the false discovery rate no larger than 0.0001?
#
#########
sum(qvals<=0.0001)
####15####
#
#Obtain matrix of means for the genes with q-values <= 0.0001.
#We will try clustering this small subset of genes for illustration
#purposes.
#
########
m=means[qvals<=0.0001,]
#Take a look at the first few rows of the matrix.
m[1:4,]
####16####
#
#Standardize the matrix of means so that Euclidean distance will
#be related to 1-correlation dissimilarity as discussed in class.
#
########
x=standardize(m)
####17####
#
#Find all pairwise Euclidean distances between objects.
#
########
D=dist(x)
#The function dist finds Euclidean distances by default.
#Type ?dist to see other distance choices.
?dist
####18####
#
#load the clustering library so that we have access to built-in
#clustering functions in R.
#
########
library(cluster)
####19####
#
#Cluster the data with K-mediods clustering using K=3.
#
#######
out=pam(D,k=3)
####20####
#
#Make parallel coordinate plots of the clusters.
#
########
plot.clusters(x,out$cluster,out$medoids,
xvals=c(0,1,2,4,8,12,13,14,16,20,24),xlab="Time")
####21####
#
#The gap statistic approach suggested K=7 clusters
#The code can take a few minutes to run, but you could
#do so as follows:
#
#choose.k=gap(x,12,100)
#choose.k$gapK
#[7]
#
#Because the selected number of clusters depends on randomly generated
#values you might not get 7 when you run it. K=11 comes up sometimes
#as well.
#
#Cluster the data with K-medoids clustering using K=7.
#
#########
out=pam(D,k=7)
####22####
#
#Plot the clusters.
#
#########
plot.clusters(x,out$cluster,out$medoids,
xvals=c(0,1,2,4,8,12,13,14,16,20,24),xlab="Time")
####23####
#
#Examine Silhouette Widths.
#
#########
plot(out)
attributes(out)
attributes(out$silinfo)
out$silinfo$widths
#Note gene 9726 is one of the genes with a negative silhouette width.
#The average distance to points in cluster 3 is less than the
#average distance to points in its own cluster (cluster 6).
plot.means(9726)
####24####
#
#Perform agglomerative hierarchical clustering using average linkage
#to measure the distance between clusters.
#
#########
out=hclust(D,method="average")
####25####
#
#Plot the corresponding dendrogram.
#
#########
plot(out,hang=-1,labels=F)
box()
####26####
#
#Examine the genes joined during the first 10 steps of the clustering.
#
########
out$merge[1:10,]
#This indicates that rows 25 and 210 were the first two data objects joined.
#The standardized data and gene ids for these rows are determined as follows.
x[c(25,210),]
#In terms of Affy ids and original data, these genes are determined as follows.
d[c(1486,18255),]
#Check their correlation.
cor(as.numeric(logd[1486,-1]),as.numeric(logd[18255,-1]))
#Plot the means +/- standard errors for these genes.
par(mfrow=c(1,2))
plot.means(1486,x=c(0,1,2,4,8,12,13,14,16,20,24),xlab="Time (hours)")
plot.means(18255,x=c(0,1,2,4,8,12,13,14,16,20,24),xlab="Time (hours)")
#Turn off two plots per window.
dev.off()
####27####
#
#Perform divisive clustering using the approach proposed by
#Macnaughton-Smith et al. (1965).
#
#########
out=diana(D)
####28####
#
#Plot the corresponding dendrogram.
#
#########
plot(as.hclust(out),labels=F)
####29####
#
#See cluster assignments for k=7 clusters according to the dendrogram.
#
########
cutree(out,k=7)
####30####
#
#Plot the 7 clusters.
#
########
plot.clusters(x,cutree(out,k=7),
xvals=c(0,1,2,4,8,12,13,14,16,20,24),xlab="Time")
####31####
#
#Compute principal components.
#
########
pcs=prcomp(x,center=T,scale.=T,retx=T)
#
#pcs$x is now a matrix containing the principal components of the data x.
#pcs$x will have one row for each row in the original matrix x and one
#column for each principal component.
####32####
#
#Find the proportion of variation explained by each PC.
#
########
cbind(1:ncol(x),pcs$sdev^2/sum(pcs$sdev^2))
####33####
#
#Find the cumulative proportion of variation explained by the PCs.
#
########
cbind(1:ncol(x),cumsum(pcs$sdev^2)/sum(pcs$sdev^2))
####34####
#
#Plot pc2 vs. pc1.
#
########
plot(pcs$x[,1],pcs$x[,2])
#
#Color the points according to their cluster ids.
#
plot(pcs$x[,1],pcs$x[,2],col=cutree(out,k=7))
#
#Color and label points according to their cluster ids.
#
plot(pcs$x[,1],pcs$x[,2],col=cutree(out,k=7),pch=as.character(cutree(out,k=7)))
####35####
#
#Make a "heat map" for the data matrix.
#Use the divisive clustering for genes
#and agglomerative clustering for times.
#
########
D.times=dist(t(x))
out.times=hclust(D.times,method="average")
heatmap(x,Rowv=as.dendrogram(as.hclust(out)),Colv=as.dendrogram(out.times),
labCol=c(0,1,2,4,8,12,13,14,16,20,24))
heatmap(x,Rowv=as.dendrogram(as.hclust(out)),Colv=as.dendrogram(out.times),
labCol=c(0,1,2,4,8,12,13,14,16,20,24),labRow=F)
#The lighter the color the higher the expression.