####1#### # #Smith et al. (2004, Plant Physiology, 136, 2687-2699) were #interested in studying gene expression changes during a #24-hour period divided into a 12-hour dark period and #a 12-hour light period. In each of two independent replications, #plants were sampled at 0, 1, 2, 4, 8, 12, 13, 14, 16, 20, and #24 hours, where 0 hours represents the transition from light to #dark, 12 hours represents the transition from dark to light, #and 24 hours is again the transition from light to dark. One #Affymetrix GeneChip was used to measure expression in a sample #of leaves collected for each time point and replication. #The Affymetrix signal measurements from a total of 22 #microarray slides are available on the web in a .csv file. #Read the data using the following command. # ######## d=read.csv("http://www.public.iastate.edu/~dnett/microarray/diurnaldata.csv") ####2#### # #Check the dimensions of d. # ######### dim(d) ####3#### # #Examine the first five rows and first five columns of the data. # ######## d[1:5,1:5] ####4#### # #Make side-by-side boxplots of the signal data. # ######### boxplot(d[,-1]) ####5#### # #Make side-by-side boxplots of the log signal data. # ######## boxplot(log(d[,-1])) ####6#### # #You should have received some warnings because #some of the signal values are 0 so taking #the log is not permitted. Add 1 to each signal value, #log the results, and make side-byside #boxplots of this data. # ######## boxplot(log(d[,-1]+1)) ####7#### # #Examine some scatterplots of log(signal+1) for one chip vs. log(signal+1) #for another. # ######### plot(log(d[,2]+1),log(d[,3]+1)) lines(c(-99,99),c(-99,99),lwd=2,col=2) #Change the column numbers (2 and 3) to see other chips #For example to see time 0 vs. time 24 in rep 1... plot(log(d[,2]+1),log(d[,11]+1)) lines(c(-99,99),c(-99,99),lwd=2,col=2) #There appear to be some intensity-dependent chip effects in #some cases, but let's proceed assuming that the log(signal+1) #data is suitable for analysis without further normalization. logd=data.frame(d[,1],log(d[,-1]+1)) ####8#### # #Load some R functions for microarray data analysis from a file, #and check to see what new objects have been added. # ######### source("http://www.public.iastate.edu/~dnett/microarray/rfunctions.txt") ####9#### # #Conduct one test for each gene to identify genes that exhibit #some significant change in expression across the 11 time points #surveyed. Collect one p-value for each gene. Also estimate #time-specific means for each gene so that the pattern of #expression over time can be monitored. # ########## rcbd.results=rcbd.analysis(logd,ntrt=11) pvals=rcbd.results$pvals means=rcbd.results$means stderrs=rcbd.results$stderrs #Take a look at the first few rows of means. means[1:5,] ####10#### # #Plot means + and - 1 SE for a given gene. # ########## plot.means(99) #Re-plot with more appropriate spacing and labels. plot.means(99, x=c(0,1,2,4,8,12,13,14,16,20,24),xlab="Time (hours)") ####11#### # #Make a histogram of the p-values. # ########## hist(pvals) #It looks like a lot of genes change expression. #Perhaps that is not surprising given the plant processes going on #over a 24-hour period with a dark and light transition. ####12#### # #Compute a q-value for each gene using the method of Benjamini #and Hochberg (1995). # ######### qvals=bh.fdr(pvals) ####13#### # #How many genes will you declare significant if you wish to keep #the false discovery rate no larger than 1%? # ######## sum(qvals<=0.01) ####14#### # #How many genes will you declare significant if you wish to keep #the false discovery rate no larger than 0.0001? # ######### sum(qvals<=0.0001) ####15#### # #Obtain matrix of means for the genes with q-values <= 0.0001. #We will try clustering this small subset of genes for illustration #purposes. # ######## m=means[qvals<=0.0001,] #Take a look at the first few rows of the matrix. m[1:4,] ####16#### # #Standardize the matrix of means so that Euclidean distance will #be related to 1-correlation dissimilarity as discussed in class. # ######## x=standardize(m) ####17#### # #Find all pairwise Euclidean distances between objects. # ######## D=dist(x) #The function dist finds Euclidean distances by default. #Type ?dist to see other distance choices. ?dist ####18#### # #load the clustering library so that we have access to built-in #clustering functions in R. # ######## library(cluster) ####19#### # #Cluster the data with K-mediods clustering using K=3. # ####### out=pam(D,k=3) ####20#### # #Make parallel coordinate plots of the clusters. # ######## plot.clusters(x,out$cluster,out$medoids, xvals=c(0,1,2,4,8,12,13,14,16,20,24),xlab="Time") ####21#### # #The gap statistic approach suggested K=7 clusters #The code can take a few minutes to run, but you could #do so as follows: # #choose.k=gap(x,12,100) #choose.k$gapK #[7] # #Because the selected number of clusters depends on randomly generated #values you might not get 7 when you run it. K=11 comes up sometimes #as well. # #Cluster the data with K-medoids clustering using K=7. # ######### out=pam(D,k=7) ####22#### # #Plot the clusters. # ######### plot.clusters(x,out$cluster,out$medoids, xvals=c(0,1,2,4,8,12,13,14,16,20,24),xlab="Time") ####23#### # #Examine Silhouette Widths. # ######### plot(out) attributes(out) attributes(out$silinfo) out$silinfo$widths #Note gene 9726 is one of the genes with a negative silhouette width. #The average distance to points in cluster 3 is less than the #average distance to points in its own cluster (cluster 6). plot.means(9726) ####24#### # #Perform agglomerative hierarchical clustering using average linkage #to measure the distance between clusters. # ######### out=hclust(D,method="average") ####25#### # #Plot the corresponding dendrogram. # ######### plot(out,hang=-1,labels=F) box() ####26#### # #Examine the genes joined during the first 10 steps of the clustering. # ######## out$merge[1:10,] #This indicates that rows 25 and 210 were the first two data objects joined. #The standardized data and gene ids for these rows are determined as follows. x[c(25,210),] #In terms of Affy ids and original data, these genes are determined as follows. d[c(1486,18255),] #Check their correlation. cor(as.numeric(logd[1486,-1]),as.numeric(logd[18255,-1])) #Plot the means +/- standard errors for these genes. par(mfrow=c(1,2)) plot.means(1486,x=c(0,1,2,4,8,12,13,14,16,20,24),xlab="Time (hours)") plot.means(18255,x=c(0,1,2,4,8,12,13,14,16,20,24),xlab="Time (hours)") #Turn off two plots per window. dev.off() ####27#### # #Perform divisive clustering using the approach proposed by #Macnaughton-Smith et al. (1965). # ######### out=diana(D) ####28#### # #Plot the corresponding dendrogram. # ######### plot(as.hclust(out),labels=F) ####29#### # #See cluster assignments for k=7 clusters according to the dendrogram. # ######## cutree(out,k=7) ####30#### # #Plot the 7 clusters. # ######## plot.clusters(x,cutree(out,k=7), xvals=c(0,1,2,4,8,12,13,14,16,20,24),xlab="Time") ####31#### # #Compute principal components. # ######## pcs=prcomp(x,center=T,scale.=T,retx=T) # #pcs$x is now a matrix containing the principal components of the data x. #pcs$x will have one row for each row in the original matrix x and one #column for each principal component. ####32#### # #Find the proportion of variation explained by each PC. # ######## cbind(1:ncol(x),pcs$sdev^2/sum(pcs$sdev^2)) ####33#### # #Find the cumulative proportion of variation explained by the PCs. # ######## cbind(1:ncol(x),cumsum(pcs$sdev^2)/sum(pcs$sdev^2)) ####34#### # #Plot pc2 vs. pc1. # ######## plot(pcs$x[,1],pcs$x[,2]) # #Color the points according to their cluster ids. # plot(pcs$x[,1],pcs$x[,2],col=cutree(out,k=7)) # #Color and label points according to their cluster ids. # plot(pcs$x[,1],pcs$x[,2],col=cutree(out,k=7),pch=as.character(cutree(out,k=7))) ####35#### # #Make a "heat map" for the data matrix. #Use the divisive clustering for genes #and agglomerative clustering for times. # ######## D.times=dist(t(x)) out.times=hclust(D.times,method="average") heatmap(x,Rowv=as.dendrogram(as.hclust(out)),Colv=as.dendrogram(out.times), labCol=c(0,1,2,4,8,12,13,14,16,20,24)) heatmap(x,Rowv=as.dendrogram(as.hclust(out)),Colv=as.dendrogram(out.times), labCol=c(0,1,2,4,8,12,13,14,16,20,24),labRow=F) #The lighter the color the higher the expression.