################################################### # # This file contains R code created to illustrate # FDR calculation and gene category testing as part # of the useR!2007 short course # # "Analyzing transcriptomic, metabolomic, and # biological network data using R, Bioconductor, # and GGobi" # # Author: Dan Nettleton # Date: August 7, 2007 # ################################################### ##### 1 ##### # #Load libraries # ############# library(GOstats) library(ALL) library(hgu95av2) library(qvalue) ##### 2 ##### # #Load the ALL data. #This is microarray data on 128 patients with #Acute Lymphocytic Leukemia (ALL) of two basic types #(B-cell and T-cell). # #See Chiaretti, S. et al. (2004) Blood 103, 2771-2778. # ############# data(ALL) ##### 3 ##### # #Examine the first 10 "feature names". #These are the names of the first 10 probe sets. # ############# featureNames(ALL)[1:10] ##### 4 ##### # #Examine the sample names. #There is one sample for each of 128 patients. # ############# sampleNames(ALL) ##### 5 ##### # #See the names of variables in the data set. # ############# varLabels(ALL) ##### 6 ##### # #Examine the B-cell or T-cell designation for each sample. # ############# phenoData(ALL)$BT ##### 7 ##### # #Define a new factor that has B- or T-cell ALL type designation. # ############# ALLtype=as.factor(substr(phenoData(ALL)$BT,1,1)) ##### 8 ##### # #Put the microarray data in a matrix y. #Rows correspond to probe sets and columns to samples. # ############# y=exprs(ALL) ##### 9 ##### # #Write some code that will allow us to get the usual two-sample #t-test p-value for each probe set. # ############# get.ttest.pvalue=function(y) { return(t.test(y[ALLtype=="B"],y[ALLtype=="T"],var.equal=T)$p.value) } tp=apply(y,1,get.ttest.pvalue) ##### 10 ##### # #Write some code that will allow us to get the Wilcoxon #rank sum p-value for each probe set. Then make a histogram #of those p-values, which will indicate that many genes differed #significantly in expression between B-cell and T-cell ALL. # ############## get.wtest.pvalue=function(y) { return(wilcox.test(y[ALLtype=="B"],y[ALLtype=="T"])$p.value) } wp=apply(y,1,get.wtest.pvalue) hist(wp,col=4) box() ##### 11 ##### # #Examine the q-value manual and then convert the p-values #to q-values for False Discovery Rate (FDR) estimation. # ############## vignette("qvalue") wq=qvalue(wp)$qvalues ##### 12 ##### # #Determine the number of probe sets we can declare significant #if we want FDR to be <= 0.05, and examine the names of these #probe sets. # ############## sum(wq<=0.05) names(wq[wq<=0.05]) ##### 13 ##### # #Find Biological Process (BP) Gene Ontology (GO) IDs #associated with the probe sets on the Affymetrix #hgu95av2 GeneChip. # ############## BPIDs=eapply(hgu95av2GO,getOntology,"BP") BPIDs[1:4] ##### 14 ##### # #Explicitly include the most generally applicable GO terms #for each probe set. For example, if a probe set is #associated with "ion transport" it should automatically #be associated with "transport". # ############## getAncestors=function(IDs){ if(length(IDs)>0){ IDs=sort(unique(c(IDs,unlist(mget(IDs,envir=GOBPANCESTOR)))))[-1] } return(IDs) } BPIDs=lapply(BPIDs,getAncestors) ##### 15 ##### # #Examine GO IDs for a few probe sets, and determine the GO #terms associated with a few GO IDs. # ############## BPIDs[1:4] getTerms=function(TermIDs){ n=length(TermIDs) Terms=rep(0,n) for(i in 1:n){ Terms[i]=Term(as.list(mget(TermIDs[i],envir=GOTERM))[[1]]) } return(Terms) } cbind(BPIDs$"34687_at",getTerms(BPIDs$"34687_at")) ##### 16 ##### # #Find the genes associated with the GO term #"ion transport". # ############## getCategory=function(IDs,termID){ check=function(ID,termpID){ return(termID%in%ID) } return(unlist(lapply(IDs,check,termID))) } ionTransportGenes=getCategory(BPIDs,"GO:0006811") ##### 17 ##### # #Test for a difference between B-cell and T-cell samples #with respect to the multivariate expression distribution #of ion transport genes. # #Several functions are provided below that are to be used #with the main function "mrpp". All functions were written #by Dan Nettleton. Further details are provided in a #technical report that is available from the author by #request (dnett@iastate.edu). # ############## e.com= function (y) { K=ncol(y) scale=rep(0,K) for(i in 1:K){ D=dist(y[,i]) scale[i]=1/sum(D) } y=sweep(y,2,scale,FUN="*") return(y) } get.delta= function (x,D,n,N,w) { inter.sum=tapply(1:N,x,FUN="inter.dist",D) inter.mean=inter.sum*2/(n*(n-1)) delta=sum(w*inter.mean) return(delta) } inter.dist= function (index,D) { return(sum(as.matrix(D)[index,index])/2) } mrpp= function (x,y,nperm) { D=dist(y) x=as.factor(x) n=table(x) N=sum(n) w=n/N delta.obs=get.delta(x,D,n,N,w) delta=rep(0,nperm) for(i in 1:nperm){ delta[i]=get.delta(sample(x),D,n,N,w) } return(p=mean(delta.obs>=delta)) } # #Obtain permutation p-value for the test #for a difference between B-cell and T-cell samples #with respect to the multivariate expression distribution #of ion transport genes. # #Only 100 data permutations are used here to reduce run time. #Nonetheless, the significance of the difference is not in doubt. # mrpp(ALLtype,e.com(t(y[ionTransportGenes,])),nperm=100) ##### 18 ##### # #Test for a difference between male and female samples of #type B1 with respect to the multivariate expression distribution #of ion transport genes. # ############## x=phenoData(ALL) table(x$BT,x$sex) mrpp(x$sex[x$BT=="B1"],e.com(t(y[ionTransportGenes,x$BT=="B1"])),nperm=100) # #Again only 100 permutations were used to reduce run time. #Even with 100 permutations it is obvious that there is no significant #difference between male and female samples for this subset of the data. #