Learning objective

The aim in this lecture is to understand and apply various clustering algorithms for exploratory data analysis.

Libraries to load

library(limma)
library(gplots)
library(e1071)
library(shiny)

Application of clustering algorithms to Leukemia dataset

ExpressionSet containing gene expresion data from 60 bone marrow samples of patients with one of the four main types of leukemia (ALL, AML, CLL, CML) or no-leukemia controls.

Platform: Affymetrix Human Genome U133 Plus 2.0

Annotation: genemapperhgu133plus2 (CDF from GATExplorer)

Mapping: Gene Ensembl ID (20172 features)

Tissue: Bone Marrow

Cell type: Mononuclear cells isolated by Ficoll density centrifugation

Disease type:
1. Acute Lymphoblastic Leukemia (ALL). Subtype: c-ALL / pre-B-ALL without t(9;22)
2. Acute Myeloid Leukemia (AML). Subtype: Normal karyotype
3. Chronic Lymphocytic Leukemia (CLL)
4. Chronic Myeloid Leukemia (CML)
5. Non-leukemia and healthy bone marrow (NoL)

All samples were obtained from untreated patients at the time of diagnosis. Preprocessing: The microarrays were normalized with RMA using a redefined probe mapping from Affymetrix probesets to Ensembl genes (Ensembl IDs ENSG). This alternative Chip Definition File (CDF) with complete unambiguous mapping of microarray probes to genes (GeneMapper) is available at GATExplorer (http://bioinfow.dep.usal.es/xgate/mapping/mapping.php) (Risueno et al. 2010).

# load data
library(leukemiasEset)
data(leukemiasEset)
# extract expression matrix from the "ExpressionSet object"
leukemia.mat <- exprs(leukemiasEset)
# obtain and summarize phenodata
phenodata <- pData(leukemiasEset)

hierarchical clustering analysis of Leukemia dataset

labs <- phenodata[,"LeukemiaType"]
#
par(mfrow=c(3,1))
data.dist=dist(t(leukemia.mat))
plot(hclust(data.dist), labels=labs, main="Complete Linkage", xlab="", sub="",ylab="", cex=1.3)
plot(hclust(data.dist, method="average"), labels=labs, main="Average Linkage", xlab="", sub="",ylab="", cex=1.3)
plot(hclust(data.dist, method="single"), labels=labs,  main="Single Linkage", xlab="", sub="",ylab="", cex=1.3)

hc.out=hclust(dist(t(leukemia.mat)))
hc.clusters = cutree(hc.out, 5)
table(hc.clusters, labs)
##            labs
## hc.clusters ALL AML CLL CML NoL
##           1   1   4   0   0   0
##           2   1   1   0   2   0
##           3  10   4   0   0   0
##           4   0   3   0  10  12
##           5   0   0  12   0   0
par(mfrow=c(1,1))
plot(hc.out, labels = labs)
abline(h=139, col="red")

hc.out
## 
## Call:
## hclust(d = dist(t(leukemia.mat)))
## 
## Cluster method   : complete 
## Distance         : euclidean 
## Number of objects: 60

Identify and select differentially expressed genes from Leukemia dataset using LIMMA package

# create the design matrix
f = phenodata[colnames(leukemia.mat), 3]
design = model.matrix(~ f - 1)
colnames(design) <- c("ALL", "AML", "CLL", "CML", "NoL")
fit <- lmFit(leukemia.mat , design) # fitting data to a linear model
contrast.matrix <- makeContrasts(ALL-NoL, AML-NoL, CLL-NoL, CML-NoL, levels=design)  # defining group comparisons
fit2 <- contrasts.fit(fit, contrast.matrix) # convert the coefficients of the design matrix into these 4 contrasts which are to be tested equal to zero
fit2 <- eBayes(fit2) # With a series of related parameter estimates & standard errors, eBayes computes moderated t-statistics of differential expression by empirical Bayes shrinkage of the standard errors towards a common value.
leukemia.topDE <-leukemia.mat[order(fit2$F.p.value), ][1:500,] # Select the top 500 most differentially expressed genes
dim(leukemia.topDE)
## [1] 500  60

Use heatmap to visualize hierarchical clustering results

patientcolors = sapply(as.character(f), switch, ALL = "red", AML =  "orange", CLL = "blue",  CML="green", NoL="gray")
heatmap.2(leukemia.topDE, col=bluered(75), ColSideColors=patientcolors, density.info="none", trace="none", na.color = "black", margins=c(8, 8), main="Clustering of top500 DE genes", dendrogram = "column")

output session information

## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14.3
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] leukemiasEset_1.14.0 Biobase_2.38.0       BiocGenerics_0.24.0 
## [4] shiny_1.0.5          e1071_1.7-0.1        gplots_3.0.1        
## [7] limma_3.34.9        
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.0         knitr_1.20         magrittr_1.5      
##  [4] xtable_1.8-2       R6_2.4.0           stringr_1.4.0     
##  [7] caTools_1.17.1     tools_3.4.3        KernSmooth_2.23-15
## [10] htmltools_0.3.6    gtools_3.5.0       class_7.3-14      
## [13] yaml_2.2.0         rprojroot_1.3-2    digest_0.6.18     
## [16] later_0.7.1        promises_1.0.1     bitops_1.0-6      
## [19] mime_0.5           evaluate_0.11      rmarkdown_1.10    
## [22] gdata_2.18.0       stringi_1.2.4      compiler_3.4.3    
## [25] backports_1.1.2    httpuv_1.4.0