Topic 1. k-means based clustering algorithms

Apply k-means clustering on iris data

dim(iris)
## [1] 150   5
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
table(iris$Species)
## 
##     setosa versicolor  virginica 
##         50         50         50
class(iris)
## [1] "data.frame"
# set up plots
par(mfrow=c(2,2))
# plot original data
speciesColors = sapply(as.character(iris$Species), switch, setosa = "red", versicolor = "blue",  virginica="green")
data.mat <- cbind(iris$Petal.Length, iris$Sepal.Width, iris$Species)
colnames(data.mat) <- c("Petal.Length", "Sepal.Width", "Species")
plot(data.mat[,-3], col=speciesColors, main="orignal data with class information")

# apply k-means with k=2
set.seed(1)
km.out2 <- kmeans(data.mat[,-3], centers=2)
plot(data.mat[,-3], col=(km.out2$cluster+1), main="k-means clustering results with k=2", xlab="f1", ylab="f2", pch=20, cex=2)

# apply k-means with k=3
set.seed(1)
km.out3 <- kmeans(data.mat[,-3], centers=3)
plot(data.mat[,-3], col=(km.out3$cluster+1), main="k-means clustering results with k=3", xlab="f1", ylab="f2", pch=20, cex=2)

# apply k-means with k=4
set.seed(1)
km.out4 <- kmeans(data.mat[,-3], centers=4)
plot(data.mat[,-3], col=(km.out4$cluster+1), main="k-means clustering results with k=4", xlab="f1", ylab="f2", pch=20, cex=2)

Cluster statistics

# what is the output from k-means clustering?
km.out2
## K-means clustering with 2 clusters of sizes 51, 99
## 
## Cluster means:
##   Petal.Length Sepal.Width
## 1     1.492157    3.409804
## 2     4.925253    2.875758
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [71] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2
## [106] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [141] 2 2 2 2 2 2 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 11.68196 74.62869
##  (between_SS / total_SS =  82.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
# check between and within cluster sum of squares.
km.out2$betweenss
## [1] 406.3217
km.out2$tot.withinss
## [1] 86.31065
km.out3$betweenss
## [1] 451.8953
km.out3$tot.withinss
## [1] 40.73707
km.out4$betweenss
## [1] 465.1476
km.out4$tot.withinss
## [1] 27.4847

Topic 2. Application of c-means clustering.

notice that we use the membership score to control the size of the point to plot. k-means clustering algorithm will not create membership score and will create a hard clustering whereas c-means gives membership score which can be used as a confidence meansure of clustering for each sample.

library(e1071)
## Warning: package 'e1071' was built under R version 3.4.4
# set up plots
par(mfrow=c(2,2))
# apply c-means with k=1
set.seed(1)
cm.out2 <- cmeans(data.mat[,-3], centers=2, m = 2)
plot(data.mat[,-3], col=(cm.out2$cluster+1), main="c-means clustering results with k=2", xlab="f1", ylab="f2", pch=20, cex=apply(cm.out2$membership+0.5, 1, max)^2)

# apply c-means with k=3
set.seed(1)
cm.out3 <- cmeans(data.mat[,-3], centers=3)
plot(data.mat[,-3], col=(cm.out3$cluster+1), main="c-means clustering results with k=3", xlab="f1", ylab="f2", pch=20, cex=apply(cm.out2$membership+0.5, 1, max)^2)

# apply c-means with k=4
set.seed(1)
cm.out4 <- cmeans(data.mat[,-3], centers=4)
plot(data.mat[,-3], col=(cm.out4$cluster+1), main="c-means clustering results with k=4", xlab="f1", ylab="f2", pch=20, cex=apply(cm.out2$membership+0.5, 1, max)^2)

Topic 3. Use k-means to clustering data and infer networks

# (1)
library(ClueR)
## Loading required package: parallel
data(adipocyte)

# (2)
adipocyte.exp <- adipocyte[,-ncol(adipocyte)]
idx <- which(rowSums(abs(adipocyte.exp) > 1) > 0)
adipocyte.reg <- adipocyte.exp[idx,]
# number of genes before filtering
nrow(adipocyte.exp)
## [1] 24213
# number of genes after filtering
nrow(adipocyte.reg)
## [1] 2742
# number of time points in the data
ncol(adipocyte.reg)
## [1] 7
# (3)
standardize <- function(mat) {
  means <- apply(mat, 1, mean)
  stds <- apply(mat, 1, sd)
  tmp <- sweep(mat, 1, means, FUN="-")
  mat.stand <- sweep(tmp, 1, stds, FUN="/")
  return(mat.stand)
}

adipocyte.scaled <- standardize(adipocyte.reg)

# (4) 
set.seed(1)
km <- kmeans(adipocyte.scaled, centers = 13)

# (5)
km.gene.clustered <- names(which(km$cluster == km$cluster["ISL1"]))

# (6)
library(igraph)
## Warning: package 'igraph' was built under R version 3.4.4
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
mat <- cor(t(adipocyte.scaled[km.gene.clustered,]))
mat[mat<0.99]=0

# Make an Igraph object from this matrix:
network <- graph_from_adjacency_matrix(mat, weighted=T, mode="undirected", diag=F)
V(network)$label.cex <- 0.5
l <- layout_on_sphere(network)
plot(network, layout=l, cex=1, vertex.size=3)