Peter Langfelder and Steve Horvath tutorials are here
Paper: Navarro et al, 2021
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# if(!require("limma")) BiocManager::install("limma"); library("limma")
library(edgeR)
library(DT)
library(sva)
library(limma)
library(factoextra)
library(WGCNA)
library(data.table)
library(flashClust)
library(DESeq2)
library(tidyverse)
https://zenodo.org/record/4715907
# Metadata
metadata = as.data.frame(data.table::fread(paste0(expr_dir, "MyND/MyND_metadata.txt"), header = T, stringsAsFactors = F, check.names = F))
# Counts expression data
data = data.table::fread(paste0(expr_dir, "MyND/monocyte_counts_matrix.txt"), header = T, stringsAsFactors = F, check.names = F)
# dim(data)
# data[1:5, 1:5]
data_tmp = as.data.frame(data)
rownames(data_tmp) = data$ENSEMBL_ID
data_tmp$ENSEMBL_ID = NULL
data_tmp = data_tmp[, metadata$rnaseq_id] # order the expression table
identical(colnames(data_tmp), metadata$rnaseq_id) # must be TRUE
## [1] TRUE
data_tmp[1:3,1:3]
## ADRC-111-01_1 ADRC-11955-01_2 ADRC-18133-01_1
## ENSG00000000003.14 0 1 0
## ENSG00000000005.6 0 0 0
## ENSG00000000419.12 295 1246 426
x <- DGEList(counts=as.matrix(data_tmp), samples=metadata)
cpm = cpm(x)
# cpm[1:3,1:3]
keep.exp = rowSums(cpm > 1) >= 0.3*ncol(x) # Filter non-expressed genes
x = x[keep.exp,]
dim(x$counts)
## [1] 13667 230
counts_filt = as.data.frame(x$counts) # counts filtered matrix
# counts_filt[1:3, 1:3]
# Let's factor be factor!
metadata$sex = as.factor(metadata$sex)
metadata$diagnosis = as.factor(metadata$diagnosis)
metadata$aj_status = as.factor(metadata$aj_status)
metadata$population_assignment = as.factor(metadata$population_assignment)
# Data normalization
dds <- DESeqDataSetFromMatrix(countData = round(counts_filt),
colData = metadata,
design = ~ diagnosis)
## converting counts to integer mode
gExpr <- DGEList(counts=assay(dds))
gExpr <- calcNormFactors(gExpr)
vobjGenes <- voom(gExpr, model.matrix( ~ diagnosis, metadata) )
gene_counts_voom = vobjGenes$E # normalized matrix
res.pca = prcomp(t(gene_counts_voom))
fviz_pca_ind(res.pca,
habillage = metadata$diagnosis)
# SVA network for data adjustment
mod0 <- model.matrix(~ 1, colData(dds))
mod <- model.matrix(design(dds), colData(dds))
nsv <- num.sv(gene_counts_voom, mod, method = "be")
message(paste0("Number of SVs proposed: ", nsv))
## Number of SVs proposed: 12
resid_expr = sva_network(gene_counts_voom, nsv) # matrix adjusted
resid_expr_t = t(resid_expr)
res.pca2 = prcomp(resid_expr_t)
fviz_pca_ind(res.pca2,
habillage = metadata$diagnosis)
# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to=20, by=2))
# Call the network topology analysis function
sft = pickSoftThreshold(resid_expr_t, powerVector = powers, verbose = 5)
## pickSoftThreshold: will use block size 3273.
## pickSoftThreshold: calculating connectivity for given powers...
## ..working on genes 1 through 3273 of 13667
## Warning: executing %dopar% sequentially: no parallel backend registered
## ..working on genes 3274 through 6546 of 13667
## ..working on genes 6547 through 9819 of 13667
## ..working on genes 9820 through 13092 of 13667
## ..working on genes 13093 through 13667 of 13667
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.198 -2.15 0.991 1.24e+03 1.23e+03 2110.00
## 2 2 0.590 -2.52 0.998 1.87e+02 1.77e+02 534.00
## 3 3 0.796 -2.64 0.997 3.78e+01 3.27e+01 179.00
## 4 4 0.885 -2.59 0.997 9.53e+00 7.23e+00 70.90
## 5 5 0.937 -2.40 0.992 2.89e+00 1.82e+00 33.70
## 6 6 0.973 -2.26 0.989 1.03e+00 5.19e-01 21.70
## 7 7 0.979 -2.04 0.978 4.28e-01 1.61e-01 16.40
## 8 8 0.949 -1.88 0.941 2.06e-01 5.38e-02 13.50
## 9 9 0.964 -1.69 0.956 1.13e-01 1.92e-02 11.30
## 10 10 0.956 -1.55 0.945 7.06e-02 7.07e-03 9.69
## 11 12 0.960 -1.32 0.953 3.64e-02 1.08e-03 7.32
## 12 14 0.934 -1.26 0.915 2.43e-02 1.85e-04 6.99
## 13 16 0.386 -1.82 0.339 1.87e-02 3.33e-05 6.98
## 14 18 0.382 -1.97 0.336 1.57e-02 6.50e-06 6.98
## 15 20 0.386 -1.93 0.337 1.39e-02 1.30e-06 6.98
# Scale-free topology fit index as a function of the soft-thresholding power
cex1 = 0.9
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red")
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
softPower = 5
adjacency = adjacency(resid_expr_t, power = softPower, type = "signed")
TOM = TOMsimilarity(adjacency, TOMType="signed")
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
dissTOM = 1-TOM
geneTree = flashClust(as.dist(dissTOM), method="average")
plot(geneTree, xlab="", sub="", main= "Gene Clustering on TOM-based dissimilarity", labels= FALSE, hang=0.04)
# "We like large modules" - Tutorial, so we set the minimum module size relatively high:
minModuleSize = 30
# Module identification using dynamic tree cut:
dynamicMods = cutreeDynamic(dendro = geneTree,
distM = dissTOM,
deepSplit = 3,
pamRespectsDendro = FALSE,
minClusterSize = minModuleSize);
..cutHeight not given, setting it to 0.964 ===> 99% of the (truncated) height range in dendro. ..done.
clusters_size = data.matrix(table(dynamicMods))
clusters_size = cbind(rownames(clusters_size),clusters_size)
colnames(clusters_size) = c("cluster","size")
# Convert numeric lables into colors
dynamicColors = labels2colors(dynamicMods)
clusters_colors = data.matrix(table(dynamicColors))
clusters_colors = cbind(rownames(clusters_colors), clusters_colors)
colnames(clusters_colors) = c("color","size")
mytable = table(dynamicColors)
# as.data.frame(t(as.matrix(unclass(mytable))))
# Plot the dendrogram and colors underneath
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")
length(unique(dynamicColors))
[1] 88
# Calculate eigengenes
MEList = moduleEigengenes(resid_expr_t, colors = dynamicColors)
MEs = MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs)
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average")
plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
#We choose a height cut of 0.25, corresponding to correlation of 0.75, to merge
MEDissThres = 0.25
# Plot the cut line into the dendrogram
abline(h=MEDissThres, col = "red")
# Call an automatic merging function
merge = mergeCloseModules(resid_expr_t, dynamicColors, cutHeight = MEDissThres, verbose = 3)
## mergeCloseModules: Merging modules whose distance is less than 0.25
## multiSetMEs: Calculating module MEs.
## Working on set 1 ...
## moduleEigengenes: Calculating 88 module eigengenes in given set.
## multiSetMEs: Calculating module MEs.
## Working on set 1 ...
## moduleEigengenes: Calculating 87 module eigengenes in given set.
## Calculating new MEs...
## multiSetMEs: Calculating module MEs.
## Working on set 1 ...
## moduleEigengenes: Calculating 87 module eigengenes in given set.
# The merged module colors
mergedColors = merge$colors;
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs
#png(paste0(expr_dir, "/MyND/Merge_modules.png"), width = 16, height = 8, res = 300, units = "in")
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
#### For use of the new MERGED DATA!!!
# Rename to moduleColors
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1;
MEs = mergedMEs;
# Save module colors and labels for use in subsequent parts
save(MEs, mergedMEs, moduleColors, geneTree, adjacency, file = paste0(expr_dir, "MyND/WGCNA_monocytes.RData"))
length(unique(moduleColors))
[1] 87
metadata_selected2 = metadata[,c("diagnosis", "sex", "age")]
metadata_selected2$diagnosis = as.numeric(metadata_selected2$diagnosis)
metadata_selected2$sex = as.numeric(metadata_selected2$sex)
metadata_selected2$age = as.numeric(metadata_selected2$age)
nGenes = ncol(resid_expr_t)
nSamples = nrow(resid_expr_t)
# Recalculate MEs with color labels
MEs0 = moduleEigengenes(resid_expr_t, moduleColors)$eigengenes
MEs = orderMEs(MEs0) #Reorder eigenvectors such that similar ones measured by correlation are next to each other
moduleTraitCor = WGCNA::cor(MEs, metadata_selected2, use = "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "")
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 20, 3, 3));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(metadata_selected2),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = greenWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
## Warning in greenWhiteRed(50): WGCNA::greenWhiteRed: this palette is not suitable for people
## with green-red color blindness (the most common kind of color blindness).
## Consider using the function blueWhiteRed instead.
gene_ids = colnames(resid_expr_t)
gene_ids = colnames(resid_expr_t)
# save(moduleColors, file = paste0(work_plots, "MyND/moduleColors.Rdata"))
gene_modules = cbind(gene_ids, moduleColors)
gene_modules = as.data.frame(gene_modules)
## Get conversion table for Gencode 30
gencode_30 = read.table(paste0(expr_dir, "ens.geneid.gencode.v30"))
colnames(gencode_30) = c("ensembl","symbol")
gene_modules_symbol = merge(gene_modules, gencode_30, by.x = "gene_ids", by.y = "ensembl")
# write.table(gene_modules_symbol, file = paste0(expr_dir, "MyND/geneBymodule.txt"), quote = F, row.names = F, sep = "\t")
datatable(gene_modules_symbol)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
mytable2 = table(gene_modules_symbol$moduleColors)
datatable(as.data.frame(t(as.matrix(unclass(mytable2)))))
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Stream 8
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblasp-r0.3.15.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] forcats_0.5.1 stringr_1.4.0
## [3] dplyr_1.0.8 purrr_0.3.4
## [5] readr_2.1.2 tidyr_1.2.0
## [7] tibble_3.2.1 tidyverse_1.3.1
## [9] DESeq2_1.34.0 SummarizedExperiment_1.24.0
## [11] Biobase_2.54.0 MatrixGenerics_1.6.0
## [13] matrixStats_0.61.0 GenomicRanges_1.46.1
## [15] GenomeInfoDb_1.30.1 IRanges_2.28.0
## [17] S4Vectors_0.32.3 BiocGenerics_0.40.0
## [19] flashClust_1.01-2 data.table_1.14.2
## [21] WGCNA_1.70-3 fastcluster_1.2.3
## [23] dynamicTreeCut_1.63-1 factoextra_1.0.7
## [25] ggplot2_3.4.1 sva_3.42.0
## [27] BiocParallel_1.28.3 genefilter_1.76.0
## [29] mgcv_1.8-38 nlme_3.1-153
## [31] DT_0.30 edgeR_3.36.0
## [33] limma_3.50.1
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.4.1 Hmisc_4.6-0
## [4] splines_4.1.2 crosstalk_1.2.0 digest_0.6.29
## [7] foreach_1.5.2 htmltools_0.5.2 GO.db_3.14.0
## [10] fansi_1.0.4 magrittr_2.0.3 checkmate_2.0.0
## [13] memoise_2.0.1 cluster_2.1.2 doParallel_1.0.17
## [16] tzdb_0.2.0 Biostrings_2.62.0 annotate_1.72.0
## [19] modelr_0.1.8 jpeg_0.1-9 colorspace_2.0-3
## [22] blob_1.2.2 rvest_1.0.2 ggrepel_0.9.1
## [25] haven_2.4.3 xfun_0.29 crayon_1.5.0
## [28] RCurl_1.98-1.6 jsonlite_1.7.3 impute_1.68.0
## [31] survival_3.2-13 iterators_1.0.14 glue_1.6.2
## [34] gtable_0.3.0 zlibbioc_1.40.0 XVector_0.34.0
## [37] DelayedArray_0.20.0 car_3.0-12 abind_1.4-5
## [40] scales_1.2.1 DBI_1.1.2 rstatix_0.7.0
## [43] Rcpp_1.0.8 xtable_1.8-4 htmlTable_2.4.0
## [46] foreign_0.8-81 bit_4.0.4 preprocessCore_1.56.0
## [49] Formula_1.2-4 htmlwidgets_1.5.4 httr_1.4.2
## [52] RColorBrewer_1.1-2 ellipsis_0.3.2 farver_2.1.0
## [55] pkgconfig_2.0.3 XML_3.99-0.8 nnet_7.3-16
## [58] sass_0.4.0 dbplyr_2.1.1 locfit_1.5-9.4
## [61] utf8_1.2.3 labeling_0.4.2 tidyselect_1.1.2
## [64] rlang_1.1.1 AnnotationDbi_1.56.2 munsell_0.5.0
## [67] cellranger_1.1.0 tools_4.1.2 cachem_1.0.6
## [70] cli_3.6.1 generics_0.1.2 RSQLite_2.2.10
## [73] broom_0.7.12 evaluate_0.15 fastmap_1.1.0
## [76] yaml_2.3.5 knitr_1.37 bit64_4.0.5
## [79] fs_1.5.2 KEGGREST_1.34.0 xml2_1.3.3
## [82] compiler_4.1.2 rstudioapi_0.13 png_0.1-7
## [85] ggsignif_0.6.3 reprex_2.0.1 geneplotter_1.72.0
## [88] bslib_0.3.1 stringi_1.7.6 highr_0.9
## [91] lattice_0.20-45 Matrix_1.5-1 vctrs_0.6.3
## [94] pillar_1.9.0 lifecycle_1.0.3 jquerylib_0.1.4
## [97] bitops_1.0-7 R6_2.5.1 latticeExtra_0.6-29
## [100] gridExtra_2.3 codetools_0.2-18 assertthat_0.2.1
## [103] withr_2.5.0 GenomeInfoDbData_1.2.7 parallel_4.1.2
## [106] hms_1.1.1 grid_4.1.2 rpart_4.1-15
## [109] rmarkdown_2.11 carData_3.0-5 ggpubr_0.4.0
## [112] lubridate_1.8.0 base64enc_0.1-3