Peter Langfelder and Steve Horvath tutorials are here

Paper: Navarro et al, 2021

# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")

# if(!require("limma")) BiocManager::install("limma"); library("limma")
library(edgeR)
library(DT)
library(sva)
library(limma)
library(factoextra)
library(WGCNA)
library(data.table)
library(flashClust)
library(DESeq2)
library(tidyverse)

Download dataset

https://zenodo.org/record/4715907

# Metadata 
metadata = as.data.frame(data.table::fread(paste0(expr_dir, "MyND/MyND_metadata.txt"), header = T, stringsAsFactors = F, check.names = F))

# Counts expression data 
data = data.table::fread(paste0(expr_dir, "MyND/monocyte_counts_matrix.txt"), header = T, stringsAsFactors = F, check.names = F)
# dim(data)
# data[1:5, 1:5]
data_tmp = as.data.frame(data)
rownames(data_tmp) = data$ENSEMBL_ID
data_tmp$ENSEMBL_ID = NULL
data_tmp = data_tmp[, metadata$rnaseq_id] # order the expression table 
identical(colnames(data_tmp), metadata$rnaseq_id) # must be TRUE

## [1] TRUE

data_tmp[1:3,1:3]

##                    ADRC-111-01_1 ADRC-11955-01_2 ADRC-18133-01_1
## ENSG00000000003.14             0               1               0
## ENSG00000000005.6              0               0               0
## ENSG00000000419.12           295            1246             426

x <- DGEList(counts=as.matrix(data_tmp), samples=metadata)
cpm = cpm(x)
# cpm[1:3,1:3]

keep.exp = rowSums(cpm > 1) >= 0.3*ncol(x) # Filter non-expressed genes
x = x[keep.exp,]
dim(x$counts)

## [1] 13667   230

counts_filt = as.data.frame(x$counts) # counts filtered matrix 
# counts_filt[1:3, 1:3]

PCA before adjustment

# Let's factor be factor! 
metadata$sex = as.factor(metadata$sex)
metadata$diagnosis = as.factor(metadata$diagnosis)
metadata$aj_status = as.factor(metadata$aj_status)
metadata$population_assignment = as.factor(metadata$population_assignment)

# Data normalization 
dds <- DESeqDataSetFromMatrix(countData = round(counts_filt),
                             colData = metadata,
                             design = ~ diagnosis)

## converting counts to integer mode

gExpr <- DGEList(counts=assay(dds))
gExpr <- calcNormFactors(gExpr)
vobjGenes <- voom(gExpr, model.matrix( ~ diagnosis, metadata) )
gene_counts_voom = vobjGenes$E # normalized matrix 

res.pca = prcomp(t(gene_counts_voom))
fviz_pca_ind(res.pca, 
             habillage = metadata$diagnosis)

PCA after adjustment

# SVA network for data adjustment 
mod0 <- model.matrix(~ 1, colData(dds))
mod <- model.matrix(design(dds), colData(dds))
nsv <- num.sv(gene_counts_voom, mod, method = "be")
message(paste0("Number of SVs proposed: ", nsv))

## Number of SVs proposed: 12

resid_expr = sva_network(gene_counts_voom, nsv) # matrix adjusted
resid_expr_t = t(resid_expr)

res.pca2 = prcomp(resid_expr_t)
fviz_pca_ind(res.pca2,
             habillage = metadata$diagnosis)

Power threshold

# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to=20, by=2))

# Call the network topology analysis function
sft = pickSoftThreshold(resid_expr_t, powerVector = powers, verbose = 5)

## pickSoftThreshold: will use block size 3273.
##  pickSoftThreshold: calculating connectivity for given powers...
##    ..working on genes 1 through 3273 of 13667

## Warning: executing %dopar% sequentially: no parallel backend registered

##    ..working on genes 3274 through 6546 of 13667
##    ..working on genes 6547 through 9819 of 13667
##    ..working on genes 9820 through 13092 of 13667
##    ..working on genes 13093 through 13667 of 13667
##    Power SFT.R.sq slope truncated.R.sq  mean.k. median.k.  max.k.
## 1      1    0.198 -2.15          0.991 1.24e+03  1.23e+03 2110.00
## 2      2    0.590 -2.52          0.998 1.87e+02  1.77e+02  534.00
## 3      3    0.796 -2.64          0.997 3.78e+01  3.27e+01  179.00
## 4      4    0.885 -2.59          0.997 9.53e+00  7.23e+00   70.90
## 5      5    0.937 -2.40          0.992 2.89e+00  1.82e+00   33.70
## 6      6    0.973 -2.26          0.989 1.03e+00  5.19e-01   21.70
## 7      7    0.979 -2.04          0.978 4.28e-01  1.61e-01   16.40
## 8      8    0.949 -1.88          0.941 2.06e-01  5.38e-02   13.50
## 9      9    0.964 -1.69          0.956 1.13e-01  1.92e-02   11.30
## 10    10    0.956 -1.55          0.945 7.06e-02  7.07e-03    9.69
## 11    12    0.960 -1.32          0.953 3.64e-02  1.08e-03    7.32
## 12    14    0.934 -1.26          0.915 2.43e-02  1.85e-04    6.99
## 13    16    0.386 -1.82          0.339 1.87e-02  3.33e-05    6.98
## 14    18    0.382 -1.97          0.336 1.57e-02  6.50e-06    6.98
## 15    20    0.386 -1.93          0.337 1.39e-02  1.30e-06    6.98

# Scale-free topology fit index as a function of the soft-thresholding power
cex1 = 0.9

plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red")
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")

Connectivity

# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")

TOM adjacency

softPower = 5
adjacency = adjacency(resid_expr_t, power = softPower, type = "signed") 
TOM = TOMsimilarity(adjacency, TOMType="signed")

## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.

dissTOM = 1-TOM
geneTree = flashClust(as.dist(dissTOM), method="average")

plot(geneTree, xlab="", sub="", main= "Gene Clustering on TOM-based dissimilarity", labels= FALSE, hang=0.04)

Net parameters

# "We like large modules" - Tutorial, so we set the minimum module size relatively high:
minModuleSize = 30
# Module identification using dynamic tree cut:
dynamicMods = cutreeDynamic(dendro = geneTree, 
                            distM = dissTOM,
                            deepSplit = 3, 
                            pamRespectsDendro = FALSE,
                            minClusterSize = minModuleSize);

..cutHeight not given, setting it to 0.964 ===> 99% of the (truncated) height range in dendro. ..done.

clusters_size = data.matrix(table(dynamicMods))
clusters_size = cbind(rownames(clusters_size),clusters_size)
colnames(clusters_size) = c("cluster","size")

# Convert numeric lables into colors
dynamicColors = labels2colors(dynamicMods)
clusters_colors = data.matrix(table(dynamicColors))
clusters_colors = cbind(rownames(clusters_colors), clusters_colors)
colnames(clusters_colors) = c("color","size")

mytable = table(dynamicColors)
# as.data.frame(t(as.matrix(unclass(mytable))))

Dendrogram colors

# Plot the dendrogram and colors underneath
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")

Number of modules before merge

length(unique(dynamicColors))

[1] 88

Clustering eigengenes

# Calculate eigengenes
MEList = moduleEigengenes(resid_expr_t, colors = dynamicColors)
MEs = MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs)
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average")

plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
#We choose a height cut of 0.25, corresponding to correlation of 0.75, to merge
MEDissThres = 0.25
# Plot the cut line into the dendrogram
abline(h=MEDissThres, col = "red")

Merge modules

# Call an automatic merging function
merge = mergeCloseModules(resid_expr_t, dynamicColors, cutHeight = MEDissThres, verbose = 3)

##  mergeCloseModules: Merging modules whose distance is less than 0.25
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 88 module eigengenes in given set.
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 87 module eigengenes in given set.
##    Calculating new MEs...
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 87 module eigengenes in given set.

# The merged module colors
mergedColors = merge$colors;
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs
#png(paste0(expr_dir, "/MyND/Merge_modules.png"), width = 16, height = 8, res = 300, units = "in")
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)

Merged network

#### For use of the new MERGED DATA!!! 
# Rename to moduleColors
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1;
MEs = mergedMEs;
# Save module colors and labels for use in subsequent parts
save(MEs, mergedMEs, moduleColors, geneTree, adjacency, file = paste0(expr_dir, "MyND/WGCNA_monocytes.RData"))

Number of modules after merge

length(unique(moduleColors))

[1] 87

Trait associations

metadata_selected2 = metadata[,c("diagnosis", "sex", "age")]
metadata_selected2$diagnosis = as.numeric(metadata_selected2$diagnosis)
metadata_selected2$sex = as.numeric(metadata_selected2$sex)
metadata_selected2$age = as.numeric(metadata_selected2$age)
nGenes = ncol(resid_expr_t)
nSamples = nrow(resid_expr_t)

# Recalculate MEs with color labels
MEs0 = moduleEigengenes(resid_expr_t, moduleColors)$eigengenes
MEs = orderMEs(MEs0) #Reorder eigenvectors such that similar ones measured by correlation are next to each other
moduleTraitCor = WGCNA::cor(MEs, metadata_selected2, use = "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)

# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "")
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 20, 3, 3));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(metadata_selected2), 
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = greenWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))

## Warning in greenWhiteRed(50): WGCNA::greenWhiteRed: this palette is not suitable for people
## with green-red color blindness (the most common kind of color blindness).
## Consider using the function blueWhiteRed instead.

gene_ids = colnames(resid_expr_t)
gene_ids = colnames(resid_expr_t)
# save(moduleColors, file = paste0(work_plots, "MyND/moduleColors.Rdata"))
gene_modules = cbind(gene_ids, moduleColors)
gene_modules = as.data.frame(gene_modules)

Module assignment

## Get conversion table for Gencode 30
gencode_30 = read.table(paste0(expr_dir, "ens.geneid.gencode.v30"))
colnames(gencode_30) = c("ensembl","symbol")
gene_modules_symbol = merge(gene_modules, gencode_30, by.x = "gene_ids", by.y = "ensembl")

# write.table(gene_modules_symbol, file = paste0(expr_dir, "MyND/geneBymodule.txt"), quote = F, row.names = F, sep = "\t")
datatable(gene_modules_symbol)

## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html

Module size

mytable2 = table(gene_modules_symbol$moduleColors)
datatable(as.data.frame(t(as.matrix(unclass(mytable2)))))

Session info

sessionInfo()

## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Stream 8
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblasp-r0.3.15.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] forcats_0.5.1               stringr_1.4.0              
##  [3] dplyr_1.0.8                 purrr_0.3.4                
##  [5] readr_2.1.2                 tidyr_1.2.0                
##  [7] tibble_3.2.1                tidyverse_1.3.1            
##  [9] DESeq2_1.34.0               SummarizedExperiment_1.24.0
## [11] Biobase_2.54.0              MatrixGenerics_1.6.0       
## [13] matrixStats_0.61.0          GenomicRanges_1.46.1       
## [15] GenomeInfoDb_1.30.1         IRanges_2.28.0             
## [17] S4Vectors_0.32.3            BiocGenerics_0.40.0        
## [19] flashClust_1.01-2           data.table_1.14.2          
## [21] WGCNA_1.70-3                fastcluster_1.2.3          
## [23] dynamicTreeCut_1.63-1       factoextra_1.0.7           
## [25] ggplot2_3.4.1               sva_3.42.0                 
## [27] BiocParallel_1.28.3         genefilter_1.76.0          
## [29] mgcv_1.8-38                 nlme_3.1-153               
## [31] DT_0.30                     edgeR_3.36.0               
## [33] limma_3.50.1               
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1           backports_1.4.1        Hmisc_4.6-0           
##   [4] splines_4.1.2          crosstalk_1.2.0        digest_0.6.29         
##   [7] foreach_1.5.2          htmltools_0.5.2        GO.db_3.14.0          
##  [10] fansi_1.0.4            magrittr_2.0.3         checkmate_2.0.0       
##  [13] memoise_2.0.1          cluster_2.1.2          doParallel_1.0.17     
##  [16] tzdb_0.2.0             Biostrings_2.62.0      annotate_1.72.0       
##  [19] modelr_0.1.8           jpeg_0.1-9             colorspace_2.0-3      
##  [22] blob_1.2.2             rvest_1.0.2            ggrepel_0.9.1         
##  [25] haven_2.4.3            xfun_0.29              crayon_1.5.0          
##  [28] RCurl_1.98-1.6         jsonlite_1.7.3         impute_1.68.0         
##  [31] survival_3.2-13        iterators_1.0.14       glue_1.6.2            
##  [34] gtable_0.3.0           zlibbioc_1.40.0        XVector_0.34.0        
##  [37] DelayedArray_0.20.0    car_3.0-12             abind_1.4-5           
##  [40] scales_1.2.1           DBI_1.1.2              rstatix_0.7.0         
##  [43] Rcpp_1.0.8             xtable_1.8-4           htmlTable_2.4.0       
##  [46] foreign_0.8-81         bit_4.0.4              preprocessCore_1.56.0 
##  [49] Formula_1.2-4          htmlwidgets_1.5.4      httr_1.4.2            
##  [52] RColorBrewer_1.1-2     ellipsis_0.3.2         farver_2.1.0          
##  [55] pkgconfig_2.0.3        XML_3.99-0.8           nnet_7.3-16           
##  [58] sass_0.4.0             dbplyr_2.1.1           locfit_1.5-9.4        
##  [61] utf8_1.2.3             labeling_0.4.2         tidyselect_1.1.2      
##  [64] rlang_1.1.1            AnnotationDbi_1.56.2   munsell_0.5.0         
##  [67] cellranger_1.1.0       tools_4.1.2            cachem_1.0.6          
##  [70] cli_3.6.1              generics_0.1.2         RSQLite_2.2.10        
##  [73] broom_0.7.12           evaluate_0.15          fastmap_1.1.0         
##  [76] yaml_2.3.5             knitr_1.37             bit64_4.0.5           
##  [79] fs_1.5.2               KEGGREST_1.34.0        xml2_1.3.3            
##  [82] compiler_4.1.2         rstudioapi_0.13        png_0.1-7             
##  [85] ggsignif_0.6.3         reprex_2.0.1           geneplotter_1.72.0    
##  [88] bslib_0.3.1            stringi_1.7.6          highr_0.9             
##  [91] lattice_0.20-45        Matrix_1.5-1           vctrs_0.6.3           
##  [94] pillar_1.9.0           lifecycle_1.0.3        jquerylib_0.1.4       
##  [97] bitops_1.0-7           R6_2.5.1               latticeExtra_0.6-29   
## [100] gridExtra_2.3          codetools_0.2-18       assertthat_0.2.1      
## [103] withr_2.5.0            GenomeInfoDbData_1.2.7 parallel_4.1.2        
## [106] hms_1.1.1              grid_4.1.2             rpart_4.1-15          
## [109] rmarkdown_2.11         carData_3.0-5          ggpubr_0.4.0          
## [112] lubridate_1.8.0        base64enc_0.1-3

WGCNA for bulk RNASeq data

Katia Lopes
Systems Analyst and Computational Biologist

2023-10-09

Download dataset

PCA before adjustment

PCA after adjustment

Power threshold

Connectivity

TOM adjacency

Net parameters

Dendrogram colors

Number of modules before merge

Clustering eigengenes

Merge modules

Merged network

Number of modules after merge

Trait associations

Module assignment

Module size

Session info

WGCNA for bulk RNASeq data

Katia Lopes Systems Analyst and Computational Biologist

2023-10-09

Download dataset

PCA before adjustment

PCA after adjustment

Power threshold

Connectivity

TOM adjacency

Net parameters

Dendrogram colors

Number of modules before merge

Clustering eigengenes

Merge modules

Merged network

Number of modules after merge

Trait associations

Module assignment

Module size

Session info

Katia Lopes
Systems Analyst and Computational Biologist