Peter Langfelder and Steve Horvath tutorials are here

Paper: Humphrey et al, 2022

# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")

# if(!require("limma")) BiocManager::install("limma"); library("limma")
library("edgeR")
library("DT")
if(!require("sva")) BiocManager::install("sva"); library("sva")
if(!require("limma")) BiocManager::install("limma"); library("limma")
library("factoextra")
library("WGCNA")
library("data.table")
library("flashClust")
library("DESeq2")
library("tidyverse")

Download dataset

https://zenodo.org/record/6385747

# Metadata 
metadata = as.data.frame(data.table::fread(paste0(expr_dir, "Thoracic_Spinal_Cord/Thoracic_Spinal_Cord_metadata.tsv"), header = T, stringsAsFactors = F, check.names = F))
rownames(metadata) = metadata$rna_id

# Counts expression data 
data = data.table::fread(paste0(expr_dir, "Thoracic_Spinal_Cord/Thoracic_Spinal_Cord_gene_counts.tsv"), header = T, stringsAsFactors = F, check.names = F)
dim(data)

## [1] 58884    54

data[1:5, 1:5]

##         ensembl_id gene_name sample_256 sample_271 sample_66
## 1: ENSG00000000003    TSPAN6        474        298       789
## 2: ENSG00000000005      TNMD          1          3         2
## 3: ENSG00000000419      DPM1        266        156       238
## 4: ENSG00000000457     SCYL3        301        166       121
## 5: ENSG00000000460  C1orf112        152        154       133

data_tmp = as.data.frame(data)
data_tmp$gene_name = NULL
rownames(data_tmp) = data$ensembl_id
data_tmp$ensembl_id = NULL
data_tmp = data_tmp[, metadata$rna_id] # order the expression table 
identical(colnames(data_tmp), metadata$rna_id) # must be TRUE

## [1] TRUE

data_tmp[1:3,1:3]

##                 sample_269 sample_239 sample_271
## ENSG00000000003        203        416        298
## ENSG00000000005          2          0          3
## ENSG00000000419        206        135        156

x <- DGEList(counts=as.matrix(data_tmp), samples=metadata)
cpm = cpm(x)
cpm[1:3,1:3]

##                 sample_269 sample_239 sample_271
## ENSG00000000003 9.57537549  23.415310 10.6861322
## ENSG00000000005 0.09433867   0.000000  0.1075785
## ENSG00000000419 9.71688350   7.598718  5.5940826

keep.exp = rowSums(cpm > 1) >= 0.3*ncol(x) #Filtering
x = x[keep.exp,]
dim(x$counts)

## [1] 17206    52

counts_filt = as.data.frame(x$counts)
counts_filt[1:3, 1:3]

##                 sample_269 sample_239 sample_271
## ENSG00000000003        203        416        298
## ENSG00000000419        206        135        156
## ENSG00000000457        127        147        166

PCA before adjustment

Counts matrix

metadata$sex = as.factor(metadata$sex)
metadata$disease = as.factor(metadata$disease)
metadata$library_prep = as.factor(metadata$library_prep)
metadata$site_id = as.factor(metadata$site_id)

res.pca = prcomp(t(counts_filt))
fviz_pca_ind(res.pca, 
             habillage = metadata$disease)

Data normalization

dds <- DESeqDataSetFromMatrix(countData = round(counts_filt),
                             colData = metadata,
                             design = ~ disease)

#Using the function of voom normalization
gExpr <- DGEList(counts=assay(dds))
gExpr <- calcNormFactors(gExpr)
vobjGenes <- voom(gExpr, model.matrix( ~ disease, metadata) )
gene_counts_voom = vobjGenes$E # ate aqui

PCA after adjustment

Matrix normalized and adjusted

covariates = c("rin", "pct_mrna_bases", "median_3prime_bias", "median_5prime_bias", "pct_chimeras", "gPC1","gPC2","gPC3","gPC4", "gPC5", "pct_ribosomal_bases", "pct_intergenic_bases", "estimated_library_size" )

residuals <- removeBatchEffect(x = gene_counts_voom, 
                                   batch = metadata$site_id, 
                                   batch2 = metadata$sex, 
                                #   design = model.matrix(~ disease, data = metadata), #force to not regress disease 
                                   covariates = as.matrix(metadata[, covariates])) 

dim(residuals)

## [1] 17206    52

res.pca2 = prcomp(t(residuals))
fviz_pca_ind(res.pca2, 
             habillage = metadata$disease)

Clustering: Euclidean

#Euclidean distance with the transposed matrix
sampleTree = hclust(dist(t(residuals)), method = "average")

plot(sampleTree, main = "Sample clustering to detecting outliers after correction", sub = "Function: hclust, Method: average from Euclidean distance", xlab = "samples", cex.lab = 1.5, cex.axis = 1.5, cex.main = 1.5, cex.sub = 1.5)

Power threshold

# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to=20, by=2))

# Call the network topology analysis function
sft = pickSoftThreshold(t(residuals), powerVector = powers, verbose = 5)

## pickSoftThreshold: will use block size 2600.
##  pickSoftThreshold: calculating connectivity for given powers...
##    ..working on genes 1 through 2600 of 17206

## Warning: executing %dopar% sequentially: no parallel backend registered

##    ..working on genes 2601 through 5200 of 17206
##    ..working on genes 5201 through 7800 of 17206
##    ..working on genes 7801 through 10400 of 17206
##    ..working on genes 10401 through 13000 of 17206
##    ..working on genes 13001 through 15600 of 17206
##    ..working on genes 15601 through 17206 of 17206
##    Power SFT.R.sq  slope truncated.R.sq  mean.k. median.k. max.k.
## 1      1   0.1280  2.010          0.951 3460.000  3.49e+03 4940.0
## 2      2   0.0353 -0.568          0.938 1070.000  1.07e+03 2100.0
## 3      3   0.2930 -1.340          0.964  417.000  4.00e+02 1070.0
## 4      4   0.5840 -1.710          0.997  187.000  1.70e+02  634.0
## 5      5   0.7660 -2.040          0.994   93.900  7.99e+01  431.0
## 6      6   0.8630 -2.210          0.981   51.200  4.02e+01  318.0
## 7      7   0.9210 -2.230          0.976   29.900  2.12e+01  247.0
## 8      8   0.9530 -2.170          0.976   18.500  1.18e+01  199.0
## 9      9   0.9700 -2.110          0.981   12.000  6.74e+00  168.0
## 10    10   0.9750 -2.030          0.980    8.150  3.99e+00  144.0
## 11    12   0.9850 -1.880          0.988    4.160  1.52e+00  110.0
## 12    14   0.9890 -1.750          0.993    2.370  6.22e-01   86.9
## 13    16   0.9850 -1.650          0.991    1.470  2.75e-01   70.4
## 14    18   0.9860 -1.580          0.994    0.969  1.29e-01   58.1
## 15    20   0.9770 -1.530          0.987    0.672  6.25e-02   48.7

# Scale-free topology fit index as a function of the soft-thresholding power
cex1 = 0.9

plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red")
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")

Connectivity

# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")

TOM adjacency

softPower = 7
adjacency = adjacency(t(residuals), power = softPower, type = "signed") 
TOM = TOMsimilarity(adjacency, TOMType="signed")

## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.

dissTOM = 1-TOM
geneTree = flashClust(as.dist(dissTOM), method="average")

plot(geneTree, xlab="", sub="", main= "Gene Clustering on TOM-based dissimilarity", labels= FALSE, hang=0.04)

Net parameters

# "We like large modules" - Tutorial, so we set the minimum module size relatively high:
minModuleSize = 30
# Module identification using dynamic tree cut:
dynamicMods = cutreeDynamic(dendro = geneTree, 
                            distM = dissTOM,
                            deepSplit = 1, 
                            pamRespectsDendro = FALSE,
                            minClusterSize = minModuleSize);

..cutHeight not given, setting it to 0.978 ===> 99% of the (truncated) height range in dendro. ..done.

clusters_size = data.matrix(table(dynamicMods))
clusters_size = cbind(rownames(clusters_size),clusters_size)
colnames(clusters_size) = c("cluster","size")

# Convert numeric lables into colors
dynamicColors = labels2colors(dynamicMods)
clusters_colors = data.matrix(table(dynamicColors))
clusters_colors = cbind(rownames(clusters_colors), clusters_colors)
colnames(clusters_colors) = c("color","size")

mytable = table(dynamicColors)
# as.data.frame(t(as.matrix(unclass(mytable))))

Dendrogram colors

# Plot the dendrogram and colors underneath
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")

Number of modules before merge

length(unique(dynamicColors))

[1] 41

Clustering eigengenes

# Calculate eigengenes
resid_expr_t = t(residuals)
MEList = moduleEigengenes(resid_expr_t, colors = dynamicColors)
MEs = MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs)
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average")

plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
#We choose a height cut of 0.25, corresponding to correlation of 0.75, to merge
MEDissThres = 0.25
# Plot the cut line into the dendrogram
abline(h=MEDissThres, col = "red")

Merge modules

# Call an automatic merging function
merge = mergeCloseModules(resid_expr_t, dynamicColors, cutHeight = MEDissThres, verbose = 3)

##  mergeCloseModules: Merging modules whose distance is less than 0.25
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 41 module eigengenes in given set.
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 31 module eigengenes in given set.
##    Calculating new MEs...
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 31 module eigengenes in given set.

# The merged module colors
mergedColors = merge$colors;
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs
#png(paste0(expr_dir, "/MyND/Merge_modules.png"), width = 16, height = 8, res = 300, units = "in")
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)

Merged network

#### For use of the new MERGED DATA!!! 
# Rename to moduleColors
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1;
MEs = mergedMEs;

Number of modules after merge

length(unique(moduleColors))

[1] 31

Trait associations

metadata_selected2 = metadata[,c("sex", "site_id", "rin", "age_rounded", "disease_duration")]
metadata_selected2$rin = as.numeric(metadata_selected2$rin)
metadata_selected2$site_id = as.numeric(metadata_selected2$site_id)
metadata_selected2$sex = as.numeric(metadata_selected2$sex)
metadata_selected2$age_rounded = as.numeric(metadata_selected2$age_rounded)

nGenes = ncol(resid_expr_t)
nSamples = nrow(resid_expr_t)

# Recalculate MEs with color labels
MEs0 = moduleEigengenes(resid_expr_t, moduleColors)$eigengenes
MEs = orderMEs(MEs0) #Reorder eigenvectors such that similar ones measured by correlation are next to each other
moduleTraitCor = WGCNA::cor(MEs, metadata_selected2, use = "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)

# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "")
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 20, 3, 3));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(metadata_selected2), 
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = greenWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))

## Warning in greenWhiteRed(50): WGCNA::greenWhiteRed: this palette is not suitable for people
## with green-red color blindness (the most common kind of color blindness).
## Consider using the function blueWhiteRed instead.

gene_ids = colnames(resid_expr_t)
gene_ids = colnames(resid_expr_t)
# save(moduleColors, file = paste0(work_plots, "MyND/moduleColors.Rdata"))
gene_modules = cbind(gene_ids, moduleColors)
gene_modules = as.data.frame(gene_modules)

Gene assignment

## Get conversion table for Gencode 30
gencode_30 = read.table(paste0(expr_dir, "ens.geneid.gencode.v30"))
colnames(gencode_30) = c("ensembl","symbol")
gencode_30$ensembl = gsub("(.*)\\.(.*)","\\1",gencode_30$ensembl)
gene_modules_symbol = merge(gene_modules, gencode_30, by.x = "gene_ids", by.y = "ensembl")

# write.table(gene_modules_symbol, file = paste0(expr_dir, "MyND/geneBymodule.txt"), quote = F, row.names = F, sep = "\t")
datatable(gene_modules_symbol)

## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html

Module size

mytable2 = table(gene_modules_symbol$moduleColors)
datatable(as.data.frame(t(as.matrix(unclass(mytable2)))))

And now what?

Session info

sessionInfo()

## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Stream 8
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblasp-r0.3.15.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] forcats_0.5.1               stringr_1.4.0              
##  [3] dplyr_1.0.8                 purrr_0.3.4                
##  [5] readr_2.1.2                 tidyr_1.2.0                
##  [7] tibble_3.2.1                tidyverse_1.3.1            
##  [9] DESeq2_1.34.0               SummarizedExperiment_1.24.0
## [11] Biobase_2.54.0              MatrixGenerics_1.6.0       
## [13] matrixStats_0.61.0          GenomicRanges_1.46.1       
## [15] GenomeInfoDb_1.30.1         IRanges_2.28.0             
## [17] S4Vectors_0.32.3            BiocGenerics_0.40.0        
## [19] flashClust_1.01-2           data.table_1.14.2          
## [21] WGCNA_1.70-3                fastcluster_1.2.3          
## [23] dynamicTreeCut_1.63-1       factoextra_1.0.7           
## [25] ggplot2_3.4.1               sva_3.42.0                 
## [27] BiocParallel_1.28.3         genefilter_1.76.0          
## [29] mgcv_1.8-38                 nlme_3.1-153               
## [31] DT_0.30                     edgeR_3.36.0               
## [33] limma_3.50.1               
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1           backports_1.4.1        Hmisc_4.6-0           
##   [4] splines_4.1.2          crosstalk_1.2.0        digest_0.6.29         
##   [7] foreach_1.5.2          htmltools_0.5.2        GO.db_3.14.0          
##  [10] fansi_1.0.4            magrittr_2.0.3         checkmate_2.0.0       
##  [13] memoise_2.0.1          cluster_2.1.2          doParallel_1.0.17     
##  [16] tzdb_0.2.0             Biostrings_2.62.0      annotate_1.72.0       
##  [19] modelr_0.1.8           jpeg_0.1-9             colorspace_2.0-3      
##  [22] blob_1.2.2             rvest_1.0.2            ggrepel_0.9.1         
##  [25] haven_2.4.3            xfun_0.29              crayon_1.5.0          
##  [28] RCurl_1.98-1.6         jsonlite_1.7.3         impute_1.68.0         
##  [31] survival_3.2-13        iterators_1.0.14       glue_1.6.2            
##  [34] gtable_0.3.0           zlibbioc_1.40.0        XVector_0.34.0        
##  [37] DelayedArray_0.20.0    car_3.0-12             abind_1.4-5           
##  [40] scales_1.2.1           DBI_1.1.2              rstatix_0.7.0         
##  [43] Rcpp_1.0.8             xtable_1.8-4           htmlTable_2.4.0       
##  [46] foreign_0.8-81         bit_4.0.4              preprocessCore_1.56.0 
##  [49] Formula_1.2-4          htmlwidgets_1.5.4      httr_1.4.2            
##  [52] RColorBrewer_1.1-2     ellipsis_0.3.2         farver_2.1.0          
##  [55] pkgconfig_2.0.3        XML_3.99-0.8           nnet_7.3-16           
##  [58] sass_0.4.0             dbplyr_2.1.1           locfit_1.5-9.4        
##  [61] utf8_1.2.3             labeling_0.4.2         tidyselect_1.1.2      
##  [64] rlang_1.1.1            AnnotationDbi_1.56.2   munsell_0.5.0         
##  [67] cellranger_1.1.0       tools_4.1.2            cachem_1.0.6          
##  [70] cli_3.6.1              generics_0.1.2         RSQLite_2.2.10        
##  [73] broom_0.7.12           evaluate_0.15          fastmap_1.1.0         
##  [76] yaml_2.3.5             knitr_1.37             bit64_4.0.5           
##  [79] fs_1.5.2               KEGGREST_1.34.0        xml2_1.3.3            
##  [82] compiler_4.1.2         rstudioapi_0.13        png_0.1-7             
##  [85] ggsignif_0.6.3         reprex_2.0.1           geneplotter_1.72.0    
##  [88] bslib_0.3.1            stringi_1.7.6          highr_0.9             
##  [91] lattice_0.20-45        Matrix_1.5-1           vctrs_0.6.3           
##  [94] pillar_1.9.0           lifecycle_1.0.3        jquerylib_0.1.4       
##  [97] bitops_1.0-7           R6_2.5.1               latticeExtra_0.6-29   
## [100] gridExtra_2.3          codetools_0.2-18       assertthat_0.2.1      
## [103] withr_2.5.0            GenomeInfoDbData_1.2.7 parallel_4.1.2        
## [106] hms_1.1.1              grid_4.1.2             rpart_4.1-15          
## [109] rmarkdown_2.11         carData_3.0-5          ggpubr_0.4.0          
## [112] lubridate_1.8.0        base64enc_0.1-3

WGCNA - RNASeq

Katia Lopes
Systems Analyst and Computational Biologist

2023-08-10

Download dataset

PCA before adjustment

Data normalization

PCA after adjustment

Clustering: Euclidean

Power threshold

Connectivity

TOM adjacency

Net parameters

Dendrogram colors

Number of modules before merge

Clustering eigengenes

Merge modules

Merged network

Number of modules after merge

Trait associations

Gene assignment

Module size

Session info

WGCNA - RNASeq

Katia Lopes Systems Analyst and Computational Biologist

2023-08-10

Download dataset

PCA before adjustment

Data normalization

PCA after adjustment

Clustering: Euclidean

Power threshold

Connectivity

TOM adjacency

Net parameters

Dendrogram colors

Number of modules before merge

Clustering eigengenes

Merge modules

Merged network

Number of modules after merge

Trait associations

Gene assignment

Module size

Session info

Katia Lopes
Systems Analyst and Computational Biologist