Peter Langfelder and Steve Horvath tutorials are here

Paper: Humphrey et al, 2022

# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")

# if(!require("limma")) BiocManager::install("limma"); library("limma")
library("edgeR")
library("DT")
if(!require("sva")) BiocManager::install("sva"); library("sva")
if(!require("limma")) BiocManager::install("limma"); library("limma")
library("factoextra")
library("WGCNA")
library("data.table")
library("flashClust")
library("DESeq2")
library("tidyverse")

Download dataset

https://zenodo.org/record/6385747

# Metadata 
metadata = as.data.frame(data.table::fread(paste0(expr_dir, "Thoracic_Spinal_Cord/Thoracic_Spinal_Cord_metadata.tsv"), header = T, stringsAsFactors = F, check.names = F))
rownames(metadata) = metadata$rna_id

# Counts expression data 
data = data.table::fread(paste0(expr_dir, "Thoracic_Spinal_Cord/Thoracic_Spinal_Cord_gene_counts.tsv"), header = T, stringsAsFactors = F, check.names = F)
dim(data)
## [1] 58884    54
data[1:5, 1:5]
##         ensembl_id gene_name sample_256 sample_271 sample_66
## 1: ENSG00000000003    TSPAN6        474        298       789
## 2: ENSG00000000005      TNMD          1          3         2
## 3: ENSG00000000419      DPM1        266        156       238
## 4: ENSG00000000457     SCYL3        301        166       121
## 5: ENSG00000000460  C1orf112        152        154       133
data_tmp = as.data.frame(data)
data_tmp$gene_name = NULL
rownames(data_tmp) = data$ensembl_id
data_tmp$ensembl_id = NULL
data_tmp = data_tmp[, metadata$rna_id] # order the expression table 
identical(colnames(data_tmp), metadata$rna_id) # must be TRUE 
## [1] TRUE
data_tmp[1:3,1:3]
##                 sample_269 sample_239 sample_271
## ENSG00000000003        203        416        298
## ENSG00000000005          2          0          3
## ENSG00000000419        206        135        156
x <- DGEList(counts=as.matrix(data_tmp), samples=metadata)
cpm = cpm(x)
cpm[1:3,1:3]
##                 sample_269 sample_239 sample_271
## ENSG00000000003 9.57537549  23.415310 10.6861322
## ENSG00000000005 0.09433867   0.000000  0.1075785
## ENSG00000000419 9.71688350   7.598718  5.5940826
keep.exp = rowSums(cpm > 1) >= 0.3*ncol(x) #Filtering
x = x[keep.exp,]
dim(x$counts)
## [1] 17206    52
counts_filt = as.data.frame(x$counts)
counts_filt[1:3, 1:3]
##                 sample_269 sample_239 sample_271
## ENSG00000000003        203        416        298
## ENSG00000000419        206        135        156
## ENSG00000000457        127        147        166

PCA before adjustment

Counts matrix

metadata$sex = as.factor(metadata$sex)
metadata$disease = as.factor(metadata$disease)
metadata$library_prep = as.factor(metadata$library_prep)
metadata$site_id = as.factor(metadata$site_id)

res.pca = prcomp(t(counts_filt))
fviz_pca_ind(res.pca, 
             habillage = metadata$disease)

Data normalization

dds <- DESeqDataSetFromMatrix(countData = round(counts_filt),
                             colData = metadata,
                             design = ~ disease)

#Using the function of voom normalization
gExpr <- DGEList(counts=assay(dds))
gExpr <- calcNormFactors(gExpr)
vobjGenes <- voom(gExpr, model.matrix( ~ disease, metadata) )
gene_counts_voom = vobjGenes$E # ate aqui

PCA after adjustment

Matrix normalized and adjusted

covariates = c("rin", "pct_mrna_bases", "median_3prime_bias", "median_5prime_bias", "pct_chimeras", "gPC1","gPC2","gPC3","gPC4", "gPC5", "pct_ribosomal_bases", "pct_intergenic_bases", "estimated_library_size" )

residuals <- removeBatchEffect(x = gene_counts_voom, 
                                   batch = metadata$site_id, 
                                   batch2 = metadata$sex, 
                                #   design = model.matrix(~ disease, data = metadata), #force to not regress disease 
                                   covariates = as.matrix(metadata[, covariates])) 

dim(residuals)
## [1] 17206    52
res.pca2 = prcomp(t(residuals))
fviz_pca_ind(res.pca2, 
             habillage = metadata$disease)

Clustering: Euclidean

#Euclidean distance with the transposed matrix
sampleTree = hclust(dist(t(residuals)), method = "average")

plot(sampleTree, main = "Sample clustering to detecting outliers after correction", sub = "Function: hclust, Method: average from Euclidean distance", xlab = "samples", cex.lab = 1.5, cex.axis = 1.5, cex.main = 1.5, cex.sub = 1.5)

Power threshold

# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to=20, by=2))

# Call the network topology analysis function
sft = pickSoftThreshold(t(residuals), powerVector = powers, verbose = 5)
## pickSoftThreshold: will use block size 2600.
##  pickSoftThreshold: calculating connectivity for given powers...
##    ..working on genes 1 through 2600 of 17206
## Warning: executing %dopar% sequentially: no parallel backend registered
##    ..working on genes 2601 through 5200 of 17206
##    ..working on genes 5201 through 7800 of 17206
##    ..working on genes 7801 through 10400 of 17206
##    ..working on genes 10401 through 13000 of 17206
##    ..working on genes 13001 through 15600 of 17206
##    ..working on genes 15601 through 17206 of 17206
##    Power SFT.R.sq  slope truncated.R.sq  mean.k. median.k. max.k.
## 1      1   0.1280  2.010          0.951 3460.000  3.49e+03 4940.0
## 2      2   0.0353 -0.568          0.938 1070.000  1.07e+03 2100.0
## 3      3   0.2930 -1.340          0.964  417.000  4.00e+02 1070.0
## 4      4   0.5840 -1.710          0.997  187.000  1.70e+02  634.0
## 5      5   0.7660 -2.040          0.994   93.900  7.99e+01  431.0
## 6      6   0.8630 -2.210          0.981   51.200  4.02e+01  318.0
## 7      7   0.9210 -2.230          0.976   29.900  2.12e+01  247.0
## 8      8   0.9530 -2.170          0.976   18.500  1.18e+01  199.0
## 9      9   0.9700 -2.110          0.981   12.000  6.74e+00  168.0
## 10    10   0.9750 -2.030          0.980    8.150  3.99e+00  144.0
## 11    12   0.9850 -1.880          0.988    4.160  1.52e+00  110.0
## 12    14   0.9890 -1.750          0.993    2.370  6.22e-01   86.9
## 13    16   0.9850 -1.650          0.991    1.470  2.75e-01   70.4
## 14    18   0.9860 -1.580          0.994    0.969  1.29e-01   58.1
## 15    20   0.9770 -1.530          0.987    0.672  6.25e-02   48.7
# Scale-free topology fit index as a function of the soft-thresholding power
cex1 = 0.9

plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red")
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")

Connectivity

# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")

TOM adjacency

softPower = 7
adjacency = adjacency(t(residuals), power = softPower, type = "signed") 
TOM = TOMsimilarity(adjacency, TOMType="signed")
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
dissTOM = 1-TOM
geneTree = flashClust(as.dist(dissTOM), method="average")

plot(geneTree, xlab="", sub="", main= "Gene Clustering on TOM-based dissimilarity", labels= FALSE, hang=0.04)

Net parameters

# "We like large modules" - Tutorial, so we set the minimum module size relatively high:
minModuleSize = 30
# Module identification using dynamic tree cut:
dynamicMods = cutreeDynamic(dendro = geneTree, 
                            distM = dissTOM,
                            deepSplit = 1, 
                            pamRespectsDendro = FALSE,
                            minClusterSize = minModuleSize);

..cutHeight not given, setting it to 0.978 ===> 99% of the (truncated) height range in dendro. ..done.

clusters_size = data.matrix(table(dynamicMods))
clusters_size = cbind(rownames(clusters_size),clusters_size)
colnames(clusters_size) = c("cluster","size")

# Convert numeric lables into colors
dynamicColors = labels2colors(dynamicMods)
clusters_colors = data.matrix(table(dynamicColors))
clusters_colors = cbind(rownames(clusters_colors), clusters_colors)
colnames(clusters_colors) = c("color","size")

mytable = table(dynamicColors)
# as.data.frame(t(as.matrix(unclass(mytable))))

Dendrogram colors

# Plot the dendrogram and colors underneath
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")

Number of modules before merge

length(unique(dynamicColors))

[1] 41

Clustering eigengenes

# Calculate eigengenes
resid_expr_t = t(residuals)
MEList = moduleEigengenes(resid_expr_t, colors = dynamicColors)
MEs = MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs)
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average")

plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
#We choose a height cut of 0.25, corresponding to correlation of 0.75, to merge
MEDissThres = 0.25
# Plot the cut line into the dendrogram
abline(h=MEDissThres, col = "red")

Merge modules

# Call an automatic merging function
merge = mergeCloseModules(resid_expr_t, dynamicColors, cutHeight = MEDissThres, verbose = 3)
##  mergeCloseModules: Merging modules whose distance is less than 0.25
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 41 module eigengenes in given set.
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 31 module eigengenes in given set.
##    Calculating new MEs...
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 31 module eigengenes in given set.
# The merged module colors
mergedColors = merge$colors;
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs
#png(paste0(expr_dir, "/MyND/Merge_modules.png"), width = 16, height = 8, res = 300, units = "in")
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)

Merged network

#### For use of the new MERGED DATA!!! 
# Rename to moduleColors
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1;
MEs = mergedMEs;

Number of modules after merge

length(unique(moduleColors))

[1] 31

Trait associations

metadata_selected2 = metadata[,c("sex", "site_id", "rin", "age_rounded", "disease_duration")]
metadata_selected2$rin = as.numeric(metadata_selected2$rin)
metadata_selected2$site_id = as.numeric(metadata_selected2$site_id)
metadata_selected2$sex = as.numeric(metadata_selected2$sex)
metadata_selected2$age_rounded = as.numeric(metadata_selected2$age_rounded)

nGenes = ncol(resid_expr_t)
nSamples = nrow(resid_expr_t)

# Recalculate MEs with color labels
MEs0 = moduleEigengenes(resid_expr_t, moduleColors)$eigengenes
MEs = orderMEs(MEs0) #Reorder eigenvectors such that similar ones measured by correlation are next to each other
moduleTraitCor = WGCNA::cor(MEs, metadata_selected2, use = "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)

# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "")
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 20, 3, 3));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(metadata_selected2), 
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = greenWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
## Warning in greenWhiteRed(50): WGCNA::greenWhiteRed: this palette is not suitable for people
## with green-red color blindness (the most common kind of color blindness).
## Consider using the function blueWhiteRed instead.

gene_ids = colnames(resid_expr_t)
gene_ids = colnames(resid_expr_t)
# save(moduleColors, file = paste0(work_plots, "MyND/moduleColors.Rdata"))
gene_modules = cbind(gene_ids, moduleColors)
gene_modules = as.data.frame(gene_modules)

Gene assignment

## Get conversion table for Gencode 30
gencode_30 = read.table(paste0(expr_dir, "ens.geneid.gencode.v30"))
colnames(gencode_30) = c("ensembl","symbol")
gencode_30$ensembl = gsub("(.*)\\.(.*)","\\1",gencode_30$ensembl)
gene_modules_symbol = merge(gene_modules, gencode_30, by.x = "gene_ids", by.y = "ensembl")

# write.table(gene_modules_symbol, file = paste0(expr_dir, "MyND/geneBymodule.txt"), quote = F, row.names = F, sep = "\t")
datatable(gene_modules_symbol)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html

Module size

mytable2 = table(gene_modules_symbol$moduleColors)
datatable(as.data.frame(t(as.matrix(unclass(mytable2)))))

And now what?

Session info

sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Stream 8
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblasp-r0.3.15.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] forcats_0.5.1               stringr_1.4.0              
##  [3] dplyr_1.0.8                 purrr_0.3.4                
##  [5] readr_2.1.2                 tidyr_1.2.0                
##  [7] tibble_3.2.1                tidyverse_1.3.1            
##  [9] DESeq2_1.34.0               SummarizedExperiment_1.24.0
## [11] Biobase_2.54.0              MatrixGenerics_1.6.0       
## [13] matrixStats_0.61.0          GenomicRanges_1.46.1       
## [15] GenomeInfoDb_1.30.1         IRanges_2.28.0             
## [17] S4Vectors_0.32.3            BiocGenerics_0.40.0        
## [19] flashClust_1.01-2           data.table_1.14.2          
## [21] WGCNA_1.70-3                fastcluster_1.2.3          
## [23] dynamicTreeCut_1.63-1       factoextra_1.0.7           
## [25] ggplot2_3.4.1               sva_3.42.0                 
## [27] BiocParallel_1.28.3         genefilter_1.76.0          
## [29] mgcv_1.8-38                 nlme_3.1-153               
## [31] DT_0.30                     edgeR_3.36.0               
## [33] limma_3.50.1               
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1           backports_1.4.1        Hmisc_4.6-0           
##   [4] splines_4.1.2          crosstalk_1.2.0        digest_0.6.29         
##   [7] foreach_1.5.2          htmltools_0.5.2        GO.db_3.14.0          
##  [10] fansi_1.0.4            magrittr_2.0.3         checkmate_2.0.0       
##  [13] memoise_2.0.1          cluster_2.1.2          doParallel_1.0.17     
##  [16] tzdb_0.2.0             Biostrings_2.62.0      annotate_1.72.0       
##  [19] modelr_0.1.8           jpeg_0.1-9             colorspace_2.0-3      
##  [22] blob_1.2.2             rvest_1.0.2            ggrepel_0.9.1         
##  [25] haven_2.4.3            xfun_0.29              crayon_1.5.0          
##  [28] RCurl_1.98-1.6         jsonlite_1.7.3         impute_1.68.0         
##  [31] survival_3.2-13        iterators_1.0.14       glue_1.6.2            
##  [34] gtable_0.3.0           zlibbioc_1.40.0        XVector_0.34.0        
##  [37] DelayedArray_0.20.0    car_3.0-12             abind_1.4-5           
##  [40] scales_1.2.1           DBI_1.1.2              rstatix_0.7.0         
##  [43] Rcpp_1.0.8             xtable_1.8-4           htmlTable_2.4.0       
##  [46] foreign_0.8-81         bit_4.0.4              preprocessCore_1.56.0 
##  [49] Formula_1.2-4          htmlwidgets_1.5.4      httr_1.4.2            
##  [52] RColorBrewer_1.1-2     ellipsis_0.3.2         farver_2.1.0          
##  [55] pkgconfig_2.0.3        XML_3.99-0.8           nnet_7.3-16           
##  [58] sass_0.4.0             dbplyr_2.1.1           locfit_1.5-9.4        
##  [61] utf8_1.2.3             labeling_0.4.2         tidyselect_1.1.2      
##  [64] rlang_1.1.1            AnnotationDbi_1.56.2   munsell_0.5.0         
##  [67] cellranger_1.1.0       tools_4.1.2            cachem_1.0.6          
##  [70] cli_3.6.1              generics_0.1.2         RSQLite_2.2.10        
##  [73] broom_0.7.12           evaluate_0.15          fastmap_1.1.0         
##  [76] yaml_2.3.5             knitr_1.37             bit64_4.0.5           
##  [79] fs_1.5.2               KEGGREST_1.34.0        xml2_1.3.3            
##  [82] compiler_4.1.2         rstudioapi_0.13        png_0.1-7             
##  [85] ggsignif_0.6.3         reprex_2.0.1           geneplotter_1.72.0    
##  [88] bslib_0.3.1            stringi_1.7.6          highr_0.9             
##  [91] lattice_0.20-45        Matrix_1.5-1           vctrs_0.6.3           
##  [94] pillar_1.9.0           lifecycle_1.0.3        jquerylib_0.1.4       
##  [97] bitops_1.0-7           R6_2.5.1               latticeExtra_0.6-29   
## [100] gridExtra_2.3          codetools_0.2-18       assertthat_0.2.1      
## [103] withr_2.5.0            GenomeInfoDbData_1.2.7 parallel_4.1.2        
## [106] hms_1.1.1              grid_4.1.2             rpart_4.1-15          
## [109] rmarkdown_2.11         carData_3.0-5          ggpubr_0.4.0          
## [112] lubridate_1.8.0        base64enc_0.1-3