Maximum scoring subnetwork analysis

Setup

library(BioPlex)
library(BioNet)
library(EnrichmentBrowser)
library(graph)

Data retrieval

Get the latest version of the 293T PPI network:

bp.293t <- BioPlex::getBioPlex(cell.line = "293T", version = "3.0")

## Using cached version from 2023-01-14 23:15:28

and turn into a graph object:

bp.gr <- BioPlex::bioplex2graph(bp.293t)

Get RNA-seq data for 293T cells from GEO: GSE122425

gse122425 <- BioPlex::getGSE122425()

## Using cached version from 2023-01-14 23:15:40

Differential expression analysis

Differential expression analysis between NSUN2 knockout and wild type:

gse122425$GROUP <- rep(c(0, 1), each = 3)
gse122425 <- EnrichmentBrowser::deAna(gse122425, de.method = "DESeq2")

Inspect results:

rowData(gse122425)

## DataFrame with 18956 rows and 8 columns
##                        SYMBOL          KO                     GO    length
##                   <character> <character>            <character> <integer>
## ENSG00000227232        WASH7P      K18461                      _     15444
## ENSG00000228463    AP006222.2           _                      _     39639
## ENSG00000237094 RP4-669L17.10           _                      _    136229
## ENSG00000225630      MTND2P28      K03879                      _      1044
## ENSG00000237973  hsa-mir-6723      K02256                      _      1543
## ...                       ...         ...                    ...       ...
## ENSG00000185973         TMLHE      K00474 molecular_function:i..    179830
## ENSG00000124333         VAMP7      K08515 biological_process:a..     62478
## ENSG00000182484        WASH6P      K18461 molecular_function:m..      4885
## ENSG00000215414       PSMA6P1      K02730                      _       741
## ENSG00000185275        CD24P4      K06469                      _       243
##                        FC DESeq2.STAT        PVAL    ADJ.PVAL
##                 <numeric>   <numeric>   <numeric>   <numeric>
## ENSG00000227232  0.171520    0.724534    0.468738    0.648277
## ENSG00000228463  0.445551    1.194110    0.232435    0.419636
## ENSG00000237094 -0.318485   -0.446073    0.655545    0.791683
## ENSG00000225630 -0.404707   -1.227372    0.219683    0.404906
## ENSG00000237973 -0.337735   -0.983072    0.325572    0.518622
## ...                   ...         ...         ...         ...
## ENSG00000185973  0.111457    0.861636 3.88888e-01 5.79293e-01
## ENSG00000124333  3.430248    4.951910 7.34886e-07 2.15608e-05
## ENSG00000182484 -0.247212   -0.398845 6.90007e-01 8.17000e-01
## ENSG00000215414  0.543926    0.991335 3.21522e-01 5.14536e-01
## ENSG00000185275  0.452405    1.596938 1.10280e-01 2.59482e-01

not.na <- !is.na(rowData(gse122425)$PVAL)
gse122425 <- gse122425[not.na,]
EnrichmentBrowser::volcano(rowData(gse122425)$FC, rowData(gse122425)$ADJ.PVAL)

Change rownames to gene symbols for intersection with bioplex graph:

rowData(gse122425)$ENSEMBL <- rownames(gse122425)
rownames(gse122425) <- rowData(gse122425)$SYMBOL

Intersect bioplex graph and expression dataset:

syms <- graph::nodeData(bp.gr, graph::nodes(bp.gr), "SYMBOL")
syms <- unlist(syms)
isect <- intersect(syms, rownames(gse122425)) 
ind <- match(isect, syms) 
bp.sgr <- graph::subGraph(names(syms)[ind], bp.gr)
bp.sgr

## A graphNEL graph with directed edges
## Number of Nodes = 11015 
## Number of Edges = 79341

bp.sgr <- BioNet::rmSelfLoops(bp.sgr)
bp.sgr

## A graphNEL graph with directed edges
## Number of Nodes = 11015 
## Number of Edges = 79341

Get the DE p-values for each node:

syms <- syms[ind]
ind <- match(isect, rownames(gse122425))
pvals <- rowData(gse122425)[ind, "PVAL"]
names(pvals) <- names(syms)
head(pvals)

##     P00813     Q8N7W2     Q6ZMN8     P55039     Q9NZL4     Q6ICG6 
## 0.57137317 0.05761896 0.04112662 0.22823806 0.84595685 0.38653260

To score each node of the network we fit a Beta-uniform mixture model (BUM) to the p-value distribution and subsequently use the parameters of the model for the scoring function:

fb <- BioNet::fitBumModel(pvals)

scores <- BioNet::scoreNodes(bp.sgr, fb, fdr = 0.05)
head(scores)

##     P00813     Q8N7W2     Q6ZMN8     P55039     Q9NZL4     Q6ICG6 
## -2.3408852 -0.4505588 -0.1727220 -1.5847736 -2.6642293 -2.0188591

summary(scores)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  -2.8019  -2.1898  -1.2382   0.3586   0.6252 265.2463

First reduction step: unsupervised detection of maximum scoring subnetwork

Here we use a fast heuristic approach to calculate an approximation to theoptimal scoring subnetwork.
An optimal solution can be calculated using the heinz algorithm requiring a commercial CPLEX license.

module <- BioNet::runFastHeinz(bp.sgr, scores)

## Using cached version from 2023-01-14 23:42:23

module

## A graphNEL graph with directed edges
## Number of Nodes = 2851 
## Number of Edges = 8524

For some reason BioNet::runFastHeinz drops the edgeData of the input graph, so we need to re-annotate those to the module. There doesn’t seem to be a convenient way to bulk transfer edgeData from one graph to another, but rather only for one edge and one attribute at a time.

With a little abuse, we help ourselves via accessing slots directly.

en <- names(edgeData(module))
edgeDataDefaults(module) <- edgeDataDefaults(bp.sgr)
module@edgeData@data[en] <- edgeData(bp.sgr)[en]

Second reduction step: supervised enrichment analysis of maximum scoring subnetwork

kegg.gs <- EnrichmentBrowser::getGenesets(org = "hsa", db = "kegg", gene.id.type = "UNIPROT")
head(kegg.gs)

## $`hsa00010_Glycolysis_/_Gluconeogenesis`
##  [1] "A0A024QZ64" "A0A024R0Y5" "A0A024R145" "A0A024R5D8" "A0A024R713"
##  [6] "A0A024R782" "A0A024R7R1" "A0A024RBX9" "A0A0A0MS74" "A0A0K0K1K1"
## [11] "A0A140VJM9" "A0A140VJR3" "A0A140VKA7" "A8K7J7"     "B3KT70"    
## [16] "B7Z6C2"     "B7Z9E5"     "B9EKV4"     "J3KNX1"     "O00757"    
## [21] "P00325"     "P00326"     "P00338"     "P00558"     "P04075"    
## [26] "P04406"     "P05091"     "P06733"     "P06744"     "P07195"    
## [31] "P07327"     "P08319"     "P09104"     "P09467"     "P10515"    
## [36] "P11177"     "P11766"     "P13929"     "P14550"     "P14618"    
## [41] "P15259"     "P17858"     "P28332"     "P29803"     "P30613"    
## [46] "P30837"     "P30838"     "P35557"     "P35558"     "P35575"    
## [51] "P40394"     "P49419"     "P51648"     "P52789"     "P60174"    
## [56] "Q01813"     "Q6ZMR3"     "Q8N0Y7"     "Q96C23"     "Q96FY7"    
## [61] "Q96G03"     "Q9BRR6"     "Q9BUM1"     "Q9NQR9"     "Q9NUB1"    
## [66] "Q9UNW1"    
## 
## $`hsa00020_Citrate_cycle_(TCA_cycle)`
##  [1] "A0A024R1T9" "A0A024R5C5" "A0A024R713" "A0A024RB75" "A0A024RBX9"
##  [6] "A0A087X2E5" "A0A0A0MS74" "A0A0S2Z4C3" "A0A0S2Z4H7" "A0A140VJQ5"
## [11] "B3KTM1"     "B7Z6J1"     "E5KS60"     "O75874"     "P10515"    
## [16] "P11177"     "P21399"     "P21912"     "P29803"     "P31040"    
## [21] "P35558"     "P40925"     "P48735"     "P50213"     "P51553"    
## [26] "P53597"     "Q96I99"     "Q99643"     "Q99798"     "Q9ULD0"    
## 
## $hsa00030_Pentose_phosphate_pathway
##  [1] "A0A024QZ64" "A0A024R0Y5" "A0A024R145" "A0A0K0K1K7" "A0A140VK41"
##  [6] "A0A140VK56" "A0A140VKC2" "B4E2U0"     "B7Z6C2"     "B7Z7I0"    
## [11] "B7ZB02"     "E9PPM8"     "O00757"     "P04075"     "P06744"    
## [16] "P09467"     "P11413"     "P17858"     "P21108"     "P29401"    
## [21] "P49247"     "Q01813"     "Q15493"     "Q2QD12"     "Q5T6J7"    
## [26] "Q8IVS8"     "Q96AT9"     "Q96G03"     "Q9H477"     "R4GMU1"    
## 
## $hsa00040_Pentose_and_glucuronate_interconversions
##  [1] "A0A024R7A8" "A0A140T9Z0" "A0A140VJM1" "A0A140VKE1" "O60218"    
##  [6] "O60656"     "O60701"     "O75191"     "O75310"     "O75795"    
## [11] "P06133"     "P08236"     "P14550"     "P16662"     "P19224"    
## [16] "P22309"     "P22310"     "P35503"     "P35504"     "P36537"    
## [21] "P54855"     "Q00796"     "Q2QD12"     "Q5DSZ6"     "Q5DSZ7"    
## [26] "Q5DT02"     "Q6UWM9"     "Q7Z4W1"     "Q96AT9"     "Q96C11"    
## [31] "Q9BY64"     "Q9UEF7"     "Q9UQ10"     "Q9Y2S2"     "Q9Y4X1"    
## 
## $hsa00051_Fructose_and_mannose_metabolism
##  [1] "A0A024QZ64" "A0A024R0Y5" "A0A024R145" "A0A024R1U5" "A0A024R482"
##  [6] "A0A024R7A8" "A0A024R7R1" "A0A0S2Z4J6" "A0A140VJH7" "A0A140VJM6"
## [11] "A8K7J7"     "B3KT70"     "B7Z8A0"     "H3BPP3"     "I1Z9G4"    
## [16] "O00757"     "O14772"     "O60218"     "O60547"     "O60825"    
## [21] "P04075"     "P09467"     "P17858"     "P52789"     "P60174"    
## [26] "Q00796"     "Q01813"     "Q13630"     "Q16877"     "Q7L5Y1"    
## [31] "Q8N0W3"     "Q9NQ88"     "Q9Y5P6"    
## 
## $hsa00052_Galactose_metabolism
##  [1] "A0A024R0Y5" "A0A024R7A8" "A0A024R7R1" "A0A080YV01" "A0A0S2Z3Y7"
##  [6] "A0A140VKE1" "A8K7J7"     "B3KT70"     "B7Z6C2"     "H3BN99"    
## [11] "O43451"     "O60218"     "O60909"     "P06280"     "P09848"    
## [16] "P10253"     "P14410"     "P15291"     "P16278"     "P17858"    
## [21] "P35557"     "P35575"     "P51570"     "P52789"     "Q01813"    
## [26] "Q14376"     "Q2M2H8"     "Q96C23"     "Q96G03"     "Q9BUM1"    
## [31] "Q9NQR9"

go.gs <- EnrichmentBrowser::getGenesets(org = "hsa", db = "go", gene.id.type = "UNIPROT")

##

head(go.gs)

## $`GO:0000002_mitochondrial_genome_maintenance`
##  [1] "A0A0S2Z3H3" "A0A0S2Z3Z9" "A0A0S2Z417" "B2RBL3"     "O60313"    
##  [6] "P36776"     "Q96CQ1"     "Q9BQP7"     "Q9BSK2"     "Q9H611"    
## [11] "Q9Y243"    
## 
## $`GO:0000003_reproduction`
## [1] "O43555" "O75900" "P01148" "Q5TKA1"
## 
## $`GO:0000012_single_strand_break_repair`
## [1] "A0A024R6L5" "A0A024RE06" "A8K128"     "B2RCY5"     "P09430"    
## [6] "Q03468"     "Q13216"     "Q7Z2E3"     "Q8IW19"    
## 
## $`GO:0000017_alpha-glucoside_transport`
## [1] "P13866" "P31639"
## 
## $`GO:0000018_regulation_of_DNA_recombination`
## [1] "E9PB61" "P16871" "P52292" "P52294" "Q96FV9" "Q9H4L7"
## 
## $`GO:0000019_regulation_of_mitotic_recombination`
## [1] "A5D6Y3" "F8W7U8"

Given the maximum scoring subnetwork and gene sets of interest, we can now apply different set-based enrichment analysis (sbea) methods or network-based enrichment analysis (nbea) methods to identify biological themes within a module.

EnrichmentBrowser::sbeaMethods()

##  [1] "ora"        "safe"       "gsea"       "gsa"        "padog"     
##  [6] "globaltest" "roast"      "camera"     "gsva"       "samgs"     
## [11] "ebm"        "mgsa"

EnrichmentBrowser::nbeaMethods()

## [1] "ggea"        "spia"        "pathnet"     "degraph"     "ganpa"      
## [6] "cepa"        "topologygsa" "netgsa"      "neat"

Those are conveniently wrapped in the EnrichmentBrowser package and provide for a prioritization of gene sets of interest within a maximum scoring subnetwork.

To illustrate the concept, we use information on gene set enrichment from the paper.

NSUN2 is a nucleolus RNA m5C methyltransferase, regulator of cell proliferation and cell differentiation, dysfunction associated with intellectual diseases and cancer; reported to be a downstream target gene of MYC oncogene, a master regulator of cell proliferation.

Among the pathways showing enrichment in the paper is Ribosome, so as a showcase we extract nodes annotated to the gene set “Ribosome” from the maximum scoring subnetwork.

ind <- grep("Ribosome$", names(kegg.gs))
ribo <- kegg.gs[[ind]]
ribo <- intersect(ribo, graph::nodes(module))
ribo.gr <- graph::subGraph(ribo, module)
ribo.gr

## A graphNEL graph with directed edges
## Number of Nodes = 26 
## Number of Edges = 54

Such a gene set induced sub-graph can then be further explored in our BioPlex node data and edge data shiny viewer.

For that we might be interested in mapping additional data onto the nodes and the edges of the network.

For example, we might want to add the Bioplex proteome data onto the graph for inspection via the shiny graph viewer.

bp.prot <- BioPlex::getBioplexProteome()

## Using cached version from 2023-01-14 23:43:01

bp.prot

## class: SummarizedExperiment 
## dim: 9604 10 
## metadata(0):
## assays(1): exprs
## rownames(9604): P0CG40 Q8IXZ3-4 ... Q9H3S5 Q8WYQ3
## rowData names(5): ENTREZID SYMBOL nr.peptides log2ratio adj.pvalue
## colnames(10): HCT1 HCT2 ... HEK4 HEK5
## colData names(1): cell.line

ribo.gr <- BioPlex::mapSummarizedExperimentOntoGraph(ribo.gr, bp.prot,
    col.names = grep("HEK", colnames(bp.prot), value = TRUE), 
    rowdata.cols = c("nr.peptides", "log2ratio", "adj.pvalue"),
    prefix = "bpprot_")
head(graph::nodeData(ribo.gr), n = 2)

## $O75394
## $O75394$ENTREZID
## [1] "9553"
## 
## $O75394$SYMBOL
## [1] "MRPL33"
## 
## $O75394$ISOFORM
## [1] "O75394"
## 
## $O75394$score
## [1] 0.2549818
## 
## $O75394$bpprot_HEK1
## [1] 8.66958
## 
## $O75394$bpprot_HEK2
## [1] 8.36718
## 
## $O75394$bpprot_HEK3
## [1] 8.53282
## 
## $O75394$bpprot_HEK4
## [1] 8.6171
## 
## $O75394$bpprot_HEK5
## [1] 8.591
## 
## $O75394$bpprot_nr.peptides
## [1] 2
## 
## $O75394$bpprot_log2ratio
## [1] 0.4197192
## 
## $O75394$bpprot_adj.pvalue
## [1] 1.138452e-05
## 
## 
## $P18124
## $P18124$ENTREZID
## [1] "6129"
## 
## $P18124$SYMBOL
## [1] "RPL7"
## 
## $P18124$ISOFORM
## [1] "P18124"
## 
## $P18124$score
## [1] 0.4136751
## 
## $P18124$bpprot_HEK1
## [1] 7.7563
## 
## $P18124$bpprot_HEK2
## [1] 6.93534
## 
## $P18124$bpprot_HEK3
## [1] 7.17503
## 
## $P18124$bpprot_HEK4
## [1] 7.73146
## 
## $P18124$bpprot_HEK5
## [1] 6.7052
## 
## $P18124$bpprot_nr.peptides
## [1] 4
## 
## $P18124$bpprot_log2ratio
## [1] 0.8111145
## 
## $P18124$bpprot_adj.pvalue
## [1] 4.220104e-06

SessionInfo

sessionInfo()

## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] org.Hs.eg.db_3.16.0         AnnotationDbi_1.60.0       
##  [3] DESeq2_1.38.2               EnrichmentBrowser_2.28.0   
##  [5] BioNet_1.58.0               RBGL_1.74.0                
##  [7] graph_1.76.0                BioPlex_1.4.2              
##  [9] SummarizedExperiment_1.28.0 Biobase_2.58.0             
## [11] GenomicRanges_1.50.2        GenomeInfoDb_1.34.6        
## [13] IRanges_2.32.0              S4Vectors_0.36.1           
## [15] BiocGenerics_0.44.0         MatrixGenerics_1.10.0      
## [17] matrixStats_0.62.0         
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-7           fs_1.5.2               bit64_4.0.5           
##  [4] RColorBrewer_1.1-3     filelock_1.0.2         httr_1.4.3            
##  [7] rprojroot_2.0.3        Rgraphviz_2.42.0       tools_4.2.0           
## [10] bslib_0.3.1            utf8_1.2.2             R6_2.5.1              
## [13] colorspace_2.0-3       DBI_1.1.3              tidyselect_1.1.2      
## [16] bit_4.0.4              curl_4.3.2             compiler_4.2.0        
## [19] textshaping_0.3.6      cli_3.3.0              desc_1.4.1            
## [22] DelayedArray_0.24.0    sass_0.4.1             scales_1.2.0          
## [25] KEGGgraph_1.58.3       rappdirs_0.3.3         pkgdown_2.0.4         
## [28] systemfonts_1.0.4      stringr_1.4.0          digest_0.6.29         
## [31] rmarkdown_2.14         XVector_0.38.0         pkgconfig_2.0.3       
## [34] htmltools_0.5.2        highr_0.9              dbplyr_2.2.0          
## [37] fastmap_1.1.0          limma_3.54.0           htmlwidgets_1.5.4     
## [40] rlang_1.0.2            RSQLite_2.2.14         jquerylib_0.1.4       
## [43] generics_0.1.2         jsonlite_1.8.0         BiocParallel_1.32.5   
## [46] dplyr_1.0.9            RCurl_1.98-1.7         magrittr_2.0.3        
## [49] GO.db_3.16.0           GenomeInfoDbData_1.2.9 Matrix_1.4-1          
## [52] munsell_0.5.0          Rcpp_1.0.8.3           fansi_1.0.3           
## [55] lifecycle_1.0.1        stringi_1.7.6          yaml_2.3.5            
## [58] edgeR_3.40.1           zlibbioc_1.44.0        BiocFileCache_2.6.0   
## [61] grid_4.2.0             blob_1.2.3             parallel_4.2.0        
## [64] crayon_1.5.1           lattice_0.20-45        Biostrings_2.66.0     
## [67] annotate_1.76.0        KEGGREST_1.38.0        locfit_1.5-9.5        
## [70] knitr_1.39             pillar_1.7.0           igraph_1.3.2          
## [73] geneplotter_1.76.0     codetools_0.2-18       XML_3.99-0.10         
## [76] glue_1.6.2             evaluate_0.15          BiocManager_1.30.18   
## [79] BioPlexAnalysis_1.0.0  png_0.1-7              vctrs_0.4.1           
## [82] gtable_0.3.0           purrr_0.3.4            assertthat_0.2.1      
## [85] ggplot2_3.3.6          cachem_1.0.6           xfun_0.31             
## [88] xtable_1.8-4           r3dmol_0.1.2           ragg_1.2.2            
## [91] tibble_3.1.7           memoise_2.0.1          ellipsis_0.3.2        
## [94] GSEABase_1.60.0