Setup

The bioplexpy package implements functions to calculate and visualize physical interactions between chains of a PDB structure and compare with BioPlex PPIs detected in the AP-MS data.

This can be used to identify subunits of a protein complex that are close enough to physically interact. This analysis (1) maps a set of UniProt IDs, corresponding to the subunits of the complex, to a PDB structure, and (2) infers physical interactions within a structure by calculating the distance between the atoms of different chains within that structure.

Here, we demonstrate how to invoke this functionality from within R, and how to incorporate results into subsequent analysis and visualization in R.

We start by loading the required packages.

Mapping from a set of UniProt IDs to a PDB structure

We use data from the SIFTS project to map from CORUM complex subunits given as UniProt IDs to PDB structures:

url <- "ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/csv/uniprot_pdb.csv.gz"
dest <- basename(url)
download.file(url, destfile = dest)
df <- read.csv("uniprot_pdb.csv.gz", skip = 1)
file.remove(dest)
## [1] TRUE
head(df)
##   SP_PRIMARY                                                              PDB
## 1     A0A003                                                        6kvc;6kv9
## 2 A0A009I821                               7m4y;7m4x;7ryg;7m4w;7ryf;7ryh;7m4z
## 3 A0A009L7S8                               7m4z;7ryh;7ryf;7m4y;7ryg;7m4w;7m4x
## 4 A0A009QSN8                                              6v3d;6v39;6v3b;6v3a
## 5     A0A010 6j8v;5b01;5b00;5b0i;5b0l;5b0k;5b03;5b0m;6j8w;5b02;5gww;5gwv;5b0j
## 6     A0A011                                         3vkb;3vka;3vk5;3vkc;3vkd

Turn into a mapping:

unip2pdb <- strsplit(df$PDB, ";")
names(unip2pdb) <- df$SP_PRIMARY
head(unip2pdb)
## $A0A003
## [1] "6kvc" "6kv9"
## 
## $A0A009I821
## [1] "7m4y" "7m4x" "7ryg" "7m4w" "7ryf" "7ryh" "7m4z"
## 
## $A0A009L7S8
## [1] "7m4z" "7ryh" "7ryf" "7m4y" "7ryg" "7m4w" "7m4x"
## 
## $A0A009QSN8
## [1] "6v3d" "6v39" "6v3b" "6v3a"
## 
## $A0A010
##  [1] "6j8v" "5b01" "5b00" "5b0i" "5b0l" "5b0k" "5b03" "5b0m" "6j8w" "5b02"
## [11] "5gww" "5gwv" "5b0j"
## 
## $A0A011
## [1] "3vkb" "3vka" "3vk5" "3vkc" "3vkd"

Having illustrated the basic principle underlying the mapping from a CORUM complex to a PDB structure, we now set up a python environment via reticulate that contains the bioplexpy package in order to invoke the functionality from within R, facilitating direct exchange of data and results between the BioPlex R and BioPlex Python packages:

reticulate::virtualenv_create("r-reticulate")
## virtualenv: r-reticulate
reticulate::virtualenv_install("r-reticulate", "bioplexpy")
## Using virtual environment 'r-reticulate' ...
reticulate::use_virtualenv("r-reticulate")
bp <- reticulate::import("bioplexpy")

In the following, we focus on the TFIIH core complex (PDB ID: 6nmi).

We first get the set of UniProt IDs corresponding to this CORUM complex ID.

corum <- bp$getCorum(complex_set = "core", organism = "Human")
unips <- bp$get_UniProts_from_CORUM(corum, 107) 
unips
## [1] "P18074" "P19447" "P32780" "P50613" "P51946" "P51948" "Q13888" "Q13889"
## [9] "Q92759"

and then also get the set of PDB IDs corresponding to the obtained set of UniProt IDs based on the SIFTS mapping.

pdbs <- bp$get_PDB_from_UniProts(unips)
head(pdbs)
##      num_proteins deposit_date
## 7AD8            8   2020-09-14
## 6O9M            8   2019-03-14
## 6NMI            8   2019-01-10
## 5OF4           10   2017-07-10
## 7NVW           11   2021-03-16
## 7NVX           11   2021-03-16
##                                                                                         citation_title
## 7AD8              The TFIIH subunits p44/p62 act as a damage sensor during nucleotide excision repair.
## 6O9M Transcription preinitiation complex structure and dynamics provide insight into genetic diseases.
## 6NMI                                           The complete structure of the human TFIIH core complex.
## 5OF4                         The cryo-electron microscopy structure of human transcription factor IIH.
## 7NVW                               Structures of mammalian RNA polymerase II pre-initiation complexes.
## 7NVX                               Structures of mammalian RNA polymerase II pre-initiation complexes.
##      UniProts_mapped_to_PDB num_proteins_diff_btwn_PDB_and_UniProts_input
## 7AD8           P18074, ....                                             1
## 6O9M           P18074, ....                                             1
## 6NMI           P18074, ....                                             1
## 5OF4           P18074, ....                                             1
## 7NVW           P18074, ....                                             2
## 7NVX           P18074, ....                                             2

As there is often no 1:1 mapping from a set of UniProt IDs to a certain PDB structure, it is instructive to inspect the title of the PDB entry and use prior knowledge about the PDB structure for a complex of interest where available.

Here, we accordingly chose 6nmi as the PDB ID representing the TFIIH core complex.

We can use functionality from the bio3d package to get the PDB structure for the TFIIH core complex (PDB ID: 6nmi).

pdb.id <- "6nmi"
pdb.file <- bio3d::get.pdb(pdb.id)
pdb <- bio3d::read.pdb(pdb.file)
pdb
## 
##  Call:  bio3d::read.pdb(file = pdb.file)
## 
##    Total Models#: 1
##      Total Atoms#: 24379,  XYZs#: 73137  Chains#: 8  (values: A B C D E F G H)
## 
##      Protein Atoms#: 23015  (residues/Calpha atoms#: 2908)
##      Nucleic acid Atoms#: 0  (residues/phosphate atoms#: 0)
## 
##      Non-protein/nucleic Atoms#: 1364  (residues: 277)
##      Non-protein/nucleic resid values: [ SF4 (1), UNK (270), ZN (6) ]
## 
##    Protein sequence:
##       PQEAVPSAAGKQVDESGTKVDEYGAKDYRLQMPLKDDHTSRPLWVAPDGHIFLEAFSPVY
##       KYAQDFLVAIAEPVCRPTHVHEYKLTAYSLYAAVSVGLQTSDITEYLRKLSKTGVPDGIM
##       QFIKLCTVSYGKVKLVLKHNRYFVESCHPDVIQHLLQDPVIRECRLRNSEQTVSFEVKQE
##       MIEELQKRCIHLEYPLLAEYDFRNDSVNPDINIDLKPTAVLRPYQ...<cut>...QLEM
## 
## + attr: atom, xyz, seqres, helix, sheet,
##         calpha, remark, call
str(pdb)
## List of 8
##  $ atom  :'data.frame':  24379 obs. of  16 variables:
##   ..$ type  : chr [1:24379] "ATOM" "ATOM" "ATOM" "ATOM" ...
##   ..$ eleno : int [1:24379] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ elety : chr [1:24379] "N" "CA" "C" "O" ...
##   ..$ alt   : chr [1:24379] NA NA NA NA ...
##   ..$ resid : chr [1:24379] "PRO" "PRO" "PRO" "PRO" ...
##   ..$ chain : chr [1:24379] "A" "A" "A" "A" ...
##   ..$ resno : int [1:24379] 34 34 34 34 34 34 34 35 35 35 ...
##   ..$ insert: chr [1:24379] NA NA NA NA ...
##   ..$ x     : num [1:24379] 177 177 175 175 177 ...
##   ..$ y     : num [1:24379] 185 185 185 186 186 ...
##   ..$ z     : num [1:24379] 131 133 133 133 133 ...
##   ..$ o     : num [1:24379] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ b     : num [1:24379] 60.6 60.6 60.6 60.6 60.6 ...
##   ..$ segid : chr [1:24379] NA NA NA NA ...
##   ..$ elesy : chr [1:24379] "N" "C" "C" "O" ...
##   ..$ charge: chr [1:24379] NA NA NA NA ...
##  $ xyz   : 'xyz' num [1, 1:73137] 177 185 131 177 185 ...
##  $ seqres: Named chr [1:3477] "PRO" "GLN" "GLU" "ALA" ...
##   ..- attr(*, "names")= chr [1:3477] "A" "A" "A" "A" ...
##  $ helix :List of 4
##   ..$ start: Named num [1:128] 37 92 119 132 149 181 191 249 273 276 ...
##   .. ..- attr(*, "names")= chr [1:128] "" "" "" "" ...
##   ..$ end  : Named num [1:128] 42 104 130 142 161 190 198 262 275 287 ...
##   .. ..- attr(*, "names")= chr [1:128] "" "" "" "" ...
##   ..$ chain: chr [1:128] "A" "A" "A" "A" ...
##   ..$ type : chr [1:128] "1" "1" "1" "1" ...
##  $ sheet :List of 4
##   ..$ start: Named num [1:80] 59 333 342 314 307 76 83 113 105 268 ...
##   .. ..- attr(*, "names")= chr [1:80] "" "" "" "" ...
##   ..$ end  : Named num [1:80] 60 337 345 317 310 78 87 117 108 272 ...
##   .. ..- attr(*, "names")= chr [1:80] "" "" "" "" ...
##   ..$ chain: chr [1:80] "A" "D" "D" "D" ...
##   ..$ sense: chr [1:80] "0" "1" "-1" "-1" ...
##  $ calpha: logi [1:24379] FALSE TRUE FALSE FALSE FALSE FALSE ...
##  $ remark:List of 1
##   ..$ biomat:List of 4
##   .. ..$ num   : int 1
##   .. ..$ chain :List of 1
##   .. .. ..$ : chr [1:8] "A" "B" "C" "D" ...
##   .. ..$ mat   :List of 1
##   .. .. ..$ :List of 1
##   .. .. .. ..$ A B C D E F G H: num [1:3, 1:4] 1 0 0 0 1 0 0 0 1 0 ...
##   .. ..$ method: chr "AUTHOR"
##  $ call  : language bio3d::read.pdb(file = pdb.file)
##  - attr(*, "class")= chr [1:2] "pdb" "sse"

Provide a different color for each chain:

chains <- unlist(pdb$remark$biomat$chain)
chains
## [1] "A" "B" "C" "D" "E" "F" "G" "H"
nr.chains <- length(chains)
chain.colors <- RColorBrewer::brewer.pal(8, "Set1")
chain.colors <- tolower(chain.colors)

And visualize the structure using functionality from the r3dmol package:

# Set up the initial viewer
viewer <- r3dmol(
  id = "",
  elementId = "demo"
) %>%
  # Add model to scene
  m_add_model(data = m_bio3d(pdb), format = "pdb") %>%
  # Zoom to encompass the whole scene
  m_zoom_to() %>%
  # Set style of structures
  m_set_style(style = m_style_cartoon(color = "#00cc96"))
for(i in seq_len(nr.chains))
  viewer <- m_set_style(viewer,
    sel = m_sel(chain = LETTERS[i]),
    style = m_style_cartoon(color = chain.colors[i])
  ) 
viewer %>% m_rotate(angle = 90, axis = "y") %>% m_spin()