Description

This R notebook is a bioinformatics pipeline to perform variability analysis with a resource allocation model for the chemolithoautotroph Ralstonia eutropha (a.k.a. Cupriavidus necator).

Libraries

# loading libraries
library(lattice)
library(latticeExtra)
library(latticetools)
library(tidyverse)
library(stringi)

Data import

Define the data source directories. The resource allocation model that was used to generate the data in data/simulation/ can be found at my fork of Bacterial-RBA-models.

Reutropha_proteomics <- "../data/input/Ralstonia_eutropha.Rdata"
model_reactions <- "../data/input/model_reactions.csv"
simulation_dir <- "../data/simulation/variability_analysis/"
source("read_rba_result.R")

Read simulation data.

# read simulation results
df_flux <- read_rba_result(list.files(simulation_dir, pattern = "fluxes_.*.tsv$", full.names = TRUE))
df_prot <- read_rba_result(list.files(simulation_dir, pattern = "proteins_.*.tsv", full.names = TRUE))
df_macr <- read_rba_result(list.files(simulation_dir, pattern = "macroprocesses_.*.tsv", full.names = TRUE))

Resource allocation in terms of protein mass

To determine the true allocation of protein resources per compartment, but also the true cost of protein per process, we need to convert the predicted concentration of proteins in mmol per gDCW to g per gDCW, simply by multiplying protein concentration with the molecular weight of a protein (g/mol, converted to g/mmol). We can then also easily transform g/gDCW to mass fraction by dividing individual protein concentrations by the sum of all protein concentrations. The protein mass fraction is dimensionless. The only parameter required for this transformation is the molecular weight per protein which is available from uniprot. We can for example take the protein annotation table that is automatically downloaded during RBApy model generation.

# import downloaded Ralstonia protein annotation from uniprot
df_uniprot <- read_tsv("../data/input/uniprot.csv", col_types = cols()) %>%
  mutate(locus_tag = stri_extract_first(`Gene names`, regex = "H16_[AB][0-9]{4}|PHG[0-9]{3}"))

# merge predicted protein allocation with molecular weight info from uniprot
df_prot <- left_join(df_prot, select(df_uniprot, locus_tag, Length, Mass),
  by = c("key" = "locus_tag")) %>%
  
  # calculate predicted protein mass in g/gDCW using MW in g/mmol, and mass fraction
  group_by(simulation) %>% mutate(
    predicted_mass_g_gDCW = value * Mass / 1000)

# test if mass fractions sum to reasonable value
df_prot %>% summarize(
  predicted_mass_g_gDCW = sum(predicted_mass_g_gDCW, na.rm = TRUE))

Correlation between predicted and experimentally determined proteome

To compare the predicted and experimental proteome composition, we load the required proteomics data, mass spectrometry measurements with label-free quantification of peptides. Protein quantification was performed by summing up all peptide intensities per annotated protein. The proteomic measurement unit that we are interested in is mass fraction. For comparability, the unit of RBA model predictions, protein copy number in mmol/gDCW, needs to be converted to protein mass in g/gDCW. Alternatively, we use protein mass fraction (g protein/g total protein).

To allow a fair comparison between measured and predicted data, it is necessary to aggregate (e.g. sum up) all protein abundances allocated to one reaction. The reason is that the model will only predict protein abundance of the first of a range of iso-enzymes for a particular reaction, while in reality another iso-enzyme might be more abundant (carry the majority of flux). This would lead to lower correlation between measured and predicted protein concentrations.

Step 1: load proteomics data

load(Reutropha_proteomics)

# pick a condition matching simulations
Ralstonia_eutropha <- Ralstonia_eutropha %>% filter(growthrate == 0.25) %>%
  
  # select only required columns
  select(condition, uniprot, locus_tag, protein, mean_mass_fraction, 
    MolWeight, COG_Process, R1:R4) %>%
  
  # rename conditions
  ungroup %>% mutate(condition = recode(condition,
    `FA 0.25` = "formate", `FRC 0.25` = "fructose", 
    `SUC 0.25` = "succinate", `NLIM 0.25` = "ammonium")) %>%
  
  # turn raw intensity measurements into mass in g per gDCW (assuming a 
  # total protein concentration of 0.68 g/gDCW)
  group_by(condition) %>%
  mutate(across(matches("R[1234]"), function(x) x/sum(x, na.rm = TRUE)*0.68)) %>%
  gather(key = "replicate", value = "mass_g_gDCW", R1:R4)

# test if protein mass sums to default total protein per gDCW
Ralstonia_eutropha %>% group_by(condition, replicate) %>%
  summarize(sum(mass_g_gDCW, na.rm = TRUE))
`summarise()` regrouping output by 'condition' (override with `.groups` argument)

Step 2: load gene reaction associations obtained from genome scale model

df_model_reactions <- read_csv(model_reactions, col_types = cols()) %>%
  
  # filter for reactions with gene associations
  select(reaction_id, reaction_name, genes) %>% separate_rows(genes, sep = ", ") %>%
  filter(!is.na(genes))

Step 3: Select and rename conditions from RBA simulation

df_prot <- df_prot %>%
  mutate(condition = case_when(
    carbon_source == "succ" ~ "succinate",
    carbon_source == "for" ~ "formate",
    carbon_source == "fru" & nitrogen_conc == 10 ~ "fructose",
    nitrogen_conc < 1 ~ "ammonium",
  ))

Step 4: Merge protein measurements and predictions into master table

df_prot_comp <- df_model_reactions %>%
  
  # join with proteomics data
  left_join(Ralstonia_eutropha, by = c("genes" = "locus_tag")) %>%
  
  # join with simulation data
  left_join(df_prot, by = c("genes" = "key", "condition")) %>%
  
  # determine number of reactions per protein
  group_by(condition, genes, replicate) %>% 
  mutate(n_reactions = length(reaction_id)) %>%
  
  # calculate protein mass in g/gDCW
  group_by(condition) %>% mutate(
    predicted_mass_g_gDCW = predicted_mass_g_gDCW/n_reactions,
    mass_g_gDCW = mass_g_gDCW/n_reactions
  ) %>%
  
  # summarize by summing up protein abundance per reaction (NA treated as zero)
  group_by(condition, reaction_id, reaction_name, replicate) %>% 
  summarize(
    predicted_mass_g_gDCW = sum(predicted_mass_g_gDCW, na.rm = TRUE),
    measured_mass_g_gDCW = sum(mass_g_gDCW, na.rm = TRUE),
    COG_Process = if_else(is.na(COG_Process), "Other", COG_Process),
  ) %>%
  
  # adding mass fraction
  group_by(condition, replicate) %>% 
  mutate(
    predicted_mass_fraction = predicted_mass_g_gDCW/sum(predicted_mass_g_gDCW, na.rm = TRUE),
    measured_mass_fraction = measured_mass_g_gDCW/sum(measured_mass_g_gDCW, na.rm = TRUE)
  ) %>%
  
  # trim COG Processes to most important ones
  mutate(COG_Process = replace(COG_Process, 
      !grepl("Amino|Nucleot|Coenz|Energy|Transla", COG_Process), "Other"
    )
  )

Now we perform a test. We check if all mass fractions per condition sum to unity as we would expect. And they do.

df_prot_comp %>% group_by(condition, replicate) %>%
  summarize(sum(measured_mass_fraction), sum(predicted_mass_fraction)) %>% head

Next, we plot the predicted versus actual protein abundance (mol fraction) summed up per reaction. The correlation between measured and predicted proteome is much higher when kapp values are used that were determined using the RBApy estim package (R2 > 0.5). These kapp are best fits for the measured proteome and the estimated flux disitrubtion obtained from flux sampling. When kapp values obtained from BRENDA database were used instead, the correlation was very low (R2 < 0.1).

plot_prot_comp <- df_prot_comp %>%
  
  # summarize protein quantification of 4 replicates to mean
  group_by(condition, reaction_id) %>%
  summarize(
    COG_Process = COG_Process[1],
    predicted_mass_fraction = mean(predicted_mass_fraction, na.rm = TRUE),
    measured_mass_fraction = mean(measured_mass_fraction, na.rm = TRUE)) %>%
  filter(!predicted_mass_fraction == 0, !measured_mass_fraction == 0) %>%
  
  xyplot(log10(predicted_mass_fraction) ~ log10(measured_mass_fraction) | condition, .,
    groups = COG_Process %>% substr(1, 15) %>% paste0("..."), par.settings = custom.colorblind(),
    xlab = expression("log"[10]*" measured mass fraction"), 
    ylab = expression("log"[10]*" predicted mass fraction"),
    as.table = TRUE, between = list(x = 0.5, y = 0.5),
    scales = list(alternating = FALSE), layout = c(4,1),
    pch = 19, alpha = 0.5, cex = 0.6, auto.key = list(columns = 3),
    ylim = c(-7, 0), xlim = c(-7, 0),
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.xyplot(x, y, ...)
      panel.abline(a = 0, b = 1, col = grey(0.3), lwd = 2, lty = 2)
      rsquared = summary(lm(y ~ x))$r.squared
      panel.text(-4.8, -0.8, labels = paste0("R^2 = ", round(rsquared, 3)), ...)
    }
  )

print(plot_prot_comp)

kapp correction for outliers

We can also export a table with overview about highly over-predicted or under-predicted enzyme abundance that can be used to correct kapp values and thus improve protein predictions. It is not necessary to include this section if the predictions are sufficiently good.

Variability analysis – random sampling of kapp

Quantifying utilized reactions

To estimate the flexibility of simulated metabolism and the flux variability of reactions, we can perform RBA model simulations with random sampling of enzyme efficiency. These simulations are performed for growth under a limiting substrate just as described in the previous section. But instead of using the fitted (optimal) values of kapp obtained from RBA estim (see Bulovic et al., 2019), kapp values were randomly sampled from a log normal distribution of kapp values. This distribution is centered around a mean of 4 and standard deviation of 1.1 and was obtained from the fitted (optimal) kapp distribution. This procedure is identical with the one published in O’Brien et al., 2016.

The resource allocation model was used to perform 200 simulations with randomly sampled kapp values for each of four conditions. The fluxes and protein abundances were predicted by the RBA model. Quantifying the average flux and the flux variation per reation can tell us then which reactions (and associated proteins) are more often used than others, or which ones are not used at all (i.e. not required under the simulated conditions). First flux simulation data is imported.

# import flux simulation data obtained from random kapp sampling
sampling_dir <- paste0(simulation_dir, c("fructose/", "succinate/", "formate/", "ammonium/"))
      
df_random_flux <- bind_rows(.id = "substrate",
  read_rba_result(list.files(sampling_dir[1], pattern = "fluxes_.*.tsv$", full.names = TRUE)),
  read_rba_result(list.files(sampling_dir[2], pattern = "fluxes_.*.tsv$", full.names = TRUE)),
  read_rba_result(list.files(sampling_dir[3], pattern = "fluxes_.*.tsv$", full.names = TRUE)),
  read_rba_result(list.files(sampling_dir[4], pattern = "fluxes_.*.tsv$", full.names = TRUE))
  ) %>%
  
  # tidy substrate names and rename some columns
  mutate(substrate = recode(substrate, 
    `1` = "fructose", `2` = "succinate", `3` = "formate", `4` = "ammonium")) %>%
  rename(flux = value, reaction_id = key)

The first step is to perform a saturation analysis. Each iteration of the random sampling uses a certain set of reactions (and associated proteins), but this set will often be similar as essential reactions are used all the time, while other reactions might only be used with a favorable efficiency. As we don’t know the real turnover number and saturation of each enzyme, this gives a more robust picture of enzyme utilization. After a number of simulations, no new reactions are used and the set of utilized reactions becomes ‘saturated’. This saturation is plotted as a function of iteration number. We can see the number of unique reactions increased quickly in the beginning and only marginally at the end, when no or very few ‘new’ reactions are utilized.

# summarize variation per reaction
plot_randsamp_iter <- df_random_flux %>% 
  
  # saturation of random sampling: at which iteration does reaction first appear?
  group_by(substrate, reaction_id) %>% 
  summarize(first_appearance = iteration[1]) %>%
  group_by(substrate, first_appearance) %>%
  summarize(reaction_ids = length(reaction_id)) %>%
  mutate(reaction_ids = cumsum(reaction_ids)) %>%
  
  xyplot(reaction_ids ~ first_appearance, .,
    par.settings = custom.colorblind(), pch = 19,
    groups = substrate,
    xlab = "N simulations", ylab = "N used reactions",
    type = "l", lwd = 2.5, cex = 0.7,
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.superpose(x, y, ...)
      panel.key(..., points = FALSE, corner = c(0.95, 0.05))
    }, panel.groups = function(x, y, ...) {
      panel.xyplot(c(rep(x, each = 2), 200), sort(c(y, y[-1], rep(tail(y, 1), 2))), ...)
    }
  )

print(plot_randsamp_iter)


We can also investigate how many reactions are maximally used under which condition, or taken together. And also, which reactions are always used and which are used condition specific (core versus non-core proteome, see O’Brien et al, 2016). The latter does not make much sense here, most reactions that are always used are used under all the tested conditions (similar to core proteome).

# number of all utilized reactions
df_random_flux %>% ungroup %>% pull(reaction_id) %>%
  unique %>% length
[1] 587
# number of reactions used in all simulations and limitations (core reactions)
df_random_flux %>%
  group_by(reaction_id) %>%
  summarize(n_simulations = length(iteration)) %>%
  filter(n_simulations == 800) %>% nrow
[1] 280
  

# which reactions are substrate specific (exclusive)?
df_random_flux %>%
  group_by(reaction_id) %>%
  summarize(n_substrates = length(unique(substrate))) %>%
  pull(n_substrates) %>% table
.
  1   2   3   4 
 28  28  37 494 

Overview on protein utilization

The utilization of proteins is a binary choice for every protein: it is either covered in any type of model simulation, or not. The situation gets a bit more complicated because the model solver would always choose the first of two annotated iso-enzymes per reaction (in case of an OR rule). In reality, several isoenzymes might catalyze the same reaction and both be expressed.

We therefore consider all proteins as utilized that are part of a reaction that carries flux in any simulation, either as essential subunit (AND rule) or as possible iso-enzyme (OR rule). All other enzymes are considered not utilized. Two more groups complete the picture, proteins that are part of molecular machinery (ribosomes, chaperones, DNA polymerase, RNA polymerase), or proteins that are not included by the model. Proteins that are part of molecular machines are obtained from the RBA model defintion. Non-modeled but experimentally quantified proteins are obtained from proteomics data.

# we collect utilized and non-utilized
utilized_enzymes <- df_model_reactions %>%
  filter(reaction_id %in% df_random_flux$reaction_id) %>%
  pull(genes) %>% unique

df_machinery <- bind_rows(
  read_tsv("../data/simulation/macro_machines/replication.tsv"),
  read_tsv("../data/simulation/macro_machines/transcription.tsv"),
  read_tsv("../data/simulation/macro_machines/ribosome.tsv"),
  read_tsv("../data/simulation/macro_machines/chaperones.tsv")
)
utilized_machinery <- df_machinery %>% pull(Entry)

# new df with annotated utilization
df_utilization <- Ralstonia_eutropha %>% 
  mutate(
    utilization = case_when(
      locus_tag %in% utilized_enzymes ~ "utilized\nenzymes",
      uniprot %in% utilized_machinery ~ "utilized\nmachinery",
      (locus_tag %in% df_model_reactions$genes) & !(locus_tag %in% utilized_enzymes) ~ "non-\nutilized\nenzymes",
      TRUE ~ "non-\nmodeled\nproteins"
    )
  )

# get summary of proteins per group (same for all conditions)
df_utilization %>% ungroup %>% filter(!duplicated(locus_tag)) %>% group_by(utilization) %>%
  summarize(n_proteins = length(locus_tag))

Now that proteins were mapped to reactions and utilization was flagged for each detected protein, we can summarize the actual protein mass (or mass fraction) occupied for each level of utilization.

df_util_summary <- df_utilization %>%
  
  # determine mass fraction per group
  group_by(condition, utilization, replicate) %>% 
  summarize(mass_g_gDCW = sum(mass_g_gDCW, na.rm = TRUE)) %>%
  mutate(utilization = utilization %>% factor(., unique(.)[c(1,4,3,2)]))

# summarize average mass in g/gDCW and % mass fraction over all conditions
df_util_summary %>% 
  group_by(utilization) %>%
  summarize(
    mass_g_gDCW = mean(mass_g_gDCW),
    mass_percent = mass_g_gDCW/0.68*100
  )

# plot
plot_utilization_bar <- xyplot(mass_g_gDCW ~ factor(utilization), df_util_summary,
    groups = condition, 
    stack = FALSE, cex = 0.6,
    par.settings = custom.colorblind(), ylim = c(0.001, 0.33),
    xlab = "", ylab = expression("m"[protein]*" [g gDCW"^-1*"]"),
    scales = list(alternating = FALSE, x = list(cex = 0.7)), 
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.barplot(x, y, beside = TRUE, fill_alpha = 1, fill = "white", lwd = 2, ...)
      panel.key(..., corner = c(0.98, 0.98), points = FALSE)
    }
  )

print(plot_utilization_bar)

Distribution of COG terms for non-modeled/non-utilized reactions

plot_COG_non_utilized <- df_utilization %>%
  
  # sum up mass fraction per COG group and all conditions
  group_by(COG_Process, utilization, replicate, condition) %>%
  summarize(mass_g_gDCW = sum(mass_g_gDCW, na.rm = TRUE)) %>%
  
  # determine mean mass fraction per COG process and utilization
  group_by(COG_Process, utilization) %>%
  summarize(mass_g_gDCW = mean(mass_g_gDCW)) %>% ungroup %>%
  mutate(COG_Process = paste0(str_sub(COG_Process, 1, 20), "..") %>% 
    fct_reorder(mass_g_gDCW, max)) %>%
  
  xyplot(COG_Process ~ factor(utilization), .,
    par.settings = custom.colorblind(),
    groups = COG_Process, as.table = TRUE, 
    col = custom.colorblind()$superpose.polygon$col[c(3,5)],
    pch = 19, alpha = 0.6, z = .[["mass_g_gDCW"]],
    xlab = "", ylab = "", scales = list(cex = 0.7), 
    panel = function(x, y, z, ...) {
      panel.abline(h = seq_along(unique(y)), v = seq_along(unique(x)), 
        col = grey(0.8), lty = 2)
      tactile::panel.bubbleplot(x, y, 80*z, ...)
      panel.key(labels = "0.01 g/gDCW ", corner = c(0.91, 0.11),
        point.cex = 80*0.01, pch = 19, col = grey(0.5), cex = 0.7, alpha = 0.8)
      panel.key(labels = "0.02 g/gDCW", corner = c(0.95, 0.05),
        point.cex = 80*0.02, pch = 19, col = grey(0.5), cex = 0.7, alpha = 0.8)
    }
  )
`summarise()` regrouping output by 'COG_Process', 'utilization', 'replicate' (override with `.groups` argument)
`summarise()` regrouping output by 'COG_Process' (override with `.groups` argument)
print(plot_COG_non_utilized)