This R notebook is a bioinformatics pipeline to process and analyze MS based peptide/protein abundance data for the chemolithoautotroph Ralstonia eutropha (a.k.a. Cupriavidus necator).
Proteomics data was obtained using the following work flow (to be added…).
# loading libraries
library(lattice)
library(latticeExtra)
library(latticetools)
library(tidyverse)
library(stringi)
Define the data source directories. Some of them are external in the sense of not included in the accompanying data folder of this R notebook. The main proteomics data is loaded from the R ShinyProt directory that can also be found on github and interactively browsed and searched in the ShinyProt web app.
Reutropha_proteomics <- "../data/input/Ralstonia_eutropha.Rdata"
load(Reutropha_proteomics)
In total, the following number of proteins was quantified, out of theoretical total of 6,614 proteins (Uniprot reference genome, July 31 2020). That represents roughly 81 % by number and much more by mass (see below for estimation)
n_quantified_prot <- Ralstonia_eutropha %>% pull(uniprot) %>% unique %>% length
print(n_quantified_prot)
[1] 5357
print(n_quantified_prot/6614*100)
[1] 80.99486
We can also estimate the coverage in terms of protein mass, by assuming that the 1257 missing proteins are of average mass or lower than average mass than the detected proteins. The following simple calculation simulates missing protein mass with an average abundance of the lower quantile of detected proteins. For this purpose we simply pick a standard condition, such as growth on fructose.
quantified_protein <- Ralstonia_eutropha %>%
# pick a certain condition
filter(substrate == "fructose", growthrate == 0.25) %>%
pull(mean_intensity)
# determine quantiles of raw quantification intensity
quantified_protein %>% quantile(na.rm = TRUE)
0% 25% 50% 75% 100%
7.244750e+04 8.117648e+07 3.395687e+08 1.480863e+09 4.888941e+11
Now we just simulate that the 1257 non-detected proteins have an average mass similar to that of the protein with 25% lowest abundance. The missing protein abundance will then sum up to less 1% of the total estimated proteome, meaning we can detect more than 99% of the proteome by mass.
missing_protein <- 1257 * quantile(na.rm = TRUE, quantified_protein)[2] %>% as.numeric()
missing_protein_percent <- missing_protein/(missing_protein + sum(quantified_protein, na.rm = TRUE))
paste("missing protein in % total mass:", round(missing_protein_percent*100, 3))
[1] "missing protein in % total mass: 0.88"
plot_quant_pep <- xyplot(sort(n_peptides, decreasing = TRUE) ~
1:length(protein),
filter(Ralstonia_eutropha, substrate == "fructose", growthrate == 0.25),
xlab = "protein", ylab = "n peptides",
par.settings = custom.lattice,
ylim = c(0, 80), xlim = c(0, 5500),
panel = function(x, y, ...) {
panel.grid(h = -1, v = -1, col = grey(0.9))
panel.barplot(x, y, col = NA, fill = grey(0.8), fill_alpha = 1, ewidth = 0.6)
xhalf = length(unique(x))/2
panel.lines(x = c(0, xhalf, xhalf), y = c(y[xhalf], y[xhalf], 0), col = 1)
panel.text(x = 10, y = y[xhalf]*3, pos= 4, cex = 0.8,
labels = paste0(round(xhalf), " proteins with >= ", y[xhalf], " peptides"))
}
)
print(plot_quant_pep)
plot_quant_pep_2 <- Ralstonia_eutropha %>%
# rearrange n_peptides to have another type of overview
filter(substrate == "fructose", growthrate == 0.25) %>%
pull(n_peptides) %>% table %>% as_tibble %>%
rename(., pep = `.`, prot = n) %>% mutate(pep = as.numeric(pep)) %>%
# plot
xyplot(prot ~ pep, .,
xlab = "n peptides", ylab = "n proteins", ylim = c(-5, 1155),
par.settings = custom.lattice, xlim = c(0, 80),
panel = function(x, y,...) {
panel.grid(h = -1, v = -1, col = grey(0.9))
panel.barchart(x, y, horizontal = FALSE, box.width = 1,
border = NA, col = grey(0.8), ...)
}
)
print(plot_quant_pep_2)
plot_quant_prot1 <- Ralstonia_eutropha %>%
# protein quantifications per replicate
gather(replicate, raw_intensity, R1:R4) %>%
group_by(substrate, growthrate, replicate) %>%
summarize(quant_proteins = sum(!is.na(raw_intensity))) %>%
# and plot
xyplot(quant_proteins ~ 1:length(quant_proteins), .,
xlab = "sample", ylab = "quantified proteins",
par.settings = custom.lattice,
ylim = c(0, 5500), xlim = c(0, 81),
panel = function(x, y,...) {
panel.grid(h = -1, v = -1, col = grey(0.9))
panel.barplot(x, y, col = NA, fill = grey(0.8),
fill_alpha = 1, ewidth = 0.5)
}
)
print(plot_quant_prot1)
plot_quant_prot2 <- Ralstonia_eutropha %>%
# protein quantifications per replicate
gather(replicate, raw_intensity, R1:R4) %>%
group_by(protein) %>%
summarize(quant_in_runs = sum(!is.na(raw_intensity))) %>%
pull(quant_in_runs) %>% table %>% enframe %>%
# and plot
xyplot(value ~ factor(name), .,
xlab = "in number of runs",
ylab = "quantified proteins",
par.settings = custom.lattice,
ylim = c(0, 3000), xlim = c(0, 79),
panel = function(x, y,...) {
panel.grid(h = -1, v = -1, col = grey(0.9))
panel.barplot(x, y, col = NA, fill = grey(0.8),
fill_alpha = 1, ewidth = 0.5)
panel.abline(v = 70, col = 1)
panel.text(x = 25, y = 1500, pos= 4, cex = 0.8,
labels = paste0(sum(y[70:79]), " proteins quantified\nin > 70 out of 80 runs"))
}
)
print(plot_quant_prot2)
Raw intensity here is the dimensionless MS ‘intensity’, that means the quantified area under the curve of MS1 spectra for peptides, summed up per protein. One replicate that was missing for condition Fructose, growth rate 0.1, R2, was temporarily replaced by R1 for this plot, because densityplot was otherwise giving an error message (because of missing values).
densityplot(~ log10(R1) + log10(R2) + log10(R3) + log10(R4) | condition,
Ralstonia_eutropha %>% mutate(R2 = case_when(
condition == "FRC 0.1" ~ R1, TRUE ~ R2)),
auto.key = list(columns = 4), layout = c(5, 4),
par.settings = custom.colorblind(), xlab = "log10 intensity",
scales = list(alternating = FALSE), as.table = TRUE,
panel = function(x, ...) {
panel.grid(h = -1, v = -1, col = grey(0.9))
panel.superpose(x, ...)
},
panel.groups = function(x, ...) {
panel.densityplot(x, plot.points = FALSE, ...)
panel.abline(v = median(x, na.rm = TRUE), lty = 2, col = grey(0.5))
}
)
Log 10 median intensity versus log 10 CV.
library(hexbin)
hexbinplot(log10(CV) ~ log10(median_intensity) | condition,
Ralstonia_eutropha,
layout = c(5, 4),
par.settings = custom.colorblind(),
scales = list(alternating = FALSE), aspect = 0.9,
ylim = c(-3, 0.5),
colramp = colorRampPalette(custom.lattice()$superpose.polygon$col[3:1])
)
A densityplot of the coeffcient of variation (CV) for the four replicates per protein, broken down by sample. This result shows that the variation is considerably higher than an ‘ideal’ MS-based proteomics experiment, where average CV can be as low as 10%. Here, variation is high as 50%.
densityplot(~ CV | condition,
Ralstonia_eutropha,
as.table = TRUE, lwd = 2,
par.settings = custom.colorblind(), pch = ".",
panel = function(x, ...) {
panel.grid(h = -1, v = -1, col = grey(0.9))
panel.densityplot(x, ...)
panel.ablineq(v = median(x, na.rm = TRUE), adj = -0.1,
lty = 2, col = grey(0.5), fontfamily = "FreeSans")
}
)
We can determine the overall similarity of samples (replicates and conditions) towards each other. A simple strategy for this is to use PCA or nMDS, the latter is an iterative and therefore not fully deterministic approach as it depends on the starting coordinates. However it is useful to compare how different samples or replicates ‘cluster’ together. Two replicates need to be removed from the data before, one corrupt sample that is missing (FRC_0.1_R2
), and one that is an outlier (NLIM_0.05_R1
).
The strategy for nMDS is to reshape all measurements per sample into a matrix and compute the ‘distance’ by a default measure. nMDS then tries to arrange each sample as a dot on a plane, taking optimal the distance to its neighbors into account. This approach might not give a perfect result and may contain contradictions indicated by the stress
level.
# load required libraries
library(dendextend)
library(vegan)
# first need to rearrange raw data so that we obtain a 'wide' table/matrix
dist_mat <- Ralstonia_eutropha %>% select(uniprot, condition, R1:R4) %>%
gather(replicate, intensity, R1:R4) %>%
unite(condition, condition, replicate) %>%
spread(condition, intensity) %>%
# remove missing/outlier samples and coerce to matrix
select(-uniprot, -all_of(c("FRC 0.1_R2", "NLIM 0.05_R1"))) %>%
as.matrix %>% t
# plot sample similarity as dendrogram
plot_cols <- custom.colorblind()$superpose.polygon$col[1:4]
cluster <- hclust(dist(dist_mat), method = "ward.D2")
plot(color_branches(cluster, col = rep(plot_cols, each = 20)[-c(26, 41)][cluster$order]))
We can see that many replicates cluster nicely together, samples also cluster predominantly by carbon/nitrogen limitation. For example, many samples from formic acid and nitrogen limitation cluster on the left side, and many samples from fructose cluster on the right side.
library(tactile)
# run nMDS analysis
NMDS <- dist_mat %>% dist %>% metaMDS
Run 0 stress 0.1716011
Run 1 stress 0.1702878
... New best solution
... Procrustes: rmse 0.03510426 max resid 0.2803567
Run 2 stress 0.1716398
Run 3 stress 0.1716012
Run 4 stress 0.1702877
... New best solution
... Procrustes: rmse 0.0001494851 max resid 0.001203133
... Similar to previous best
Run 5 stress 0.1702878
... Procrustes: rmse 0.0001786291 max resid 0.000834936
... Similar to previous best
Run 6 stress 0.1698424
... New best solution
... Procrustes: rmse 0.01435114 max resid 0.1129227
Run 7 stress 0.1716401
Run 8 stress 0.2045207
Run 9 stress 0.1702882
... Procrustes: rmse 0.01445371 max resid 0.1129753
Run 10 stress 0.199167
Run 11 stress 0.1702878
... Procrustes: rmse 0.01441851 max resid 0.1129971
Run 12 stress 0.1702881
... Procrustes: rmse 0.01444127 max resid 0.1129806
Run 13 stress 0.1716397
Run 14 stress 0.1702881
... Procrustes: rmse 0.01444245 max resid 0.1129879
Run 15 stress 0.1702879
... Procrustes: rmse 0.01442853 max resid 0.1129939
Run 16 stress 0.1702879
... Procrustes: rmse 0.01432394 max resid 0.1131321
Run 17 stress 0.1716013
Run 18 stress 0.1716398
Run 19 stress 0.1702877
... Procrustes: rmse 0.01439955 max resid 0.1130152
Run 20 stress 0.1716396
*** No convergence -- monoMDS stopping criteria:
20: stress ratio > sratmax
# and plot result
df_nmds <- NMDS$points %>% as_tibble(rownames = "condition") %>%
separate(condition, into = c("condition", "growth_rate", "replicate"), sep = "[ _]") %>%
mutate(condition = recode(condition, FA = "formate",
NLIM = "ammonium", FRC = "fructose", SUC = "succinate")) %>%
mutate(across(matches("MDS[12]"), function(x) x/10^11)) %>%
mutate(growth_rate = as.numeric(growth_rate)*7.5)
plot_nmds <- xyplot(MDS2 ~ MDS1, df_nmds,
groups = condition,
pch = 19, size = df_nmds$growth_rate, alpha = 0.7,
par.settings = custom.colorblind(),
panel = function(x, y, size, ...) {
panel.grid(h = -1, v = -1, col = grey(0.9))
panel.bubbleplot(x, y, z = size, ...)
panel.key(..., cex = 0.6, corner = c(0.95, 0.05), points = FALSE)
}
)
print(plot_nmds)
This is subfiure Figure 1 B of the manuscript:
plot_prot_per_chrom <- Ralstonia_eutropha %>% ungroup %>%
# filter for one growth rate and one chromosome only
filter(growthrate == 0.25) %>%
# add a rolling mean for every condition
group_by(substrate) %>% mutate(
roll_massfraction = zoo::rollapply(
median_mass_fraction, 5, function(x) mean(x, na.rm = TRUE),
partial = TRUE)) %>%
# sort by start position of gene
arrange(start) %>%
mutate(seq_type = seq_type %>% str_replace("chromosome", "Chr") %>%
str_replace("plasmid", "pHG1") %>% factor(., unique(.)[c(2,3,1)])) %>%
xyplot(roll_massfraction*100 ~ start/1000 | seq_type, .,
par.settings = custom.colorblind(),
layout = c(1, 3), between = list(x = 0.5, y = 0.5),
groups = substrate, type = "l", as.table = TRUE,
alpha = 0.8, xlab = "genome position [kbp]", ylab = "% protein mass fraction",
xlim = c(0, 4.05e3), ylim = c(-0.05, 0.55),
scales = list(alternating = FALSE),
strip.left = TRUE, strip = FALSE,
panel = function(x, y, ...) {
panel.grid(h = -1, v = -1, col = grey(0.9))
panel.xyplot(x, y, ...)
panel.key(..., cex = 0.7, points = FALSE, lines = TRUE,
corner = c(0.99, 0.9), which.panel = 3)
}
)
print(plot_prot_per_chrom)
null device
1
First we construct a simple generic plot function to plot different sets of genes.
plot_genes <- function(dat, y_lim, y_lab, key = TRUE) {
xyplot(median_mass_fraction*100 ~ protein, dat,
par.settings = custom.colorblind(),
between = list(x = 0.5, y = 0.5),
error_margin = dat$sd_massfraction*100,
groups = substrate, as.table = TRUE, lwd = 1.5,
xlab = "", ylab = y_lab,
ylim = y_lim,
scales = list(alternating = FALSE, x = list(rot = 30, cex = 0.6)),
panel = function(x, y, ...) {
panel.grid(h = -1, v = -1, col = grey(0.9))
panel.barplot(x, y, beside = TRUE, ...)
if (key) panel.key(..., cex = 0.7, pch = 15,
corner = c(0.95, 0.95))
}
)
}
Then we plot first all CBB genes. This is a special case because the two copies of the CBB operon on chromsome 2 and mega plasmid are virtually identical, regarding the protein sequence. The MS quantification can therefore not differentiate from which gene the detected peptide was expressed. We therefore combine mass fractions for both copies of the CBB operon, and also simply combine the standard deviation measurements (by summing them up).
df_cbb <- Ralstonia_eutropha %>% ungroup %>%
# filter for one growth rate only
filter(growthrate == 0.25, grepl("cbb", protein)) %>%
# abbreviate gene names and eliminate P(lasmid) and C(hromosomal) suffixes
mutate(protein = stri_extract_first_regex(protein, "cbb[A-Z0-9]")) %>%
# combine median and sd mass fraction for the 2 copies of each protein
group_by(protein, substrate) %>%
summarize(
median_mass_fraction = sum(median_mass_fraction),
sd_massfraction = sum(sd_massfraction)
) %>%
# arrange by order of genes in operon
mutate(protein = factor(protein, c("cbbB", "cbbA", "cbbK", "cbbG", "cbbZ",
"cbbT", "cbbP", "cbbF", "cbbE", "cbbY", "cbbX", "cbbS", "cbbL", "cbbR")))
# plot
plot_cbb <- plot_genes(df_cbb, y_lim = c(-0.04, 4.14), y_lab = "% protein mass fraction")
The other set that would be interesting to look at are (formate de-) hydrogenases, responsible for NADH reduction when Ralstonia grows on hydrogen or formate as energy source. Presumable formate dehydrogenases are present in R. eutropha in several operons mostly on different chromosomes (see Cramm et al., 2008). This folowwing is extracted from the review article:
Interesting side not in Cramm et al.: “S-FDH is formed only in formate-induced cells, M-FDH activity was detectable under various growth conditions [Burgdorf et al., 2001]”
df_fdh <- Ralstonia_eutropha %>% ungroup %>%
# filter for one growth rate
filter(growthrate == 0.25, grepl("fds|fdw|fdh|fdo", protein)) %>%
# abbreviate gene names and eliminate P(lasmid) and C(hromosomal) suffixes
mutate(protein = stri_extract_first_regex(protein, "fd[swho][A-Z0-9]+") %>%
stri_replace_first(replacement = "fdsC", regex = "fdhD$")) %>%
# arrange by order of genes in operon
ungroup %>% arrange(locus_tag) %>%
mutate(protein = factor(protein, unique(protein)))
# add missing information for fdoG
df_fdh[df_fdh$protein == "fdoG", "seq_type"] <- "chromosome 2"
df_fdh[df_fdh$protein == "fdoG", "strand"] <- "+"
df_fdh[df_fdh$protein == "fdoG", "start"] <- 1626225
df_fdh[df_fdh$protein == "fdoG", "end"] <- 1629314
plot_fdh <- plot_genes(df_fdh, y_lim = c(-0.01, 1.11), y_lab = "% protein mass fraction")
Add small genome map plots for cbb and other operons. For this purpose we construct a generic plotting function for genes as boxed arrows on a genome (line).
plot_genome <- function(df, xlim = NULL) {
if (!is.null(xlim))
xscale = list(limits = xlim)
else
xscale = list()
xyplot(end ~ start, df,
groups = strand, cex = 0.5, lwd = 1,
par.settings = custom.colorblind(),
scales = list(draw = FALSE, x = xscale),
ylim = c(-3,2), xlab = "", ylab = "",
gene_strand = df[["strand"]],
gene_name = df[["protein"]],
panel = function(x, y, ...) {
panel.geneplot(x, y, arrows = TRUE, tip = 200, ...)
}
)
}
Then we plot the CBB (only one, the chromosomal) and FDH operons (4 in total).
# plot CBB operon on chromosome 2
gene_plot_1 <- Ralstonia_eutropha %>%
filter(start > 1548000, start < 1564000,
seq_type == "chromosome 2", condition == "FA 0.05") %>%
# trim names again
mutate(protein = stri_extract_first_regex(protein, "cbb[A-Z0-9]|cfxP")) %>%
plot_genome(xlim = c(1549500, 1564500))
# plot formate dehydrogenase operons
df_fdh <- filter(df_fdh, !duplicated(protein))
gene_plot_2 <- df_fdh %>% plot_genome(xlim = c(677500, 685600)) # fds operon, S-FDH 1
gene_plot_3 <- df_fdh %>% plot_genome(xlim = c(1930000, 1935200)) # fdw operon, S-FDH 2
gene_plot_4 <- df_fdh %>% plot_genome(xlim = c(3169100, 3175000)) # fdh operon, M-FDH 1
gene_plot_5 <- df_fdh %>% plot_genome(xlim = c(1625500, 1632500)) # fdo operon, M-FDH 2
gene_plot_6 <- df_fdh %>% plot_genome(xlim = c(1648500, 1652300)) # fdh2 operon, M-FDH 1
Just for interest, we can also have a look at expression of hox and hyp gene operons. HoxFUYH and HoxKGZ genes encode soluble and membrane-bound hydrogenases. Hyp genes are accessory proteins. We can plot main hox operons and their vicinity.
df_hyd <- Ralstonia_eutropha %>%
filter(seq_type == "plasmid", growthrate == 0.25) %>%
# arrange by order of genes in operon
ungroup %>% arrange(locus_tag) %>%
mutate(protein = factor(protein, unique(protein)))
# hoxFUYH = S-HYD
hyd_plot_1 <- df_hyd %>% filter(start > 75000, start < 86000) %>%
plot_genes(, y_lim = c(0, 1.3), y_lab = "% protein mass fraction")
# hoxKGZ, MB-HYD
hyd_plot_2 <- df_hyd %>% filter(start > 0, start < 7000) %>%
plot_genes(, y_lim = c(0, 1.3), y_lab = "% protein mass fraction")
print(hyd_plot_1, position = c(0, 0.45, 1, 1), more = TRUE)
print(hyd_plot_2, position = c(0, 0, 1, 0.55))
Supplemental figure 2: proteome coverage
print(plot_quant_prot1, position = c(0, 0.64, 0.45, 1), more = TRUE)
print(plot_quant_pep_2, position = c(0, 0.32, 0.45, 0.68), more = TRUE)
print(plot_nmds, position = c(0.03, 0, 0.45, 0.36), more = TRUE)
print(gene_plot_1, position = c(0.48, 0.8, 1, 0.97), more = TRUE)
print(plot_cbb, position = c(0.41, 0.47, 1.04, 0.85), more = TRUE)
print(gene_plot_2, position = c(0.48, 0.33, 0.8, 0.5), more = TRUE)
print(gene_plot_4, position = c(0.77, 0.33, 1, 0.5), more = TRUE)
print(plot_fdh, position = c(0.39, 0, 1.04, 0.38), more = TRUE)
grid::grid.text(label = c("A", "B", "C", "D", "E"), x = c(0.03, 0.03, 0.03, 0.45, 0.45),
y = c(0.98, 0.66, 0.33, 0.98, 0.48))
null device
1