1 Description

This R notebook is a bioinformatics pipeline to analyze fitness data obtained from a barcoded transposon library in Ralstonia eutropha a.k.a. Cupriavidus necator. For background and details regarding the method, see Wetmore at al., mBio, 2015 and Price et al., Nature, 2018).

2 Libraries

# optionally install repos from github
# devtools::install_github("m-jahn/lattice-tools")
# devtools::install_github("m-jahn/R-tools")
library(lattice)
library(latticeExtra)
library(latticetools)
library(tidyverse)
library(dendextend)
library(Rtools)
library(colorspace)
library(stringi)

3 Overview of barcode/transposon read counts

3.1 Data import and processing

Read in the main data tables with A) reads per barcode and sample (‘pool counts’), and B) the fitness tables. Tables were obtained by processing sequencing data with a custom BarSeq pipeline. The 32 generation sequencing samples are removed due to the low read count in the continuous samples.

# import barseq counts data in wide format and reshape to long format
df_counts_frc <- read_tsv("../../../rebar/data/20201222_barseq_frc/results/result.poolcount") %>%
  select(!matches("32gen|_32_")) %>%
  pivot_longer(
    cols = !all_of(c("barcode", "rcbarcode", "scaffold", "strand", "pos")), 
    names_to = "sample", values_to = "n_reads")

── Column specification ───────────────────────────────────────────────────────────────────────────────────────────
cols(
  .default = col_double(),
  barcode = col_character(),
  rcbarcode = col_character(),
  scaffold = col_character(),
  strand = col_character()
)
ℹ Use `spec()` for the full column specifications.
df_counts_suc <- read_tsv("../../../rebar/data/20210407_barseq_suc_for/results/result.poolcount") %>%
  pivot_longer(
    cols = !all_of(c("barcode", "rcbarcode", "scaffold", "strand", "pos")), 
    names_to = "sample", values_to = "n_reads")

── Column specification ───────────────────────────────────────────────────────────────────────────────────────────
cols(
  .default = col_double(),
  barcode = col_character(),
  rcbarcode = col_character(),
  scaffold = col_character(),
  strand = col_character()
)
ℹ Use `spec()` for the full column specifications.
# merge barcode counts tables
df_counts <- bind_rows(df_counts_frc, df_counts_suc)

# import fitness data, the final output of the BarSeq pipeline
load("../../../rebar/data/20201222_barseq_frc/results/fitness_gene.Rdata")
df_fitness_frc <- fitness_gene %>%
  filter(condition != "long pulse", time != 32) %>%
  mutate(ID = as.numeric(ID), substrate = "fructose", 
    condition = str_remove(condition, "short "))

load("../../../rebar/data/20210407_barseq_suc_for/results/fitness_gene.Rdata")
df_fitness_suc <- fitness_gene %>%
  separate(condition, sep = "_", into = c("substrate", "condition"))

# merge fitness tables
df_fitness <- bind_rows(df_fitness_frc, df_fitness_suc) %>%
  rename(locus_tag = locusId)
rm("df_fitness_frc", "df_fitness_suc", "df_counts_frc", "df_counts_suc")

# import genome annotation
df_ref <- read_csv("../data/ref/Ralstonia_H16_genome_annotation.csv") %>%
  filter(!duplicated(locus_tag)) %>%
  mutate(eggNOG_name = if_else(is.na(eggNOG_name), gene_name, eggNOG_name))

── Column specification ───────────────────────────────────────────────────────────────────────────────────────────
cols(
  .default = col_character(),
  length = col_double(),
  start = col_double(),
  end = col_double(),
  feature_interval_length = col_double(),
  Psortb_score = col_double()
)
ℹ Use `spec()` for the full column specifications.
# define standard colors
stdcol <- custom.colorblind()$superpose.line$col

3.2 Summary statistics

Overview about the number of reads per barcode, barcodes per gene and so on. Around 8-10 M reads were mapped on average, per sample.

# Number of total mapped reads
df_counts %>% group_by(sample) %>%
  summarize(n_million_reads = sum(n_reads)/10^6) %>%
  barchart(factor(sample) ~ n_million_reads, .,
    par.settings = custom.colorblind(),
    horizontal = TRUE, border = NULL,
    scales = list(y = list(cex = 0.7)),
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.barchart(x, y, ...)
    }
  )

Distribution of number of reads per barcode. There is a sufficient number of reads for quantification, on average log2(n) = 5 = 32 reads per barcode.

plot_reads_per_bc <- histogram(~ log2(n_reads) | sample,
  df_counts, as.table = TRUE, layout = c(8,6),
  par.settings = custom.colorblind(),
  between = list(x = 0.5, y = 0.5),
  xlab = expression("log"[2]*" reads per barcode"),
  scales = list(alternating = FALSE),
  panel = function(x, ...) {
    panel.grid(h = -1, v = -1, col = grey(0.9))
    panel.histogram(x, border = "white", ...)
  }
)

print(plot_reads_per_bc)

Similarly to the above, this is an overview about the number of barcodes per gene as a histogram. This distribution is the same for all conditions and replicates. The second plot is the number of reads per gene, averaged as median over all conditions (excluding 0 time point where counts were averaged by BarSeq pipeline). The average reads per gene are log2(n) = 8, which translates to n = 2^8 or around 256 reads per gene.

plot_reads_per_bc <- histogram(~ strains_per_gene,
  df_fitness %>% select(locus_tag, strains_per_gene) %>% 
    distinct %>% filter(strains_per_gene < 40), 
  par.settings = custom.colorblind(), breaks = 20, xlim=c(-2, 42),
  xlab = expression("mutants per gene"),
  panel = function(x, ...) {
    panel.grid(h = -1, v = -1, col = grey(0.9))
    panel.histogram(x, border = "white", ...)
    panel.ablineq(v = mean(x, na.rm = TRUE),
      fontfamily = "FreeSans", col = grey(0.5), lwd = 2, lty = 2)
  }
)
Adding missing grouping variables: `scaffold`, `date`, `time`, `ID`, `condition`, `replicate`
plot_reads_per_gene <- histogram(~ log2(reads_per_gene_median),
  df_fitness %>% filter(time != 0) %>% group_by(locus_tag) %>%
    summarize(reads_per_gene_median = median(counts)),
  par.settings = custom.colorblind(), breaks = 20,
  xlab = expression("log"[2]*" reads per gene (med)"),
  panel = function(x, ...) {
    panel.grid(h = -1, v = -1, col = grey(0.9))
    panel.histogram(x, border = "white", ...)
    panel.ablineq(v = mean(x[!is.infinite(x)]),
      fontfamily = "FreeSans", col = grey(0.5), lwd = 2, lty = 2)
  }
)

print(plot_reads_per_bc, split = c(1,1,2,1), more = TRUE)
print(plot_reads_per_gene, split = c(2,1,2,1))

4 Gene fitness analysis

4.1 Depletion over time (generations)

We can plot log2 FC or normalized gene fitness over generations. For this type of overview it is best to summarize individual replicates (4x) to the mean or median, per time point and condition. We also add genome annotation to the summary table. The plots shows that depletion of some strains is so strong already at 8 generations that fitness/log2 FC could not be quantified for 16 generations due to missing read counts. It is best to focus on 8 generations as because it provides a more complete picture.

df_fitness_summary <- df_fitness %>%
  group_by(locus_tag, scaffold, time, condition, substrate, strains_per_gene) %>%
  summarize(
    norm_gene_fitness_median = median(norm_gene_fitness, na.rm = TRUE),
    log2FC_median = median(log2FC, na.rm = TRUE),
    tstat_median = median(t, na.rm = TRUE)
  ) %>%
  left_join(df_ref)
`summarise()` has grouped output by 'locus_tag', 'scaffold', 'time', 'condition', 'substrate'. You can override using the `.groups` argument.
Joining, by = "locus_tag"
plot_hist_log2FC <- df_fitness_summary %>%
  filter(all(!is.infinite(log2FC_median))) %>%
  group_by(condition, substrate) %>%
  slice(1:2000) %>%
  
  xyplot(log2FC_median ~ time | substrate * condition, .,
    groups = locus_tag, as.table = TRUE,
    col = stdcol[1], alpha = 0.2, layout = c(3, 2),
    xlab = "", ylab = expression("log"[2]*" FC"),
    par.settings = custom.colorblind(), type = c("l"),
    between = list(x = 0.5, y = 0.5),
    scales = list(alternating = FALSE), lwd = 2,
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.xyplot(x, y, ...)
    }
  )

plot_hist_normfg <- df_fitness_summary %>%
  filter(all(!is.infinite(log2FC_median))) %>%
  group_by(condition, substrate) %>%
  slice(1:2000) %>%
  
  xyplot(norm_gene_fitness_median ~ time | substrate * condition, .,
    groups = locus_tag, as.table = TRUE,
    col = stdcol[2], alpha = 0.2, layout = c(3, 2),
    xlab = "generations", ylab = "fitness",
    par.settings = custom.colorblind(), type = c("l"),
    between = list(x = 0.5, y = 0.5),
    scales = list(alternating = FALSE), lwd = 2,
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.xyplot(x, y, ...)
    }
  )

print(plot_hist_log2FC, position = c(0,0.47,1,1), more = TRUE)
print(plot_hist_normfg, position = c(0,0,1,0.53))

As sort of an internal control, we compare the gene fitness obtained by a complex procedure to the log2 FC of read counts, which is a very simple measure of ‘fitness’. The two variables correlate well all tested conditions and substrates.

df_fitness_summary %>%
  filter(time == 8, all(!is.infinite(log2FC_median))) %>%
  
  xyplot(norm_gene_fitness_median ~ log2FC_median |  substrate * condition, .,
    as.table = TRUE, col = stdcol[5], pch = 19,
    alpha = 0.4, cex = 0.6,
    layout = c(3, 2), xlim = c(-8,3), ylim = c(-6,5),
    xlab = expression("log"[2]*" FC"), ylab = "fitness",
    par.settings = custom.colorblind(),
    between = list(x = 0.5, y = 0.5),
    scales = list(alternating = FALSE),
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.xyplot(x, y, ...)
      panel.lmlineq(x, y, fontfamily = "FreeSans", r.squared = TRUE, lwd = 1.5, 
        col.text = 1, pos = 3, offset = 5, ...)
      panel.abline(a = 2, b = 1, col = grey(0.5), lty = 2, lwd = 1.5)
    }
  )

4.2 Comparing gene fitness between conditions

Similarly to Figure 2 in Wetmore et al, mBIO, 2015, we can investigate condition-dependent gene fitness by comparing conditions and substrates one-on-one. The correlation between substrates and growth regimes (continuous, pulsed) is quite different. The strongest correlation does exist for the pulsed vs continuous regime for each substrate (R = 0.64 to 0.87). And for the pulsed conditions, comparison between substrates (R = 0.62 to 0.72).

df_fitness_comp <- df_fitness_summary %>% filter(time == 8) %>%
  group_by(locus_tag) %>% mutate(tstat_median = min(tstat_median)) %>%
  select(locus_tag, condition, substrate, norm_gene_fitness_median, gene_name, COG_Process, tstat_median) %>%
  unite(condition, condition, substrate) %>%
  pivot_wider(names_from = condition, values_from = norm_gene_fitness_median) %>%
  filter(!is.na(locus_tag))

custom_splom(df_fitness_comp %>% ungroup %>% 
  select(matches("conti|pulse")), col = grey(0.4, 0.4))

We can also compare selected conditions directly to identify genes enriched or depleted in several conditions.

# generalized plotting function
plot_fitness_comp <- function(data, vars){
  xyplot(get(vars[2]) ~ get(vars[1]), data,
    groups = abs(get(vars[1])-get(vars[2])) > 2, #tstat_median < -3
    par.settings = custom.colorblind(), col = stdcol[c(5,1)],
    pch = 19, alpha = 0.5, cex = 0.7, aspect = 1,
    xlim = c(-7, 4), ylim = c(-7, 4),
    xlab = vars[1], ylab = vars[2],
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.abline(v = 0, h = 0, col = grey(0.5), lty = 2, lwd = 1.5)
      panel.abline(a = 0, b = 1, col = grey(0.5), lty = 2, lwd = 1.5)
      panel.abline(a = 2, b = 1, col = grey(0.5), lty = 2, lwd = 1.5)
      panel.abline(a = -2, b = 1, col = grey(0.5), lty = 2, lwd = 1.5)
      panel.xyplot(x, y, ...)
      panel.key(...)
    }
  )
}

plot_fitness_comp(df_fitness_comp, c("continuous_formate", "pulse_formate")) %>%
  print(position = c(0,0.5,0.5,1), more = TRUE)
plot_fitness_comp(df_fitness_comp, c("continuous_fructose", "pulse_fructose")) %>%
  print(position = c(0.5,0.5,1,1), more = TRUE)
plot_fitness_comp(df_fitness_comp, c("continuous_succinate", "pulse_succinate")) %>%
  print(position = c(0,0,0.5,0.5), more = TRUE)
plot_fitness_comp(df_fitness_comp, c("continuous_formate", "continuous_fructose")) %>%
  print(position = c(0.5,0,1,0.5), more = TRUE)

5 Differential fitness of selected gene sets

Most genes correlate in fitness value between conditions. That means, only some genes have a condition-specific fitness effect, i.e. increase or decrease fitness of the respective strain in one substrate or growth regime specifically. This section investigates specific genes and their functions that show such differential fitness between conditions. A simple comparison of one condition versus another will not be helpful as there two many possible combinations. To identify interesting sets of genes, cluster analysis can be performed.

# generate colorpalette for heatmap
heat_cols <- diverging_hcl(n = 7, h = c(255, 12), c = c(50, 80), l = c(20, 97), power = c(1, 1.3))
#heat_cols <- diverging_hcl(n = 7, h = c(340, 128), c = c(60, 80), l = c(30, 97), power = c(0.8, 1.5))
#heat_cols <- diverging_hcl(n = 7, h = c(300, 128), c = c(30, 65), l = c(20, 95), power = c(1, 1.4))

# create a matrix from wide fitness data for plotting heatmap
mat_heatmap <- df_fitness_comp %>% ungroup %>%
  filter(if_any(.cols = matches("conti|pulse"), ~ !between(., -2, 2))) %>%
  select(matches("locus_tag|conti|pulse")) %>%
  # filter out NA rows, and replace extreme values
  drop_na %>% mutate(across(matches("conti|pulse"), 
    function(x) {y = replace(x, x > 6, 6); replace(y, y < -6, -6)})) %>%
  column_to_rownames(var = "locus_tag") %>%
  as.matrix

# create cluster for reordering
mat_cluster <- mat_heatmap %>% dist %>% hclust(method = "ward.D")
mat_heatmap <- mat_heatmap[order.dendrogram(as.dendrogram(mat_cluster)), c(1,4,2,5,3,6)]

# plot heatmap
plot_heatmap <- levelplot(mat_heatmap,
  par.settings = custom.colorblind(),
  col.regions = colorRampPalette(heat_cols)(16),
  at = seq(-6, 6, 1), aspect = "fill",
  xlab = "", ylab = "", scales = list(x = list(draw = FALSE)),
  panel = function(x, y, z, ...) {
    panel.levelplot(x, y, z, ...)
    panel.abline(h = 1:5+0.5, col = "white", lwd = 1.5)
  }
)

print(plot_heatmap)

Clustering and silhouette analysis revealed that we have 4 to 7 clusters with acceptable separation, see the plot below. Of these, several clusters stick out:

  1. mutants specifically depleted in formate conditions = essential for formate
  2. mutants depleted in all conditions = essential in minimal medium
  3. mutants enriched in fructose growth, neutral on other substrates
  4. mutants enriched in continuous regime, mostly neutral in pulsed regime
  5. mutants slightly depleted in continuous succinate - low priority
  6. mutants slightly depleted in continuous fructose - low priority
  7. mutants slightly depleted in various conditions - low priority
silhouetteResult <- Rtools::silhouette_analysis(mat_heatmap, mat_cluster, 2:20)

Call:
hclust(d = ., method = "ward.D")

Cluster method   : ward.D 
Distance         : euclidean 
Number of objects: 310 

Silhouette analysis finished for clusters 2 to 20
silhouetteResult$plot.summary


# plot colored dendrogram
plot(color_branches(
  mat_cluster, k = 7,
  groupLabels = TRUE,
  col = stdcol[1:7]
))


# collect names of enriched/depleted genes
list_enriched <- vegan::cutreeord(mat_cluster, k = 7)
Registered S3 method overwritten by 'vegan':
  method     from      
  rev.hclust dendextend
list_enriched %>% table
.
 1  2  3  4  5  6  7 
34 40 14 27 80 67 48 

5.1 Enrichment of mutants over time (increased fitness)

One of the most puzzling results is the appearance of (very) fast growing mutants in some conditions. These mutants quickly enrich over only 16 generations to a volume of up to 20% of all library mutants, they “take over the culture”. This phenomenon was not observed previously in experiments with the Synechocystis sp. PCC6803 CRISPRi repression library (Yao et al., Nature Communications, 2020). The CRISPRi library is based on the same principle of depletion and enrichment of mutants depending on the associated fitness.

5.1.0.1 Strains enriched on fructose

What is the identity of the enriched genes? Which ones are enriched in several conditions? First we identify the mutants highly and specifically enriched on fructose, predominantly in the continuous growth regime.

df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 3))) %>%
  filter(time == 8) %>%
  barchart_fitness() %>%
  print(position = c(0,0,0.6,1), more = TRUE)

df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 3))) %>%
  linechart_fitness() %>%
  print(position = c(0.58,0,1,1))

Mechanisms enabling faster growth on fructose

The mechanism for increased growth on fructose seems clearly different from the mechanism for faster growth in all other (mainly continuous) conditions. For the fructose set, several genes are functionally linked suggesting a similar mechanism of action,

  1. Genes H16_B0517 (alcohol dehydrogenase) and H16_B1917 (aldehyde dehydrogenase): According to STRING DB, the genes are co-located i.e. either in direct proximity or in the same operon in alpha-, beta- and gamma-proteobacteria, and even cyanobcateria. BioCyc has 17 possible iso-enzymes in Cupriavidus for the 2-enzyme pathway “ethanol degradation 1”: ethanol –> acetaldehyde –> acetyl-CoA (reverse EtOH fermentation, obtain 1 NADH per reaction) catalyzed by both genes. What could be the mechanism for improved growth/yield on fructose? Chemostats are more yield- than growth selective, this pathway is probably functional and secretes EtOH as a fermentation product; KO of this pathway probably increases yield.

  2. Two genes are involved in nitrate respiration/reduction, PHG269 or narK, and H16_B2087, a NarL-family response regulator. NarK is a nitrate transporter, and NarL senses nitrate and then activates transcription of its targets by binding the DNA. According to this paper on NarL by Ruanto et al., 2020, “the Regulon DB database [11] for transcription regulation in E. coli lists 26 gene regulatory regions where NarL has a direct effect on transcript initiation, and it functions as an activator at 11 of these”. It is not clear what effect the KO of nitrate transport could have on growth or yield, nitrate is not part of the medium.

  3. Genes involved in translation/protein folding/sulphur metabolism:

    • H16_B2521, Alpha-ketoglutarate-dependent taurine dioxygenase; taurine is a sulphur carrier
    • H16_A2861, Glutathione S-transferase; Posttranslational modification, protein turnover, chaperones
    • H16_A1138, Thioredoxin; Posttranslational modification, protein turnover, chaperones
    • H16_A1336, yogA/tdcF, putative translation initiation inhibitor,yjg Ffamily; Translation, ribosomal structure and biogenesis
    • H16_A2553, cO, DNA repair protein RecO; Involved in DNA repair and RecF pathway recombination
  4. Several other gene products are presumably regulatory proteins:

    • 16_A0310, transcriptional regulator, GntR-family; Transcription
    • H16_B2215, HTH lysR-type domain-containing protein; helix turn helix DNA binding motif

5.1.0.2 Strains enriched on formate

Here, the obvious question with biotechnological importance is to find mutants with higher (specific) tolerance to formate, regardless if it’s pulsed or continuously supplied. We explicitly exclude mutants beneficial for other growth conditions too, which are dealt with in the next section. The clustering results of the previous section could not identify formate-specific mutants enriched after 8 generations. We need to lower the filter threshold to detect genes that have a more subtle enrichment (fitness >= 2 for formate, and <= 1 for all other conditions) after 16 generations.

list_fitness_formate <- df_fitness_summary %>%
  group_by(locus_tag, substrate, time) %>%
  mutate(formate_hit = case_when(
    substrate == "formate" & max(norm_gene_fitness_median) >= 2 ~ TRUE,
    substrate != "formate" & max(norm_gene_fitness_median) <= 1 ~ TRUE,
    TRUE ~ FALSE)
  ) %>% group_by(locus_tag, time) %>%
  filter(all(formate_hit), n() == 6) %>% 
  pull(locus_tag) %>% unique

df_fitness_summary %>%
  filter(time == 8, locus_tag %in% list_fitness_formate) %>%
  barchart_fitness() %>%
  print(position = c(0,0,0.6,1), more = TRUE)

df_fitness_summary %>%
  filter(locus_tag %in% list_fitness_formate) %>%
  linechart_fitness() %>%
  print(position = c(0.58,0,1,1))

Mechanisms enabling faster growth on formate

According to STRING DB there are no obvious relationships between these genes except for the pair PHG388 (pcaI, Oxoadipate CoA transferase alpha subunit) and H16_B1583 (pcaD, 3-Oxoadipate enol-lactone hydrolase). Both genes are part of the catechol degradation pathway that is itself a larger part of benzene degradation. The gene products catalyze two sequential reactions 3-oxoadipate-enol-lactone –> 3-oxoadipate (pcaD) –> 3-oxoadipyl-CoA (pcaI). The next reaction catalyzed by four isoenzymes makes succinyl-CoA which ends up in the Citrate cycle. How the KO of this pathway would improve tolerance to formate is unclear.

5.1.0.3 Strains enriched on succinate

This is similar to the analysis above for formate, with the exception that many mutants associated with higher fitness in succinate conditions enrich only after 16 generations, and only in the continuous regime. This seems be an artifact of read compression. We therefore use the same thresholds as for formate (fitness >= 2 for succinate, and <= 1 for all other conditions) but after 8 instead of 16 generations.

list_fitness_succinate <- df_fitness_summary %>%
  filter(time == 8) %>%
  group_by(locus_tag, substrate) %>%
  mutate(succinate_hit = case_when(
    substrate == "succinate" & max(norm_gene_fitness_median) >= 2 ~ TRUE,
    substrate != "succinate" & max(norm_gene_fitness_median) <= 1 ~ TRUE,
    TRUE ~ FALSE)
  ) %>% group_by(locus_tag) %>%
  filter(all(succinate_hit), n() == 6, locus_tag != "H16_A2912") %>% 
  pull(locus_tag) %>% unique

df_fitness_summary %>%
  filter(time == 8, locus_tag %in% list_fitness_succinate) %>%
  barchart_fitness() %>%
  print(position = c(0,0,0.6,1), more = TRUE)

df_fitness_summary %>%
  filter(locus_tag %in% list_fitness_succinate) %>%
  linechart_fitness() %>%
  print(position = c(0.58,0,1,1))

Mechanisms enabling faster growth on succinate

A group of five highly related genes shows significant enrichment in (continuous) succinate: H16_2096 (dppA1), H16_A2406 (dppD2), H16_A2407 (dppC2), H16_A2408 (dppB2), and H16_A2409, dppA2. Alternative names are yejABCDE. All of these proteins are part of an ABC transporter of the PepT family. Quote from PFAM about this type of protein/domain: “All characterised members appear to be involved in the transport of oligopeptides or dipeptides. Some are important for sporulation or antibiotic resistance. Some dipeptide transporters also act on the heme precursor delta-aminolevulinic acid. The enrichment seems to be highly reproducible: each of these genes has 7 to 15 different insertion mutants, all showing the same average pattern. At least two more genes are also involved in (amino acid?) transport, H16_A3284 and H16_A2521. The latter one is a”D-amino acid transferase (D-AAT), required by bacteria to catalyse the synthesis of D-glutamic acid and D-alanine" (PFAM). Supposed mechanism of enrichment: Growth on succinate could lead to a shortage/imbalance of certain amino acids, and investment in uptake might give a growth advantage. The growth advantage was only visible in the continuous condition, and is probably not universal for succinate. No gene/mutant was enriched in both succinate growth regimes.

5.1.0.4 Strains enriched in several conditions

A wealth of genes/mutants are enriched in multiple mainly pulsed growth conditions (cluster 4).

df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 4))) %>%
  filter(time == 8) %>%
  barchart_fitness() %>%
  print(position = c(0,0,0.6,1), more = TRUE)

df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 4))) %>%
  linechart_fitness() %>%
  print(position = c(0.58,0,1,1))

To learn more about functional relationships between enriched genes/mutants, we can submit the gene list to the STRING interaction database and retrieve a network of probable interactions.

library(ggraph)
library(tidygraph)

Attaching package: ‘tidygraph’

The following object is masked from ‘package:stats’:

    filter
# function retrieve network interaction data from STRING DB
# separate gene IDs by "%0d"; species/taxon ID for Cupriavidus necator H16: 381666
# (see https://string-db.org/cgi/organisms)
retrieve_STRING <- function(gene_ID, taxon_ID, min_score = 0000, ref = NULL) {
  gene_list <- paste(gene_ID, collapse = "%0d")
  string_graph <- paste0(
    "https://string-db.org/api/tsv/network?identifiers=", 
    gene_list, "&species=", taxon_ID, "&required_score=", min_score) %>%
  read_tsv(col_types = cols()) %>%
  mutate(across(matches("stringId"), function(x) gsub(paste0(taxon_ID, "."), "", x))) %>%
  as_tbl_graph()
  if (!is.null(ref)) {
    left_join(string_graph, ref, by = "name")
  } else {
    string_graph
  }
}

# function to space labels in certain distance to circle
nudge_circle <- function(n, size_x = 0.15, size_y = 0.1) {
  nudge_x = size_x * -cos(seq(0.5*pi, 2.5*pi, length.out = n))
  nudge_y = size_y * sin(seq(0.5*pi, 2.5*pi, length.out = n))
  list(x = nudge_x, y = nudge_y)
}
graph_all_enrich <- retrieve_STRING(
  gene_ID = names(subset(list_enriched, list_enriched == 4)),
  taxon_ID = 381666,
  ref = rename(df_ref, name = locus_tag)
)

graph_all_enrich %>% arrange(COG_Process) %>% activate(edges) %>% 
  filter(score >= 0.4) %>%
  ggraph(layout = 'linear', circular = TRUE) +
  geom_edge_arc(colour = grey(0.6, 0.5), aes(width = score)) + 
  geom_node_point(aes(colour = COG_Process), size = 5) +
  geom_node_text(nudge_x = nudge_circle(22)$x, nudge_y = nudge_circle(22)$y, 
    size = 3.2, aes(label = eggNOG_name, colour = COG_Process)) +
  scale_edge_width(range = c(0.2, 2)) +
  theme_graph(background = "white", foreground = grey(0.5),
    plot_margin = margin(10, 10, 10, 10))

Mechanisms enabling faster growth in several conditions

The picture for these strains is more clear then for the fructose-enriched strains. 13 out of 27 enriched genes (48%), among them some of the most enriched, are directly involved in hydrogenase activity, i.e. hydrogen oxygenation (hoxA, hoxF, hoxU, hoxX, hypA, hypB, hypC, hypD hypE, hypF, hyaA, hyaB, hupH). The mechanism of enabling faster growth by KO of hydrogenases is unknown.

The second theme is enrichment of ptsI (H16_A0326) and ptsH (H16_A0325), together the entire functional unit of the PTS system, the PEP-dependent sugar phosphotransferase system (sugar PTS). “This major carbohydrate active-transport system catalyzes the phosphorylation of incoming sugar substrates concomitantly with their translocation across the cell membrane. Enzyme I transfers the phosphoryl group from (PEP) to the phosphoryl carrier protein (HPr).” Why does the KO of PTS system provides a growth benefit in in all conditions with almost equal contribution? Must be related to energy metabolism, because neither fructose (ABC transporter), succinate and def not formate are transported via a PTS.

Other interesting mechanisms: KO of RNA polymerase sigma factor RpoS (H16_A2373) seems to have a beneficial effect on steady state growth, as well as other stress related transcription factors like cold shock protein capB (H16_B2205), and scoF (H16_B0002).

The gene H16_B2043 is by far most abundant (by reads) and most enriched (by fitness) single mutant in almost all conditions. What is the mechanism? There is almost nothing known about this gene except that it probably has cyclic-guanylate-specific phosphodiesterase activity. This enzyme has, according to PFAM, two domains, the diguanylate cyclase (GGDEF) and the EAL domain. The first one synthesizes cyclic di-GMP Ryjenkov et al., J Bact, 2005. The second domain can act as phosphodiesterase and cleave cyclic cAMP or cGMP. Both compounds play a role in signaling, cAMP for example as the hormone in catabolite repression. According to the discussion in the linked paper, “Mutations in the GGDEF domain proteins or overexpression of such proteins affect exopolysaccharide synthesis […] and formation of biofilms. In C. crescentus, flagellum ejection, which is required for the switch from motile to sessile lifestyle, is impaired in a mutated GGDEF domain”. The underlying mechanism could be disrupted biofilm formation, higher flagellum activity, and thus non-settling phenotype in the bioreactor. Alternatively the mutation affects cAMP formation and disrupts or enhances catabolite repression (more likely that disrupted catabolite repression is beneficial).

5.1.0.5 Sanity checks

All above results are based on fitness score, which is not much more than a normalized log2 fold change of read count over time. We can compare if the mutants with extremely high fitness scores are also the ones that are super-abundant at the final time point, ie.e. have an average read count of >= 500,000 after 16 generations. In fact, only 3 genes have such an extreme read count, but 9 have more than 100,000 reads.

df_fitness %>% filter(time == 16, locus_tag %in% (subset(list_enriched, list_enriched %in% c(3,4)) %>% names)) %>%
  group_by(locus_tag) %>% summarize(max_counts = max(counts), max_fitness = max(norm_gene_fitness)) %>%
  arrange(desc(max_counts))

On the other hand, we can check which genes have an extremely high read count (>= 500,000 in any condition) and see if this correlates with high fitness score. It does partly. 3 out of 7 highly abundant gene mutants enrich extremely over time. The other enrich too, but only 2^1 = 2 to 2^3.5 = 11 times. These non-enriching super-abundant mutants are:

  • H16_A0774 - cphA, Cyanophycin synthase. Supposed mechanism: enriches in LB medium as it does not turn Asp into cyanophycin. No benefit on minimal medium.
  • H16_B2570 - fecA, Outer membrane receptor for Fe(III). Supposed mechanism: regulator activity together with fecR, but also transmembrane transporter for siderophores. KO reduces sensitivty to excess iron??
  • H16_A3145 - Conserved protein/domain typically associated with flavoprotein oxygenases, DIM6/NTAB family. Supposed mechanism: unknown.
df_fitness %>% filter(time == 16, counts > 5*10^5) %>%
  group_by(locus_tag) %>% summarize(max_counts = max(counts), max_fitness = max(norm_gene_fitness)) %>%
  arrange(desc(max_counts))

5.2 Depletion of mutants over time (reduced fitness)

5.2.0.1 Depletion on formate

Several genes/mutants are depleted over time depending on growth condition. Most interesting in this context are cluster 1 and 2, depletion only on formate, or in all conditions, respectively. Not surprisingly, formate-specific genes are highly depleted during growth on formate, but not cbb (Calvin cycle) genes.

df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 1))) %>%
  filter(time == 16) %>%
  barchart_fitness() %>%
  print(position = c(0,0,0.6,1), more = TRUE)

df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 1))) %>%
  linechart_fitness() %>%
  print(position = c(0.58,0,1,1))

What is the role of formate-essential genes? Are there patterns that emerge? We can submit the depleted gene list to STRING DB and retrieve a possible network of interactions. The network shows immediately that there are many high-scoring interactions between the genes, particularly the soluble FDH genes are sticking out as one cluster (fdsABDG) and their transcriptional regulator fdsR. Then the molybdenum cofactor processing proteins moaCDE, moeA, mobA, mog. And a range of cytochrome genes responsible for acceptance and transport of of electrons from formate to the ETC in cytoplasmic membrane (petABC, ccoGNOP, cyc). It is also intersting what is NOT essential on formate, such as no cbb genes except the master transcriptional regulator cbbR. The reason for this must be the redundancy of all cbb genes (two operons).

graph_formate_depl <- retrieve_STRING(
  gene_ID = names(subset(list_enriched, list_enriched == 1)),
  taxon_ID = 381666,
  ref = rename(df_ref, name = locus_tag)
)

graph_formate_depl %>% arrange(COG_Process) %>% activate(edges) %>% 
  filter(score >= 0.4) %>%
  ggraph(layout = 'linear', circular = TRUE) +
  geom_edge_arc(colour = grey(0.6, 0.5), aes(width = score)) + 
  geom_node_point(aes(colour = COG_Process), size = 5) +
  geom_node_text(nudge_x = nudge_circle(34)$x, nudge_y = nudge_circle(34)$y,
    size = 3.2, aes(label = eggNOG_name, colour = COG_Process)) +
  scale_edge_width(range = c(0.2, 2)) +
  theme_graph(background = "white", foreground = grey(0.5),
    plot_margin = margin(10, 10, 10, 10))

Let’s create a pathway/operon based overview about the formate-essential genes. We group the genes into three categories, and also include the genes/proteins that belong to each group but were not depleted or enriched. The three groups are:

  • molybdenum cofactor biosynthesis
    • modAB = molybdenum transporter
    • moaACDE = molybdopterin backbone biosynthesis
    • moeA = molybdopterin molybdenum transferase
    • mobAB = molybdenum cofactor guanylyltransferase,
    • mog = molybdopterin biosynthesis protein (role?)
  • formate dehydrogenase
    • soluble (fds, fwd)
    • membrane-bound (fdh, fdo)
  • electron transport chain
    • petABC (aka qcrABC, H16_A3398, H16_A3397, H16_A3396) = cycochrome bc1 (reductase)
    • cyc (H16_A3451), cytochrome c553. Other cytochrome c553 genes to include according to uniprot: H16_A0830, H16_A0846, H16_B1452, H16_A3576, H16_A1121, H16_A1120
    • ccoGNOP (H16_A2316, H16_A2317, H16_A2318, H16_A2319), cytochrome C oxidase (cbb3 type)
    • VP2641 (H16_A1031) = Membrane spanning protein, RDD family. PFAM: unknown function, potential transporter? Interacts with cytochrome c553 and qcrC, cytochrome reductase. It’s depletion implicates this protein in formate-specific e- transport in the membrane.

We can prove that these complexes are mostly or only required during growth on formate. Can we exclude other cytochrome reductase and cytochrome oxidase complexes, or other cytochrome c553 proteins that act as e- acceptors from FDH? This type of information can be used to carve out the specific path of e- from formate through the ETC. If other cytochrome oxidase/reductase proteins are essential in all conditions, it can not be excluded that they are not required for growth on formate. We therefore also include alternative cytochrome oxidases, like ctaABCDEG and coxMNOP, cyo and cyd operons.

To summarize, according to the essentiality data on formate, electrons from formate are transported through the membrane the following way:

S-FDH --> NADH --|
                 |
M-FDH --> Cytochrome C reductase (bc1 complex III) --> Cytochrome c553 () --> Cytochrome C oxidase (cbb3 complex IV) --> O2

What is known in literature about electron transport from formate to membrane?

  • main FDH is S-FDH both in terms of protein abundance, inducibility, and essentiality
  • cytochrome reductase bc1 complex (qcrABC) is particularly important during growth on formate (this study)
  • it is also beneficial during heterotrophic growth (slightly reduced fitness)
  • inhibition of this complex reduces growth in heterotrophy and lithotrophy (see Cramm 2006)
  • qcrC seems to be a cytochrome C that is mobile (?) and transports e- in membrane
  • bc1 complex functions in oxygen-limited respiration, therefore maybe no growth on formate and nitrate respiration?
  • bc1 complex deletion mutant was completely deficient in oxygen-limited growth on nitrate (Garg et al., 2018).
  • bc1 complex accepts reduced quinols as e- donor, pumps 4 H+ /2 e− via proton-motive Q cycle (quinol:cyt c oxidoreduction), and releases reduced cytochrome C e- carriers
  • Cytochrome c553 proteins are e- shuttles in membrane, between bc1 and respiration complexes (?)
  • cbb3-type cytochrome oxidase special member of the heme-copper oxidase family
  • “in Bradyrhizobium japonicum, this enzyme operates at extremely low partial pressures of oxygen” [Preisig et al., 1996]
# generalized function to plot (small) heatmaps
heatmap_fitness <- function(data, key = TRUE) {
  
  # make some small adjustments to optical appeaarance
  mutate(data, gene_name = factor(gene_name, unique(gene_name))) %>%
  mutate(condition = if_else(condition == "pulse", "P", "C")) %>%
  mutate(cond = paste0(substrate, " - ", condition)) %>%
  group_by(gene_name) %>% arrange(cond) %>%
  mutate(norm_gene_fitness_median = replace(norm_gene_fitness_median, norm_gene_fitness_median < -6, -6)) %>%

  # plot heatmap
  levelplot(norm_gene_fitness_median ~ gene_name * factor(cond), .,
    par.settings = custom.colorblind(), colorkey = key,
    col.regions = colorRampPalette(heat_cols)(16),
    at = seq(-6, 6, 1), aspect = "fill",
    xlab = "", ylab = "", scales = list(cex = 0.7, x = list(rot = 90)),
    panel = function(x, y, z, ...) {
      panel.levelplot(x, y, z, ...)
      panel.abline(v = seq_along(unique(x))+0.5, 
        h = seq_along(unique(y))+0.5, col = "white", lwd = 2)
    }
  )
}
plot_formate_fit_1 <- df_fitness_summary %>% filter(time == 8, grepl("mo[aedbg]", gene_name)) %>%
  mutate(gene_name = stri_extract_first_regex(gene_name, "mo[aedbg]([A-Z0-9]+)?")) %>%
  ungroup %>% arrange(gene_name) %>%
  heatmap_fitness(key = FALSE)

plot_formate_fit_2 <- df_fitness_summary %>% filter(time == 8, grepl("fd[swho]", gene_name)) %>%
  mutate(gene_name = stri_extract_first_regex(gene_name, "fd[swho][A-Z0-9]+")) %>%
  mutate(gene_name = stri_replace_first_regex(gene_name, "fdhD$", "fdsC")) %>%
  ungroup %>% arrange(gene_name) %>%
  heatmap_fitness(key = FALSE)

plot_formate_fit_3 <- df_fitness_summary %>% filter(time == 8, locus_tag %in% c("H16_A3451", "H16_A0830",
    "H16_A0846", "16_B1452", "H16_A3576", "H16_A1121", "H16_A1120", "H16_A1031") | 
    grepl("qcr|cco|cta|cox[MNOPQ] |cyo|cyd", gene_name)) %>%
  mutate(gene_name = stri_extract_first_regex(gene_name, "H16_[AB][0-9]+|(qcr|cco|cta|cox|cyo|cyd)([A-Z0-9]+)?")) %>%
  ungroup %>% arrange(
    factor(str_sub(gene_name, 1, 3), c("cyo", "cyd", "qcr", "H16", "cox", "cta", "cco")), 
    str_sub(gene_name, 4, 5)) %>%
  heatmap_fitness()

print(plot_formate_fit_1, position = c(0, 0.475, 0.5, 1), more = TRUE)
print(plot_formate_fit_2, position = c(0.475, 0.49, 0.95, 1), more = TRUE)
print(plot_formate_fit_3, position = c(0, 0, 1, 0.57))

5.2.0.2 Depletion in all conditions

Which genes are depleted in all conditions (cluster 2)? Here we see the effect of the read compression, such that many genes seem to “recover” from initial depletion and enrich from 8 to 16 generation time point. This effect is only apparent for the continuous conditions and with very high probability related to the distortion of read counts by super-enriching mutants. The 8 generation time point should be considered as more reliable in those cases.

df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 2))) %>%
  filter(time == 16) %>%
  barchart_fitness() %>%
  print(position = c(0,0,0.6,1), more = TRUE)

df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 2))) %>%
  linechart_fitness() %>%
  print(position = c(0.58,0,1,1))

graph_all_depl <- retrieve_STRING(
  gene_ID = names(subset(list_enriched, list_enriched == 2)),
  taxon_ID = 381666,
  ref = rename(df_ref, name = locus_tag)
)

graph_all_depl %>% arrange(COG_Process) %>% activate(edges) %>% 
  filter(score >= 0.4) %>%
  ggraph(layout = 'linear', circular = TRUE) +
  geom_edge_arc(colour = grey(0.6, 0.5), aes(width = score)) + 
  geom_node_point(aes(colour = COG_Process), size = 5) +
  geom_node_text(nudge_x = nudge_circle(38)$x, nudge_y = nudge_circle(38)$y,
    size = 3.2, aes(label = eggNOG_name, colour = COG_Process)) +
  scale_edge_width(range = c(0.2, 2)) +
  theme_graph(background = "white", foreground = grey(0.5),
    plot_margin = margin(10, 10, 10, 10))

5.3 Fitness of cbb genes

One of the most interesting features of C. necator is the cbb operon encoding enzymes for PPP and Calvin cycle (e.g. Rubisco). Two almost identical copies of the operon are present in the genome, one on the pHG1 megaplasmid and one one chromosome 2. The latter one has two additional genes, a putative cbbB formate dehydrogenase, and cbbR, the transcriptional regulator. To visualize general trends for all genes of a pathway, we can use a beeswarm plot broken down by condition.

plot_cbb_fitness_bee <- df_fitness_summary %>% ungroup %>%
  filter(time == 8, str_detect(gene_name, "cbb.|cfx.|cbx."), condition != "pulse") %>%
  mutate(gene_name = str_replace(gene_name, "cfx|cbx", "cbb")) %>%
  
  xyplot(norm_gene_fitness_median ~ factor(substrate), .,
    groups = factor(substrate), par.settings = custom.colorblind(col = subs_col[c(1,3,5)]),
    labels = stri_extract_first(.[["gene_name"]], regex = "cbb.|cfx.|cbx."),
    xlab = "", ylab = "fitness (8 generations)", ylim = c(-7.5, 2.5),
    scales = list(alternating = FALSE),
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.abline(h = 0, lwd = 1.5, col = grey(0.7), lty = 2)
      panel.abline(h = -3, lwd = 1.5, col = grey(0.7), lty = 3)
      panel.text(3.1, -3.5, labels = "significant", col = grey(0.6), cex = 0.65)
      coords <- panel.beeswarm(x, y, bin_y = TRUE, breaks_y = seq(-10, 4, 0.4), 
        return_coords = TRUE, ...)
      panel.directlabel(x = coords$x, y = coords$y, y_boundary = c(-10, -4), 
        cex = 0.65, draw_box = TRUE, box_line = TRUE, ...)
      panel.key(labels = c("formate", "fructose", "succinate"), cex = 0.8, 
        corner = c(0.98, 0.05))
    }
  )

print(plot_cbb_fitness_bee)

Only one gene is really depleted and therefore essential on the two formate conditions, cbbR. At least two more genes for Rubisco and PRUK should be essential on formate as they have no counterpart on chromosome 1. They show no significant depletion. This proves that the two cbb operons can fully complement each other, except for cbbR, the one gene present in only copy (chromosome 2).

Note: Some cbb mutants for chromosome 2 seem to have a more positive fitness score compared to their counter parts on the pHG1 megaplasmid. Some genes with higher fitness show sudden leaps from 8 to 16 hours. A look up in the summary table revealed that they have only 1 or 2 mutants quantified per gene, which makes the results less reliable, particularly for the continuous conditions on fructose with low read count at 16 generationa. Results for these mutants have to be interpreted with care. The folowing plot focuses on the pHG1 cbb copies which have on average higher coverage with individual mutants.

# single time series per gene
plot_cbb_fitness_select <- df_fitness %>% ungroup %>%
  left_join(select(df_ref, locus_tag, gene_name), by = "locus_tag") %>%
  filter(str_detect(gene_name, "(cbb|cfx|cbx)[PSLKGAT]P|cbbR"), condition != "pulse") %>%
  mutate(
    gene_name = case_when(
      gene_name == "cfxR cbbR H16_B1396" ~ "cbbR (REG)",
      gene_name == "cfxP cbbPP PHG421" ~ "cbbP (PRUK)",
      gene_name == "cbxSP cbbS cbbSP cfxSP rbcS PHG426" ~ "cbbS (RBPC)",
      gene_name == "cbbL2 cbbL cbxLP cfxLP PHG427" ~ "cbbL (RBPC)",
      gene_name == "cbbKP PHG417" ~ "cbbK (PGK)",
      gene_name == "cbbGP PHG418" ~ "cbbG (GAPDH)",
      gene_name == "cbbAP PHG416" ~ "cbbA (FBA)",
      gene_name == "cbbTP PHG420" ~ "cbbT (TKT)",
      #gene_name == "cbbZP PHG419" ~ "cbbZ (PGP)",
      #gene_name == "cbxXP cfxXP PHG425" ~ "cbbX (RBCA)",
      #gene_name == "cbbEP cfxE PHG423" ~ "cbbE (RPE)"
    ) %>% factor(., unique(.)[c(1,6,7,8,3,4,2,5)])
  ) %>% ungroup() %>%
  
  xyplot(norm_gene_fitness ~ factor(time) | gene_name, .,
    groups = substrate, par.settings = custom.lattice(col = subs_col[c(1,3,5)]),
    type = c("p", "l"), xlab = "generations", ylab = "fitness",
    ylim = c(-7.9, 2.5), layout = c(4, 2), between = list(x = 0.5, y = 0.5),
    scales = list(alternating = FALSE), lwd = 2.5, as.table = TRUE,
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.errbars(x, y, ewidth = 0, ...)
    }
  )

print(plot_cbb_fitness_select)

5.4 Fitness of phosphoglycolate salvage mutants

Phosphoglycolate salvage, also known as photorespiration, is the process of recycling the product of Rubisco’s oxygenation reaction, 2-phoshoglycolate (2-PGly). Depending on the local CO2 concentration around Rubisco, the flux towards 2-PGly becomes higher with decreasing CO2 concentration. Typically, flux towards 2-PGly increases signifcantly between … and … ppm CO2 [Reference?]. The following plot shows fitness of the most important 2-PGly salvage KO mutants.

The key enzymes for the universal ‘upper part’ of 2-PGly salvage are 2-PGly-phosphatase (cbbZ2, cbbZP) and glycolate dehydrogenase GDH (GlcDEF, ). The glycerate pathway is the dominant route for the ‘lower part’ of 2-PGly salvage with four enzymes being strongly upregulated in low CO2: glyoxylate carboligase GLXCL (H16_A3598), hydroxypyruvate isomerase HPYRI (H16_A3599), tartronate semialdehyde reductase TRSARr (H16_A3600), and glycerate kinase GLYCK (H16_B0612). The first three of these enzymes are encoded in one operon.

df_fitness %>% ungroup %>%
  left_join(select(df_ref, locus_tag, gene_name), by = "locus_tag") %>%
  filter(
    locus_tag %in% c("H16_B1387", "PHG419", "H16_A3094", "H16_A3096", "H16_A3097",
      "H16_A3598", "H16_A3599", "H16_A3600", "H16_B0612", "H16_A0640", "H16_A0641",
      "H16_A0642", "H16_A0644"),
    condition == "pulse") %>%
  mutate(gene_name = str_replace(gene_name, " cbbZ2", "")) %>%
  
  xyplot(norm_gene_fitness ~ factor(time) | factor(gene_name, rev(unique(gene_name))), .,
    groups = substrate, par.settings = custom.lattice(col = subs_col[c(1,3,5)]),
    type = c("p", "l"), xlab = "generations", ylab = "fitness",
    ylim = c(-7.9, 2.5), layout = c(5, 3), between = list(x = 0.5, y = 0.5),
    scales = list(alternating = FALSE), lwd = 2.5, as.table = TRUE,
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.errbars(x, y, ewidth = 0, ...)
      panel.key(..., corner = c(0.05, 0.05))
    }
  )

Possible explanation: local CO2 concentration is high during formatotrophic growth

Question: What is steady-state CO2/HCO3-concentration in medium?

Known cultivation parameters for highest growth rate:

  • medium concentration: 1.5 g/L formate
  • growth rate/dilution rate: 0.25 h^-1
  • residual formate at µ = 0.25 h^-1: 0.25 g L^-1
  • 25% of CO2 from formate is fixed, 75% is emitted (C. necator GEM)

Answer:

  • formate consumption rate: Q_HCOO3 = (1.5 g L^-1 - 0.25 g L^-1) * 0.25 h^-1 = 0.31 g formate L^-1 h^-1
  • estimation of CO2 production rate: P_CO2 = Q_HCOO3 * MW_HCOO3 = 0.31 g L^-1 h^-1 / 48 g mol^-1 = 6.5 mmol L^-1 h^-1
  • estimation of CO2 consumption rate: Q_CO2 = P_CO2 * 0.25 = 1.625 mmol L^-1 h^-1
  • final CO2 production rate: P_CO2 - Q_CO2 = 4.875 mmol L^-1 h^-1

Unfortunately it is not possible to determine the steady-state concentration from the CO2 emission rate. However we can compare this rate with the growth-limiting CO2 supply for autotrophic growth (ambient air, Claassens et al., 2020). The authors used:

  • 0.04% CO2 or 10% CO2, 6.25 L/min = 375 L/h
  • 0.7L culture volume
  • density of CO2 = 1.836 g/L at 25*C
  • CO2 supply rate = 375 L h^-1 * 0.0004 / 0.7 L culture * 1.836 g L^-1 = 0.39 g L^-1 h^-1 = 8.5 mmol L^-1 h^-1

5.5 Fitness of central metabolism genes

In order to obtain an overview about fitness contribution of central carbon metabolism enzymes, we can select the same list of enzymes (reactions) that was used for Figure 3 of the manuscript. In this figure, absolute protein abundance, relative change in protein abundance, utilization was compared. The names of enzymatic reactions are taken from the genome scale modeland have to be mapped to gene IDs.

df_ccm_enzymes <- list(
  `CBB cycle` = c("PGK", "TPI", "GAPD", "FBA", "FBP", "TKT1", "TKT2", "TAh", "RPE", "RPI", "PRUK", "RBPC", "FDH"),
  `Entner-Doudoroff pathway` = c("PGI", "G6PDH2r", "PGL", "EDD", "EDA"),
  `Pyruvate metabolism` = c("PGM", "ENO", "PYK", "PDH1", "PDH2", "PDH3", "PC", "PPC", "ME1", "ME2", "CS"),
  `TCA cycle` = c("ACONT1", "ACONT2", "ICDHx", "ICDHyr", "AKGDH", "SUCOAS", "SUCDi", "SUCD1", "FUM", "MDH", "MALS")
) %>% enframe("model_group_short", "reaction_id") %>% 
unnest(reaction_id)

df_model_reactions <- read_csv("../data/input/model_reactions.csv", col_types = cols()) %>%
  select(reaction_id, reaction_name, genes) %>% separate_rows(genes, sep = ", ") %>%
  rename(locus_tag = genes) %>%
  right_join(df_ccm_enzymes, by = "reaction_id") %>%
  # remove 1 gene from TCA which is also listed in PYR (pdh3 / dihydrolipoamide dehydrogenase)
  filter(!(locus_tag == "H16_A1377" & model_group_short == "TCA"))
Warning: Missing column names filled in: 'X1' [1]

The beeswarm plot shows that some genes are conditionally essential, albeit not many. Trends that are consistent between time points and growth regimes are: - depletion of fdsABDG aka the soluble FDH in both formate conditions (described previously in detail) - depletion of eda, edd and pgl in fructose (all consecutive reactions of the ED pw), to a lower extent in formate too? - ppc (PEP carbooxylase) in formate, also predicted by FBA/RBA to carry major flux in formate) - pdhA (pyruvate dehydrogenase) on fructose and succinate - maeA malic enzyme - nothing from TCA cycle

plot_ccm_bee <- df_fitness_summary %>%
  inner_join(df_model_reactions, by = "locus_tag") %>%
  ungroup %>% filter(time == 8, condition != "pulse") %>%
  mutate(gene_name = stri_extract_first(gene_name, regex = "[a-zA-Z0-9]+")) %>%

  xyplot(norm_gene_fitness_median ~ factor(substrate) | model_group_short, .,
    groups = substrate, par.settings = custom.colorblind(col = subs_col[c(1,3,5)]),
    labels = .[["gene_name"]], xlab = "", ylab = "fitness (8 generations)", 
    ylim = c(-7.2, 1.8), as.table = TRUE, between = list(x = 0.5, y = 0.5),
    scales = list(alternating = FALSE), layout = c(4, 1),
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.abline(h = 0, lwd = 1.5, col = grey(0.7), lty = 2)
      panel.abline(h = -3, lwd = 1.5, col = grey(0.7), lty = 3)
      panel.text(3, -3.5, labels = "significant", col = grey(0.6), cex = 0.65)
      coords <- panel.beeswarm(x, y, bin_y = TRUE, breaks_y = seq(-10, 4, 0.4),
        return_coords = TRUE, ...)
      panel.directlabel(coords$x, coords$y, y_boundary = c(-10, -3), cex = 0.65,
        draw_box = TRUE, box_line = TRUE, box_scale = -0.01, ...)
    }
  )

print(plot_ccm_bee)

Looking at the single genes as a time series shows the same picture. Some genes however show a sudden “leap” in fitness from 8 to 16 generations even after previous depletion. This is an artifact caused by normalization of fitness by the overall read distribution. These genes have 0 reads quantified but the algorithm still determines if 0 reads can be expected for genes with low fitness and attributes this fitness to them. However if the read distribution becomes extremely skewed towards few high abundant mutants, the fitness calculation becomes impossible for low-abundant mutants. We therefore focus on the 8 generation time point.

# single time series per gene
plot_ccm_fitness_select <- df_fitness %>% ungroup %>%
  left_join(select(df_ref, locus_tag, gene_name), by = "locus_tag") %>%
  inner_join(df_model_reactions, by = "locus_tag") %>%
  filter(str_detect(gene_name, "fds|pgl|pgi|edd1|eda|ppc|pdh|mae")) %>%
  mutate(condition = paste0(substrate, " - ", toupper(substr(condition, 1, 1)))) %>%
  mutate(gene_name = stri_extract_first(gene_name, regex = "[a-zA-Z0-9]+")) %>%
  
  xyplot(norm_gene_fitness ~ factor(time) | gene_name, .,
    groups = condition, as.table = TRUE,
    par.settings = custom.colorblind(col = subs_col), type = c("p", "l"),
    xlab = "generations", ylab = "fitness",
    ylim = c(-8.5, 4), layout = c(7, 2), 
    between = list(x = 0.5, y = 0.5),
    auto.key = list(columns = 3, cex = 0.7),
    scales = list(alternating = FALSE), lwd = 2,
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.errbars(x, y, ewidth = 0, ...)
    }
  )

print(plot_ccm_fitness_select)

5.6 Table with conditionally essential reactions in central metabolism

df_ccm_essential <- df_model_reactions %>%
  filter(reaction_id %in% c("FDH", "PPC", "PGL", "EDD", "EDA", "PDH1", "ME1", "ME2")) %>%
  left_join(select(df_ref, locus_tag, gene_name), by = "locus_tag") %>%
  left_join(select(df_fitness_comp, matches("locus|continuous")), by = "locus_tag") %>%
  arrange(model_group_short, reaction_id, locus_tag) %>%
  mutate(across(matches("continuous_"), round, 1)) %>%
  mutate(gene_name = str_remove(gene_name, " ?H16\\_[AB0-9]+")) %>%
  rename_with(function(x) gsub("continuous_", "", x), matches("continuous_")) %>%
  rename(Pathway = model_group_short, Reaction = reaction_id,
    `Reaction name` = reaction_name, `Gene ID` = locus_tag, `Gene name` = gene_name) %>%
  select(c(4,1,2,3,5,6,7,8))

print(df_ccm_essential)
write_csv(df_ccm_essential, "../data/output/essentiality_ccm.csv")

6 Export gene fitness data

Save an Rdata file with fitness data for all conditions that finally were used in the manuscript by Jahn et al., 2021. This is used in the ShinyLib web app.

Cupriavidus_BarSeq_2021 <- df_fitness %>%
  ungroup %>% select(-ID) %>%
  unite(condition, substrate, condition, sep = " - ") %>%
  mutate(fold_change = 2^log2FC) %>%
  rename(fitness_score = norm_gene_fitness, t_stat = t, timepoint = time) %>%
  left_join(select(df_ref, locus_tag, new_locus_tag, uniprot, gene_name, protein_name, System, 
    Process, Pathway), by = "locus_tag")

save(Cupriavidus_BarSeq_2021, file = "../../ShinyLib/data/Cupriavidus_BarSeq_2021.Rdata")
---
title: "Analysis of competition experiments with a barcoded transposon library"
date: "`r format(Sys.time(), '%d %B, %Y')`"
author: "Michael Jahn"
output:
  html_notebook: 
    theme: cosmo
    toc: yes
    number_sections: true
---

# Description

This R notebook is a bioinformatics pipeline to analyze fitness data obtained from a barcoded transposon library in *Ralstonia eutropha* a.k.a. *Cupriavidus necator*. For background and details regarding the method, see [Wetmore at al., mBio, 2015](https://mbio.asm.org/content/6/3/e00306-15) and [Price et al., Nature, 2018](http://www.nature.com/articles/s41586-018-0124-0)).


# Libraries

```{r, message = FALSE}
# optionally install repos from github
# devtools::install_github("m-jahn/lattice-tools")
# devtools::install_github("m-jahn/R-tools")
library(lattice)
library(latticeExtra)
library(latticetools)
library(tidyverse)
library(dendextend)
library(Rtools)
library(colorspace)
library(stringi)
```


# Overview of barcode/transposon read counts

## Data import and processing

Read in the main data tables with A) reads per barcode and sample ('pool counts'), and B) the fitness tables. Tables were obtained by processing sequencing data with a custom [BarSeq pipeline](https://github.com/m-jahn/rebar). The 32 generation sequencing samples are removed due to the low read count in the `continuous` samples.

```{r, message = FALSE}
# import barseq counts data in wide format and reshape to long format
df_counts_frc <- read_tsv("../../../rebar/data/20201222_barseq_frc/results/result.poolcount") %>%
  select(!matches("32gen|_32_")) %>%
  pivot_longer(
    cols = !all_of(c("barcode", "rcbarcode", "scaffold", "strand", "pos")), 
    names_to = "sample", values_to = "n_reads")

df_counts_suc <- read_tsv("../../../rebar/data/20210407_barseq_suc_for/results/result.poolcount") %>%
  pivot_longer(
    cols = !all_of(c("barcode", "rcbarcode", "scaffold", "strand", "pos")), 
    names_to = "sample", values_to = "n_reads")

# merge barcode counts tables
df_counts <- bind_rows(df_counts_frc, df_counts_suc)

# import fitness data, the final output of the BarSeq pipeline
load("../../../rebar/data/20201222_barseq_frc/results/fitness_gene.Rdata")
df_fitness_frc <- fitness_gene %>%
  filter(condition != "long pulse", time != 32) %>%
  mutate(ID = as.numeric(ID), substrate = "fructose", 
    condition = str_remove(condition, "short "))

load("../../../rebar/data/20210407_barseq_suc_for/results/fitness_gene.Rdata")
df_fitness_suc <- fitness_gene %>%
  separate(condition, sep = "_", into = c("substrate", "condition"))

# merge fitness tables
df_fitness <- bind_rows(df_fitness_frc, df_fitness_suc) %>%
  rename(locus_tag = locusId)
rm("df_fitness_frc", "df_fitness_suc", "df_counts_frc", "df_counts_suc")

# import genome annotation
df_ref <- read_csv("../data/ref/Ralstonia_H16_genome_annotation.csv") %>%
  filter(!duplicated(locus_tag)) %>%
  mutate(eggNOG_name = if_else(is.na(eggNOG_name), gene_name, eggNOG_name))

# define standard colors
stdcol <- custom.colorblind()$superpose.line$col
```

## Summary statistics

Overview about the number of reads per barcode, barcodes per gene and so on. Around 8-10 M reads were mapped on average, per sample. 

```{r, fig.width = 8, fig.height = 8, message = FALSE}
# Number of total mapped reads
df_counts %>% group_by(sample) %>%
  summarize(n_million_reads = sum(n_reads)/10^6) %>%
  barchart(factor(sample) ~ n_million_reads, .,
    par.settings = custom.colorblind(),
    horizontal = TRUE, border = NULL,
    scales = list(y = list(cex = 0.7)),
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.barchart(x, y, ...)
    }
  )
```

Distribution of **number of reads per barcode**. There is a sufficient number of reads for quantification, on average log2(n) = 5 = 32 reads per barcode.

```{r, fig.width = 8, fig.height = 6.2}
plot_reads_per_bc <- histogram(~ log2(n_reads) | sample,
  df_counts, as.table = TRUE, layout = c(8,6),
  par.settings = custom.colorblind(),
  between = list(x = 0.5, y = 0.5),
  xlab = expression("log"[2]*" reads per barcode"),
  scales = list(alternating = FALSE),
  panel = function(x, ...) {
    panel.grid(h = -1, v = -1, col = grey(0.9))
    panel.histogram(x, border = "white", ...)
  }
)

print(plot_reads_per_bc)
```

Similarly to the above, this is an overview about the **number of barcodes per gene** as a histogram. This distribution is the same for all conditions and replicates. The second plot is the **number of reads per gene**, averaged as median over all conditions (excluding 0 time point where counts were averaged by BarSeq pipeline). The average reads per gene are log2(n) = 8, which translates to n = 2^8 or around 256 reads per gene.


```{r, fig.width = 8, fig.height = 4.2, message = FALSE}
plot_reads_per_bc <- histogram(~ strains_per_gene,
  df_fitness %>% select(locus_tag, strains_per_gene) %>% 
    distinct %>% filter(strains_per_gene < 40), 
  par.settings = custom.colorblind(), breaks = 20, xlim=c(-2, 42),
  xlab = expression("mutants per gene"),
  panel = function(x, ...) {
    panel.grid(h = -1, v = -1, col = grey(0.9))
    panel.histogram(x, border = "white", ...)
    panel.ablineq(v = mean(x, na.rm = TRUE),
      fontfamily = "FreeSans", col = grey(0.5), lwd = 2, lty = 2)
  }
)

plot_reads_per_gene <- histogram(~ log2(reads_per_gene_median),
  df_fitness %>% filter(time != 0) %>% group_by(locus_tag) %>%
    summarize(reads_per_gene_median = median(counts)),
  par.settings = custom.colorblind(), breaks = 20,
  xlab = expression("log"[2]*" reads per gene (med)"),
  panel = function(x, ...) {
    panel.grid(h = -1, v = -1, col = grey(0.9))
    panel.histogram(x, border = "white", ...)
    panel.ablineq(v = mean(x[!is.infinite(x)]),
      fontfamily = "FreeSans", col = grey(0.5), lwd = 2, lty = 2)
  }
)

print(plot_reads_per_bc, split = c(1,1,2,1), more = TRUE)
print(plot_reads_per_gene, split = c(2,1,2,1))
```

# Gene fitness analysis

## Depletion over time (generations)

We can plot log2 FC or normalized gene fitness over generations. For this type of overview it is best to summarize individual replicates (4x) to the mean or median, per time point and condition. We also add genome annotation to the summary table. The plots shows that depletion of some strains is so strong already at 8 generations that fitness/log2 FC could not be quantified for 16 generations due to missing read counts. It is best to focus on 8 generations as because it provides a more complete picture.

```{r, messages = FALSE}
df_fitness_summary <- df_fitness %>%
  group_by(locus_tag, scaffold, time, condition, substrate, strains_per_gene) %>%
  summarize(
    norm_gene_fitness_median = median(norm_gene_fitness, na.rm = TRUE),
    log2FC_median = median(log2FC, na.rm = TRUE),
    tstat_median = median(t, na.rm = TRUE)
  ) %>%
  left_join(df_ref)
```


```{r, fig.width = 6.5, fig.height = 8}
plot_hist_log2FC <- df_fitness_summary %>%
  filter(all(!is.infinite(log2FC_median))) %>%
  group_by(condition, substrate) %>%
  slice(1:2000) %>%
  
  xyplot(log2FC_median ~ time | substrate * condition, .,
    groups = locus_tag, as.table = TRUE,
    col = stdcol[1], alpha = 0.2, layout = c(3, 2),
    xlab = "", ylab = expression("log"[2]*" FC"),
    par.settings = custom.colorblind(), type = c("l"),
    between = list(x = 0.5, y = 0.5),
    scales = list(alternating = FALSE), lwd = 2,
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.xyplot(x, y, ...)
    }
  )

plot_hist_normfg <- df_fitness_summary %>%
  filter(all(!is.infinite(log2FC_median))) %>%
  group_by(condition, substrate) %>%
  slice(1:2000) %>%
  
  xyplot(norm_gene_fitness_median ~ time | substrate * condition, .,
    groups = locus_tag, as.table = TRUE,
    col = stdcol[2], alpha = 0.2, layout = c(3, 2),
    xlab = "generations", ylab = "fitness",
    par.settings = custom.colorblind(), type = c("l"),
    between = list(x = 0.5, y = 0.5),
    scales = list(alternating = FALSE), lwd = 2,
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.xyplot(x, y, ...)
    }
  )

print(plot_hist_log2FC, position = c(0,0.47,1,1), more = TRUE)
print(plot_hist_normfg, position = c(0,0,1,0.53))
```

As sort of an internal control, we compare the gene fitness obtained by a complex procedure to the log2 FC of read counts, which is a very simple measure of 'fitness'. The two variables correlate well all tested conditions and substrates.

```{r, fig.width = 6.5, fig.height = 5.2}
df_fitness_summary %>%
  filter(time == 8, all(!is.infinite(log2FC_median))) %>%
  
  xyplot(norm_gene_fitness_median ~ log2FC_median |  substrate * condition, .,
    as.table = TRUE, col = stdcol[5], pch = 19,
    alpha = 0.4, cex = 0.6,
    layout = c(3, 2), xlim = c(-8,3), ylim = c(-6,5),
    xlab = expression("log"[2]*" FC"), ylab = "fitness",
    par.settings = custom.colorblind(),
    between = list(x = 0.5, y = 0.5),
    scales = list(alternating = FALSE),
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.xyplot(x, y, ...)
      panel.lmlineq(x, y, fontfamily = "FreeSans", r.squared = TRUE, lwd = 1.5, 
        col.text = 1, pos = 3, offset = 5, ...)
      panel.abline(a = 2, b = 1, col = grey(0.5), lty = 2, lwd = 1.5)
    }
  )
```

## Comparing gene fitness between conditions

Similarly to Figure 2 in [Wetmore et al, mBIO, 2015](https://mbio.asm.org/content/6/3/e00306-15), we can investigate condition-dependent gene fitness by comparing conditions and substrates one-on-one. The correlation between substrates and growth regimes (continuous, pulsed) is quite different. The strongest correlation does exist for the pulsed vs continuous regime for each substrate (R = 0.64 to 0.87). And for the pulsed conditions, comparison between substrates (R = 0.62 to 0.72).

```{r, fig.width = 8, fig.height = 8, message = FALSE}
df_fitness_comp <- df_fitness_summary %>% filter(time == 8) %>%
  group_by(locus_tag) %>% mutate(tstat_median = min(tstat_median)) %>%
  select(locus_tag, condition, substrate, norm_gene_fitness_median, gene_name, COG_Process, tstat_median) %>%
  unite(condition, condition, substrate) %>%
  pivot_wider(names_from = condition, values_from = norm_gene_fitness_median) %>%
  filter(!is.na(locus_tag))

custom_splom(df_fitness_comp %>% ungroup %>% 
  select(matches("conti|pulse")), col = grey(0.4, 0.4))
```

We can also compare selected conditions directly to identify genes enriched or depleted in several conditions.

```{r, fig.width = 8, fig.height = 8, message = FALSE}
# generalized plotting function
plot_fitness_comp <- function(data, vars){
  xyplot(get(vars[2]) ~ get(vars[1]), data,
    groups = abs(get(vars[1])-get(vars[2])) > 2, #tstat_median < -3
    par.settings = custom.colorblind(), col = stdcol[c(5,1)],
    pch = 19, alpha = 0.5, cex = 0.7, aspect = 1,
    xlim = c(-7, 4), ylim = c(-7, 4),
    xlab = vars[1], ylab = vars[2],
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.abline(v = 0, h = 0, col = grey(0.5), lty = 2, lwd = 1.5)
      panel.abline(a = 0, b = 1, col = grey(0.5), lty = 2, lwd = 1.5)
      panel.abline(a = 2, b = 1, col = grey(0.5), lty = 2, lwd = 1.5)
      panel.abline(a = -2, b = 1, col = grey(0.5), lty = 2, lwd = 1.5)
      panel.xyplot(x, y, ...)
      panel.key(...)
    }
  )
}

plot_fitness_comp(df_fitness_comp, c("continuous_formate", "pulse_formate")) %>%
  print(position = c(0,0.5,0.5,1), more = TRUE)
plot_fitness_comp(df_fitness_comp, c("continuous_fructose", "pulse_fructose")) %>%
  print(position = c(0.5,0.5,1,1), more = TRUE)
plot_fitness_comp(df_fitness_comp, c("continuous_succinate", "pulse_succinate")) %>%
  print(position = c(0,0,0.5,0.5), more = TRUE)
plot_fitness_comp(df_fitness_comp, c("continuous_formate", "continuous_fructose")) %>%
  print(position = c(0.5,0,1,0.5), more = TRUE)
```

# Differential fitness of selected gene sets

Most genes correlate in fitness value between conditions. That means, only some genes have a **condition-specific fitness effect**, i.e. increase or decrease fitness of the respective strain in one substrate or growth regime specifically. This section investigates specific genes and their functions that show such differential fitness between conditions. A simple comparison of one condition versus another will not be helpful as there two many possible combinations. To identify interesting sets of genes, cluster analysis can be performed.

```{r, fig.width = 10, fig.height = 1.5}
# generate colorpalette for heatmap
heat_cols <- diverging_hcl(n = 7, h = c(255, 12), c = c(50, 80), l = c(20, 97), power = c(1, 1.3))
#heat_cols <- diverging_hcl(n = 7, h = c(340, 128), c = c(60, 80), l = c(30, 97), power = c(0.8, 1.5))
#heat_cols <- diverging_hcl(n = 7, h = c(300, 128), c = c(30, 65), l = c(20, 95), power = c(1, 1.4))

# create a matrix from wide fitness data for plotting heatmap
mat_heatmap <- df_fitness_comp %>% ungroup %>%
  filter(if_any(.cols = matches("conti|pulse"), ~ !between(., -2, 2))) %>%
  select(matches("locus_tag|conti|pulse")) %>%
  # filter out NA rows, and replace extreme values
  drop_na %>% mutate(across(matches("conti|pulse"), 
    function(x) {y = replace(x, x > 6, 6); replace(y, y < -6, -6)})) %>%
  column_to_rownames(var = "locus_tag") %>%
  as.matrix

# create cluster for reordering
mat_cluster <- mat_heatmap %>% dist %>% hclust(method = "ward.D")
mat_heatmap <- mat_heatmap[order.dendrogram(as.dendrogram(mat_cluster)), c(1,4,2,5,3,6)]

# plot heatmap
plot_heatmap <- levelplot(mat_heatmap,
  par.settings = custom.colorblind(),
  col.regions = colorRampPalette(heat_cols)(16),
  at = seq(-6, 6, 1), aspect = "fill",
  xlab = "", ylab = "", scales = list(x = list(draw = FALSE)),
  panel = function(x, y, z, ...) {
    panel.levelplot(x, y, z, ...)
    panel.abline(h = 1:5+0.5, col = "white", lwd = 1.5)
  }
)

print(plot_heatmap)
```

```{r, include = FALSE}
# silently export dendrogram
png("../figures/figure_barseq_heat.png", width = 1690, height = 250, res = 150)
print(plot_heatmap)
dev.off()
```

Clustering and silhouette analysis revealed that we have 4 to 7 clusters with acceptable separation, see the plot below. Of these, several clusters stick out:

1. mutants specifically depleted in formate conditions = essential for formate
2. mutants depleted in all conditions = essential in minimal medium
3. mutants enriched in fructose growth, neutral on other substrates
4. mutants enriched in continuous regime, mostly neutral in pulsed regime
5. mutants _slightly_ depleted in continuous succinate - low priority
6. mutants _slightly_ depleted in continuous fructose - low priority
7. mutants _slightly_ depleted in various conditions - low priority

```{r, fig.width = 10, fig.height = 4}
silhouetteResult <- Rtools::silhouette_analysis(mat_heatmap, mat_cluster, 2:20)
silhouetteResult$plot.summary

# plot colored dendrogram
plot(color_branches(
  mat_cluster, k = 7,
  groupLabels = TRUE,
  col = stdcol[1:7]
))

# collect names of enriched/depleted genes
list_enriched <- vegan::cutreeord(mat_cluster, k = 7)
list_enriched %>% table
```

```{r, include = FALSE}
# silently export dendrogram
png("../figures/figure_barseq_dendro.png", width = 1600, height = 700, res = 150)
plot(color_branches(
  mat_cluster, k = 7,
  groupLabels = TRUE,
  col = stdcol[1:7]
))
dev.off()
```


## Enrichment of mutants over time (increased fitness)

One of the most puzzling results is the appearance of (very) fast growing mutants in some conditions. These mutants quickly enrich over only 16 generations to a volume of up to 20% of all library mutants, they "take over the culture". This phenomenon was not observed previously in experiments with the *Synechocystis* sp. PCC6803 CRISPRi repression library (Yao et al., Nature Communications, 2020). The CRISPRi library is based on the same principle of depletion and enrichment of mutants depending on the associated fitness.

```{r, echo = FALSE}
# color vector for conditions
subs_col <- c(stdcol[2], lighten(stdcol[2], 0.3), stdcol[3],
  lighten(stdcol[3], 0.3), stdcol[4], lighten(stdcol[4], 0.3))

# generalized function for barplots
barchart_fitness <- function(data) {
  # sort by cumulative fitness
  data %>% group_by(locus_tag) %>% 
  mutate(sum_fitness = sum(norm_gene_fitness_median)) %>%
  arrange(sum_fitness) %>%
  mutate(name = paste0(locus_tag, " | ", eggNOG_name, " | ", 
    substr(eggNOG_description, 1, 40))) %>%
  
  barchart(factor(name, unique(name)) ~ norm_gene_fitness_median, .,
    group = paste(substrate, condition),
    par.settings = custom.colorblind(),
    col = subs_col, border = "white", stack = TRUE,
    xlab = "cumulative fitness",
    scales = list(y = list(cex = 0.7)),
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.barchart(x, y, ...)
      panel.key(..., points = FALSE, cex = 0.6, corner = c(0.97, 0.03))
    }
  )
}

linechart_fitness <- function(data) {
  xyplot(norm_gene_fitness_median ~ time | locus_tag, data,
    groups = paste(substrate, condition), as.table = TRUE,
    par.settings = custom.colorblind(), type = c("l"),
    col = subs_col, lwd = 2.5,
    layout = c(5, ceiling(length(unique(data$locus_tag))/5)),
    xlab = "generations", ylab = "fitness",
    between = list(x = 0.5, y = 0.5),
    scales = list(alternating = FALSE),
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.xyplot(x, y, ...)
    }
  )
}
```

#### Strains enriched on fructose

What is the identity of the enriched genes? Which ones are enriched in several conditions? First we identify the mutants highly and specifically enriched on fructose, predominantly in the continuous growth regime.


```{r, fig.width = 10, fig.height = 4.5}
df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 3))) %>%
  filter(time == 8) %>%
  barchart_fitness() %>%
  print(position = c(0,0,0.6,1), more = TRUE)

df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 3))) %>%
  linechart_fitness() %>%
  print(position = c(0.58,0,1,1))
```

**Mechanisms enabling faster growth on fructose**

The mechanism for increased growth on fructose seems clearly different from the mechanism for faster growth in all other (mainly continuous)  conditions. For the fructose set, several genes are functionally linked suggesting a similar mechanism of action, 

1. Genes `H16_B0517` (alcohol dehydrogenase) and `H16_B1917` (aldehyde dehydrogenase): According to STRING DB, the genes are co-located i.e. either in direct proximity or in the same operon in alpha-, beta- and gamma-proteobacteria, and even cyanobcateria. BioCyc has 17 possible iso-enzymes in *Cupriavidus* for the 2-enzyme pathway "ethanol degradation 1": ethanol --> acetaldehyde --> acetyl-CoA (reverse EtOH fermentation, obtain 1 NADH per reaction) catalyzed by both genes. What could be the mechanism for improved growth/yield on fructose? Chemostats are more yield- than growth selective, this pathway is probably functional and secretes EtOH as a fermentation product; KO of this pathway probably increases yield.

2. Two genes are involved in nitrate respiration/reduction, `PHG269` or narK, and `H16_B2087`, a NarL-family response regulator. NarK is a nitrate transporter, and NarL senses nitrate and then activates transcription of its targets by binding the DNA. According to [this paper on NarL by Ruanto et al., 2020](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7419079/), "the Regulon DB database [11] for transcription regulation in E. coli lists 26 gene regulatory regions where NarL has a direct effect on transcript initiation, and it functions as an activator at 11 of these". It is not clear what effect the KO of nitrate transport could have on growth or yield, nitrate is not part of the medium.

3. Genes involved in translation/protein folding/sulphur metabolism:
    - `H16_B2521`, Alpha-ketoglutarate-dependent taurine dioxygenase; taurine is a sulphur carrier
    - `H16_A2861`, Glutathione S-transferase; Posttranslational modification, protein turnover, chaperones
    - `H16_A1138`, Thioredoxin; Posttranslational modification, protein turnover, chaperones
    - `H16_A1336`, yogA/tdcF, putative translation initiation inhibitor,yjg Ffamily; Translation, ribosomal structure and biogenesis
    - `H16_A2553`, cO, DNA repair protein RecO; Involved in DNA repair and RecF pathway recombination

4. Several other gene products are presumably regulatory proteins:
    - `16_A0310`, transcriptional regulator, GntR-family; Transcription
    - `H16_B2215`, HTH lysR-type domain-containing protein; helix turn helix DNA binding motif

#### Strains enriched on formate

Here, the obvious question with biotechnological importance is to find mutants with higher (specific) tolerance to formate, regardless if it's pulsed or continuously supplied. We explicitly exclude mutants beneficial for other growth conditions too, which are dealt with in the next section. The clustering results of the previous section could not identify formate-specific mutants enriched after 8 generations. We need to lower the filter threshold to detect genes that have a more subtle enrichment (fitness >= 2 for formate, and <= 1 for all other conditions) after 16 generations.

```{r, fig.width = 10, fig.height = 4.5}
list_fitness_formate <- df_fitness_summary %>%
  group_by(locus_tag, substrate, time) %>%
  mutate(formate_hit = case_when(
    substrate == "formate" & max(norm_gene_fitness_median) >= 2 ~ TRUE,
    substrate != "formate" & max(norm_gene_fitness_median) <= 1 ~ TRUE,
    TRUE ~ FALSE)
  ) %>% group_by(locus_tag, time) %>%
  filter(all(formate_hit), n() == 6) %>% 
  pull(locus_tag) %>% unique

df_fitness_summary %>%
  filter(time == 8, locus_tag %in% list_fitness_formate) %>%
  barchart_fitness() %>%
  print(position = c(0,0,0.6,1), more = TRUE)

df_fitness_summary %>%
  filter(locus_tag %in% list_fitness_formate) %>%
  linechart_fitness() %>%
  print(position = c(0.58,0,1,1))
```

**Mechanisms enabling faster growth on formate**

According to STRING DB there are no obvious relationships between these genes except for the pair `PHG388` (pcaI, Oxoadipate CoA transferase alpha subunit) and `H16_B1583` (pcaD, 3-Oxoadipate enol-lactone hydrolase). Both genes are part of the catechol degradation pathway that is itself a larger part of benzene degradation. The gene products catalyze two sequential reactions 3-oxoadipate-enol-lactone --> 3-oxoadipate (pcaD) --> 3-oxoadipyl-CoA (pcaI). The next reaction catalyzed by four isoenzymes makes succinyl-CoA which ends up in the Citrate cycle. How the KO of this pathway would improve tolerance to formate is unclear.

#### Strains enriched on succinate

This is similar to the analysis above for formate, with the exception that many mutants associated with higher fitness in succinate conditions enrich only after 16 generations, and only in the continuous regime. This seems be an artifact of read compression. We therefore use the same thresholds as for formate (fitness >= 2 for succinate, and <= 1 for all other conditions) but after 8 instead of 16 generations.

```{r, fig.width = 10, fig.height = 4.5}
list_fitness_succinate <- df_fitness_summary %>%
  filter(time == 8) %>%
  group_by(locus_tag, substrate) %>%
  mutate(succinate_hit = case_when(
    substrate == "succinate" & max(norm_gene_fitness_median) >= 2 ~ TRUE,
    substrate != "succinate" & max(norm_gene_fitness_median) <= 1 ~ TRUE,
    TRUE ~ FALSE)
  ) %>% group_by(locus_tag) %>%
  filter(all(succinate_hit), n() == 6, locus_tag != "H16_A2912") %>% 
  pull(locus_tag) %>% unique

df_fitness_summary %>%
  filter(time == 8, locus_tag %in% list_fitness_succinate) %>%
  barchart_fitness() %>%
  print(position = c(0,0,0.6,1), more = TRUE)

df_fitness_summary %>%
  filter(locus_tag %in% list_fitness_succinate) %>%
  linechart_fitness() %>%
  print(position = c(0.58,0,1,1))
```

**Mechanisms enabling faster growth on succinate**

A group of five highly related genes shows significant enrichment in (continuous) succinate: `H16_2096` (dppA1), `H16_A2406` (dppD2), `H16_A2407` (dppC2), `H16_A2408` (dppB2), and `H16_A2409`, dppA2. Alternative names are yejABCDE. All of these proteins are part of an ABC transporter of the PepT family. Quote from PFAM about this type of protein/domain: "All characterised members appear to be involved in the transport of oligopeptides or dipeptides. Some are important for sporulation or antibiotic resistance. Some dipeptide transporters also act on the heme precursor delta-aminolevulinic acid.
The enrichment seems to be highly reproducible: each of these genes has 7 to 15 different insertion mutants, all showing the same average pattern. At least two more genes are also involved in (amino acid?) transport, `H16_A3284` and `H16_A2521`. The latter one is a "D-amino acid transferase (D-AAT), required by bacteria to catalyse the synthesis of D-glutamic acid and D-alanine" (PFAM).
Supposed mechanism of enrichment: Growth on succinate could lead to a shortage/imbalance of certain amino acids, and investment in uptake might give a growth advantage. The growth advantage was only visible in the continuous condition, and is probably not universal for succinate. No gene/mutant was enriched in both succinate growth regimes.

#### Strains enriched in several conditions

A wealth of genes/mutants are enriched in multiple mainly pulsed growth conditions (cluster 4).

```{r, fig.width = 10, fig.height = 6.5}
df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 4))) %>%
  filter(time == 8) %>%
  barchart_fitness() %>%
  print(position = c(0,0,0.6,1), more = TRUE)

df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 4))) %>%
  linechart_fitness() %>%
  print(position = c(0.58,0,1,1))
```

To learn more about functional relationships between enriched genes/mutants, we can submit the gene list to the STRING interaction database and retrieve a network of probable interactions.

```{r, message = FALSE}
library(ggraph)
library(tidygraph)

# function retrieve network interaction data from STRING DB
# separate gene IDs by "%0d"; species/taxon ID for Cupriavidus necator H16: 381666
# (see https://string-db.org/cgi/organisms)
retrieve_STRING <- function(gene_ID, taxon_ID, min_score = 0000, ref = NULL) {
  gene_list <- paste(gene_ID, collapse = "%0d")
  string_graph <- paste0(
    "https://string-db.org/api/tsv/network?identifiers=", 
    gene_list, "&species=", taxon_ID, "&required_score=", min_score) %>%
  read_tsv(col_types = cols()) %>%
  mutate(across(matches("stringId"), function(x) gsub(paste0(taxon_ID, "."), "", x))) %>%
  as_tbl_graph()
  if (!is.null(ref)) {
    left_join(string_graph, ref, by = "name")
  } else {
    string_graph
  }
}

# function to space labels in certain distance to circle
nudge_circle <- function(n, size_x = 0.15, size_y = 0.1) {
  nudge_x = size_x * -cos(seq(0.5*pi, 2.5*pi, length.out = n))
  nudge_y = size_y * sin(seq(0.5*pi, 2.5*pi, length.out = n))
  list(x = nudge_x, y = nudge_y)
}
```


```{r, fig.width = 8.5, fig.height = 4.8}
graph_all_enrich <- retrieve_STRING(
  gene_ID = names(subset(list_enriched, list_enriched == 4)),
  taxon_ID = 381666,
  ref = rename(df_ref, name = locus_tag)
)

graph_all_enrich %>% arrange(COG_Process) %>% activate(edges) %>% 
  filter(score >= 0.4) %>%
  ggraph(layout = 'linear', circular = TRUE) +
  geom_edge_arc(colour = grey(0.6, 0.5), aes(width = score)) + 
  geom_node_point(aes(colour = COG_Process), size = 5) +
  geom_node_text(nudge_x = nudge_circle(22)$x, nudge_y = nudge_circle(22)$y, 
    size = 3.2, aes(label = eggNOG_name, colour = COG_Process)) +
  scale_edge_width(range = c(0.2, 2)) +
  theme_graph(background = "white", foreground = grey(0.5),
    plot_margin = margin(10, 10, 10, 10))
```

**Mechanisms enabling faster growth in several conditions**

The picture for these strains is more clear then for the fructose-enriched strains. 13 out of 27 enriched genes (48%), among them some of the most enriched, are directly involved in hydrogenase activity, i.e. hydrogen oxygenation (hoxA, hoxF, hoxU, hoxX, hypA, hypB, hypC, hypD hypE, hypF, hyaA, hyaB, hupH). The mechanism of enabling faster growth by KO of hydrogenases is unknown.

The second theme is enrichment of ptsI (`H16_A0326`) and ptsH (`H16_A0325`), together the entire functional unit of the PTS system, the PEP-dependent sugar phosphotransferase system (sugar PTS). "This major carbohydrate active-transport system catalyzes the phosphorylation of incoming sugar substrates concomitantly with their translocation across the cell membrane. Enzyme I transfers the phosphoryl group from (PEP) to the phosphoryl carrier protein (HPr)."
Why does the KO of PTS system provides a growth benefit in *in all conditions* with almost equal contribution? Must be related to energy metabolism, because neither fructose (ABC transporter), succinate and def not formate are transported via a PTS.

Other interesting mechanisms: KO of RNA polymerase sigma factor RpoS (`H16_A2373`) seems to have a beneficial effect on steady state growth, as well as other stress related transcription factors like cold shock protein capB (`H16_B2205`), and scoF (`H16_B0002`).

The gene `H16_B2043` is by far most abundant (by reads) and most enriched (by fitness) single mutant in almost all conditions. What is the mechanism? There is almost nothing known about this gene except that it probably has cyclic-guanylate-specific phosphodiesterase activity. This enzyme has, according to PFAM, two domains, the diguanylate cyclase (GGDEF) and the EAL domain. The first one synthesizes cyclic di-GMP [Ryjenkov et al., J Bact, 2005](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1064016/). The second domain can act as phosphodiesterase and cleave cyclic cAMP or cGMP. Both compounds play a role in signaling, cAMP for example as the hormone in catabolite repression. According to the discussion in the linked paper, "Mutations in the GGDEF domain proteins or overexpression of such proteins affect exopolysaccharide synthesis [...] and formation of biofilms. In C. crescentus, flagellum ejection, which is required for the switch from motile to sessile lifestyle, is impaired in a mutated GGDEF domain". The underlying mechanism could be disrupted biofilm formation, higher flagellum activity, and thus non-settling phenotype in the bioreactor. Alternatively the mutation affects cAMP formation and disrupts or enhances catabolite repression (more likely that disrupted catabolite repression is beneficial).


#### Sanity checks

All above results are based on fitness score, which is not much more than a normalized log2 fold change of read count over time.
We can compare if the mutants with extremely high fitness scores are also the ones that are super-abundant at the final time point, ie.e. have an average read count of >= 500,000 after 16 generations. In fact, only 3 genes have such an extreme read count, but 9 have more than 100,000 reads.

```{r}
df_fitness %>% filter(time == 16, locus_tag %in% (subset(list_enriched, list_enriched %in% c(3,4)) %>% names)) %>%
  group_by(locus_tag) %>% summarize(max_counts = max(counts), max_fitness = max(norm_gene_fitness)) %>%
  arrange(desc(max_counts))
```

On the other hand, we can check which genes have an extremely high read count (>= 500,000 in *any* condition) and see if this correlates with high fitness score. It does partly. 3 out of 7 highly abundant gene mutants enrich extremely over time. The other enrich too, but only 2^1 = 2 to 2^3.5 = 11 times. These non-enriching super-abundant mutants are:

  - `H16_A0774` - cphA, Cyanophycin synthase. Supposed mechanism: enriches in LB medium as it does not turn Asp
    into cyanophycin. No benefit on minimal medium.
  - `H16_B2570` - fecA, Outer membrane receptor for Fe(III). Supposed mechanism: regulator activity together with fecR, but also 
    transmembrane transporter for siderophores. KO reduces sensitivty to excess iron??
  - `H16_A3145` - Conserved protein/domain typically associated with flavoprotein oxygenases, DIM6/NTAB family. 
    Supposed mechanism: unknown.

```{r}
df_fitness %>% filter(time == 16, counts > 5*10^5) %>%
  group_by(locus_tag) %>% summarize(max_counts = max(counts), max_fitness = max(norm_gene_fitness)) %>%
  arrange(desc(max_counts))
```


## Depletion of mutants over time (reduced fitness)

#### Depletion on formate

Several genes/mutants are depleted over time depending on growth condition. Most interesting in this context are cluster 1 and 2, depletion only on formate, or in all conditions, respectively. Not surprisingly, formate-specific genes are highly depleted during growth on formate, but not cbb (Calvin cycle) genes.


```{r, fig.width = 10, fig.height = 6.5}
df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 1))) %>%
  filter(time == 16) %>%
  barchart_fitness() %>%
  print(position = c(0,0,0.6,1), more = TRUE)

df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 1))) %>%
  linechart_fitness() %>%
  print(position = c(0.58,0,1,1))
```

What is the role of formate-essential genes? Are there patterns that emerge? We can submit the depleted gene list to STRING DB and retrieve a possible network of interactions. The network shows immediately that there are many high-scoring interactions between the genes, particularly the soluble FDH genes are sticking out as one cluster (fdsABDG) and their transcriptional regulator fdsR. Then the molybdenum cofactor processing proteins moaCDE, moeA, mobA, mog. And a range of cytochrome genes responsible for acceptance and transport of of electrons from formate to the ETC in cytoplasmic membrane (petABC, ccoGNOP, cyc). It is also intersting what is NOT essential on formate, such as no cbb genes except the master transcriptional regulator cbbR. The reason for this must be the redundancy of all cbb genes (two operons).


```{r, fig.width = 8.5, fig.height = 4.8}
graph_formate_depl <- retrieve_STRING(
  gene_ID = names(subset(list_enriched, list_enriched == 1)),
  taxon_ID = 381666,
  ref = rename(df_ref, name = locus_tag)
)

graph_formate_depl %>% arrange(COG_Process) %>% activate(edges) %>% 
  filter(score >= 0.4) %>%
  ggraph(layout = 'linear', circular = TRUE) +
  geom_edge_arc(colour = grey(0.6, 0.5), aes(width = score)) + 
  geom_node_point(aes(colour = COG_Process), size = 5) +
  geom_node_text(nudge_x = nudge_circle(34)$x, nudge_y = nudge_circle(34)$y,
    size = 3.2, aes(label = eggNOG_name, colour = COG_Process)) +
  scale_edge_width(range = c(0.2, 2)) +
  theme_graph(background = "white", foreground = grey(0.5),
    plot_margin = margin(10, 10, 10, 10))
```

Let's create a pathway/operon based overview about the formate-essential genes. We group the genes into three categories, and also include the genes/proteins that belong to each group but were not depleted or enriched. The three groups are:

- **molybdenum cofactor biosynthesis**
  - modAB = molybdenum transporter
  - moaACDE = molybdopterin backbone biosynthesis
  - moeA = molybdopterin molybdenum transferase
  - mobAB = molybdenum cofactor guanylyltransferase, 
  - mog = molybdopterin biosynthesis protein (role?)
- **formate dehydrogenase**
  - soluble (fds, fwd)
  - membrane-bound (fdh, fdo)
- **electron transport chain**
  - petABC (aka qcrABC, `H16_A3398`, `H16_A3397`, `H16_A3396`) = cycochrome bc1 (reductase)
  - cyc (`H16_A3451`), cytochrome c553. Other cytochrome c553 genes to include according to uniprot:
    `H16_A0830`, `H16_A0846`, `H16_B1452`, `H16_A3576`, `H16_A1121`, `H16_A1120`
  - ccoGNOP (`H16_A2316`, `H16_A2317`, `H16_A2318`, `H16_A2319`), cytochrome C oxidase (cbb3 type)
  - VP2641 (`H16_A1031`) = Membrane spanning protein, RDD family. PFAM: unknown function, potential transporter? Interacts with cytochrome c553 and qcrC, cytochrome reductase. It's depletion implicates this protein in formate-specific e- transport in the membrane.

We can prove that these complexes are mostly or only required during growth on formate. Can we exclude other cytochrome reductase and cytochrome oxidase complexes, or other cytochrome c553 proteins that act as e- acceptors from FDH? This type of information can be used to carve out the specific path of e- from formate through the ETC. If other cytochrome oxidase/reductase proteins are essential in all conditions, it can not be excluded that they are not required for growth on formate. We therefore also include alternative cytochrome oxidases, like ctaABCDEG and coxMNOP, cyo and cyd operons.

To summarize, according to the essentiality data on formate, electrons from formate are transported through the membrane the following way:

```
S-FDH --> NADH --|
                 |
M-FDH --> Cytochrome C reductase (bc1 complex III) --> Cytochrome c553 () --> Cytochrome C oxidase (cbb3 complex IV) --> O2
```

What is known in literature about electron transport from formate to membrane?

- main FDH is S-FDH both in terms of protein abundance, inducibility, and essentiality
- cytochrome reductase bc1 complex (qcrABC) is **particularly important** during growth on formate (this study)
- it is also **beneficial** during heterotrophic growth (slightly reduced fitness)
- inhibition of this complex reduces growth in heterotrophy and lithotrophy (see Cramm 2006)
- qcrC seems to be a cytochrome C that is mobile (?) and transports e- in membrane
- bc1 complex functions in oxygen-limited respiration, therefore maybe no growth on formate and nitrate respiration?
- bc1 complex deletion mutant was completely deficient in oxygen-limited growth on nitrate (Garg et al., 2018).
- bc1 complex accepts reduced quinols as e- donor, pumps 4 H+ /2 e− via proton-motive Q cycle (quinol:cyt c oxidoreduction), and releases reduced cytochrome C e- carriers
- Cytochrome c553 proteins are e- shuttles in membrane, between bc1 and respiration complexes (?)
- cbb3-type cytochrome oxidase special member of the heme-copper oxidase family
- "in Bradyrhizobium japonicum, this enzyme operates at extremely low partial pressures of oxygen" [Preisig et al., 1996]


```{r}
# generalized function to plot (small) heatmaps
heatmap_fitness <- function(data, key = TRUE) {
  
  # make some small adjustments to optical appeaarance
  mutate(data, gene_name = factor(gene_name, unique(gene_name))) %>%
  mutate(condition = if_else(condition == "pulse", "P", "C")) %>%
  mutate(cond = paste0(substrate, " - ", condition)) %>%
  group_by(gene_name) %>% arrange(cond) %>%
  mutate(norm_gene_fitness_median = replace(norm_gene_fitness_median, norm_gene_fitness_median < -6, -6)) %>%

  # plot heatmap
  levelplot(norm_gene_fitness_median ~ gene_name * factor(cond), .,
    par.settings = custom.colorblind(), colorkey = key,
    col.regions = colorRampPalette(heat_cols)(16),
    at = seq(-6, 6, 1), aspect = "fill",
    xlab = "", ylab = "", scales = list(cex = 0.7, x = list(rot = 90)),
    panel = function(x, y, z, ...) {
      panel.levelplot(x, y, z, ...)
      panel.abline(v = seq_along(unique(x))+0.5, 
        h = seq_along(unique(y))+0.5, col = "white", lwd = 2)
    }
  )
}
```


```{r, fig.width = 10, fig.height = 5.0}
plot_formate_fit_1 <- df_fitness_summary %>% filter(time == 8, grepl("mo[aedbg]", gene_name)) %>%
  mutate(gene_name = stri_extract_first_regex(gene_name, "mo[aedbg]([A-Z0-9]+)?")) %>%
  ungroup %>% arrange(gene_name) %>%
  heatmap_fitness(key = FALSE)

plot_formate_fit_2 <- df_fitness_summary %>% filter(time == 8, grepl("fd[swho]", gene_name)) %>%
  mutate(gene_name = stri_extract_first_regex(gene_name, "fd[swho][A-Z0-9]+")) %>%
  mutate(gene_name = stri_replace_first_regex(gene_name, "fdhD$", "fdsC")) %>%
  ungroup %>% arrange(gene_name) %>%
  heatmap_fitness(key = FALSE)

plot_formate_fit_3 <- df_fitness_summary %>% filter(time == 8, locus_tag %in% c("H16_A3451", "H16_A0830",
    "H16_A0846", "16_B1452", "H16_A3576", "H16_A1121", "H16_A1120", "H16_A1031") | 
    grepl("qcr|cco|cta|cox[MNOPQ] |cyo|cyd", gene_name)) %>%
  mutate(gene_name = stri_extract_first_regex(gene_name, "H16_[AB][0-9]+|(qcr|cco|cta|cox|cyo|cyd)([A-Z0-9]+)?")) %>%
  ungroup %>% arrange(
    factor(str_sub(gene_name, 1, 3), c("cyo", "cyd", "qcr", "H16", "cox", "cta", "cco")), 
    str_sub(gene_name, 4, 5)) %>%
  heatmap_fitness()

print(plot_formate_fit_1, position = c(0, 0.475, 0.5, 1), more = TRUE)
print(plot_formate_fit_2, position = c(0.475, 0.49, 0.95, 1), more = TRUE)
print(plot_formate_fit_3, position = c(0, 0, 1, 0.57))
```

```{r, include = FALSE}
svg("../figures/figure_barseq_formate.svg", width = 10, height = 5.0)
print(plot_formate_fit_1, position = c(0, 0.48, 0.49, 1), more = TRUE)
print(plot_formate_fit_2, position = c(0.485, 0.49, 0.95, 1), more = TRUE)
print(plot_formate_fit_3, position = c(0, 0, 1, 0.57))
dev.off()
```


#### Depletion in all conditions

Which genes are depleted in all conditions (cluster 2)? Here we see the effect of the read compression, such that many genes seem to "recover" from initial depletion and enrich from 8 to 16 generation time point. This effect is only apparent for the continuous conditions and with very high probability related to the distortion of read counts by super-enriching mutants. The 8 generation time point should be considered as more reliable in those cases.

```{r, fig.width = 10, fig.height = 6.5}
df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 2))) %>%
  filter(time == 16) %>%
  barchart_fitness() %>%
  print(position = c(0,0,0.6,1), more = TRUE)

df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 2))) %>%
  linechart_fitness() %>%
  print(position = c(0.58,0,1,1))
```

```{r, fig.width = 8, fig.height = 5.2}
graph_all_depl <- retrieve_STRING(
  gene_ID = names(subset(list_enriched, list_enriched == 2)),
  taxon_ID = 381666,
  ref = rename(df_ref, name = locus_tag)
)

graph_all_depl %>% arrange(COG_Process) %>% activate(edges) %>% 
  filter(score >= 0.4) %>%
  ggraph(layout = 'linear', circular = TRUE) +
  geom_edge_arc(colour = grey(0.6, 0.5), aes(width = score)) + 
  geom_node_point(aes(colour = COG_Process), size = 5) +
  geom_node_text(nudge_x = nudge_circle(38)$x, nudge_y = nudge_circle(38)$y,
    size = 3.2, aes(label = eggNOG_name, colour = COG_Process)) +
  scale_edge_width(range = c(0.2, 2)) +
  theme_graph(background = "white", foreground = grey(0.5),
    plot_margin = margin(10, 10, 10, 10))
```

## Fitness of *cbb* genes

One of the most interesting features of *C. necator* is the *cbb* operon encoding enzymes for PPP and Calvin cycle (e.g. Rubisco). Two almost identical copies of the operon are present in the genome, one on the pHG1 megaplasmid and one one chromosome 2. The latter one has two additional genes, a putative *cbbB* formate dehydrogenase, and *cbbR*, the transcriptional regulator.
To visualize general trends for all genes of a pathway, we can use a beeswarm plot broken down by condition.

```{r, fig.width = 3.5, fig.height = 3}
plot_cbb_fitness_bee <- df_fitness_summary %>% ungroup %>%
  filter(time == 8, str_detect(gene_name, "cbb.|cfx.|cbx."), condition != "pulse") %>%
  mutate(gene_name = str_replace(gene_name, "cfx|cbx", "cbb")) %>%
  
  xyplot(norm_gene_fitness_median ~ factor(substrate), .,
    groups = factor(substrate), par.settings = custom.colorblind(col = subs_col[c(1,3,5)]),
    labels = stri_extract_first(.[["gene_name"]], regex = "cbb.|cfx.|cbx."),
    xlab = "", ylab = "fitness (8 generations)", ylim = c(-7.5, 2.5),
    scales = list(alternating = FALSE),
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.abline(h = 0, lwd = 1.5, col = grey(0.7), lty = 2)
      panel.abline(h = -3, lwd = 1.5, col = grey(0.7), lty = 3)
      panel.text(3.1, -3.5, labels = "significant", col = grey(0.6), cex = 0.65)
      coords <- panel.beeswarm(x, y, bin_y = TRUE, breaks_y = seq(-10, 4, 0.4), 
        return_coords = TRUE, ...)
      panel.directlabel(x = coords$x, y = coords$y, y_boundary = c(-10, -4), 
        cex = 0.65, draw_box = TRUE, box_line = TRUE, ...)
      panel.key(labels = c("formate", "fructose", "succinate"), cex = 0.8, 
        corner = c(0.98, 0.05))
    }
  )

print(plot_cbb_fitness_bee)
```
Only one gene is really depleted and therefore essential on the two formate conditions, *cbbR*. At least two more genes for Rubisco and PRUK should be essential on formate as they have no counterpart on chromosome 1. They show no significant depletion. This proves that the two *cbb* operons can fully complement each other, except for *cbbR*, the one gene present in only copy (chromosome 2).

Note: Some *cbb* mutants for chromosome 2 seem to have a more positive fitness score compared to their counter parts on the pHG1 megaplasmid. Some genes with higher fitness show sudden leaps from 8 to 16 hours. A look up in the summary table revealed that they have only 1 or 2 mutants quantified per gene, which makes the results less reliable, particularly for the continuous conditions on fructose with low read count at 16 generationa. Results for these mutants have to be interpreted with care. The folowing plot focuses on the pHG1 *cbb* copies which have on average higher coverage with individual mutants.


```{r, fig.width = 5.5, fig.height = 3.5}
# single time series per gene
plot_cbb_fitness_select <- df_fitness %>% ungroup %>%
  left_join(select(df_ref, locus_tag, gene_name), by = "locus_tag") %>%
  filter(str_detect(gene_name, "(cbb|cfx|cbx)[PSLKGAT]P|cbbR"), condition != "pulse") %>%
  mutate(
    gene_name = case_when(
      gene_name == "cfxR cbbR H16_B1396" ~ "cbbR (REG)",
      gene_name == "cfxP cbbPP PHG421" ~ "cbbP (PRUK)",
      gene_name == "cbxSP cbbS cbbSP cfxSP rbcS PHG426" ~ "cbbS (RBPC)",
      gene_name == "cbbL2 cbbL cbxLP cfxLP PHG427" ~ "cbbL (RBPC)",
      gene_name == "cbbKP PHG417" ~ "cbbK (PGK)",
      gene_name == "cbbGP PHG418" ~ "cbbG (GAPDH)",
      gene_name == "cbbAP PHG416" ~ "cbbA (FBA)",
      gene_name == "cbbTP PHG420" ~ "cbbT (TKT)",
      #gene_name == "cbbZP PHG419" ~ "cbbZ (PGP)",
      #gene_name == "cbxXP cfxXP PHG425" ~ "cbbX (RBCA)",
      #gene_name == "cbbEP cfxE PHG423" ~ "cbbE (RPE)"
    ) %>% factor(., unique(.)[c(1,6,7,8,3,4,2,5)])
  ) %>% ungroup() %>%
  
  xyplot(norm_gene_fitness ~ factor(time) | gene_name, .,
    groups = substrate, par.settings = custom.lattice(col = subs_col[c(1,3,5)]),
    type = c("p", "l"), xlab = "generations", ylab = "fitness",
    ylim = c(-7.9, 2.5), layout = c(4, 2), between = list(x = 0.5, y = 0.5),
    scales = list(alternating = FALSE), lwd = 2.5, as.table = TRUE,
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.errbars(x, y, ewidth = 0, ...)
    }
  )

print(plot_cbb_fitness_select)
```

```{r, include = FALSE, fig.width = 7.7, fig.height = 3.2}
svg("../figures/figure_fitness_cbb.svg", width = 7.7, height = 3.2)
print(plot_cbb_fitness_bee, position = c(0, 0.03, 0.4, 1), more = TRUE)
print(plot_cbb_fitness_select, position = c(0.36, 0, 1, 1))
dev.off()
```

## Fitness of phosphoglycolate salvage mutants

Phosphoglycolate salvage, also known as photorespiration, is the process of recycling the product of Rubisco's oxygenation reaction, 2-phoshoglycolate (2-PGly). Depending on the local CO2 concentration around Rubisco, the flux towards 2-PGly becomes higher with decreasing CO2 concentration. Typically, flux towards 2-PGly increases signifcantly between ... and ... ppm CO2 [Reference?]. The following plot shows fitness of the most important 2-PGly salvage KO mutants.

The key enzymes for the universal 'upper part' of 2-PGly salvage are 2-PGly-phosphatase (cbbZ2, cbbZP) and glycolate dehydrogenase GDH (GlcDEF, ). The glycerate pathway is the dominant route for the 'lower part' of 2-PGly salvage with four enzymes being strongly upregulated in low CO2: glyoxylate carboligase GLXCL (H16_A3598), hydroxypyruvate isomerase HPYRI (H16_A3599), tartronate semialdehyde reductase TRSARr (H16_A3600), and glycerate kinase GLYCK (H16_B0612). The first three of these enzymes are encoded in one operon.

```{r, fig.width = 7, fig.height = 5.0}
df_fitness %>% ungroup %>%
  left_join(select(df_ref, locus_tag, gene_name), by = "locus_tag") %>%
  filter(
    locus_tag %in% c("H16_B1387", "PHG419", "H16_A3094", "H16_A3096", "H16_A3097",
      "H16_A3598", "H16_A3599", "H16_A3600", "H16_B0612", "H16_A0640", "H16_A0641",
      "H16_A0642", "H16_A0644"),
    condition == "pulse") %>%
  mutate(gene_name = str_replace(gene_name, " cbbZ2", "")) %>%
  
  xyplot(norm_gene_fitness ~ factor(time) | factor(gene_name, rev(unique(gene_name))), .,
    groups = substrate, par.settings = custom.lattice(col = subs_col[c(1,3,5)]),
    type = c("p", "l"), xlab = "generations", ylab = "fitness",
    ylim = c(-7.9, 2.5), layout = c(5, 3), between = list(x = 0.5, y = 0.5),
    scales = list(alternating = FALSE), lwd = 2.5, as.table = TRUE,
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.errbars(x, y, ewidth = 0, ...)
      panel.key(..., corner = c(0.05, 0.05))
    }
  )
```


**Possible explanation: local CO2 concentration is high during formatotrophic growth**

Question: What is steady-state CO2/HCO3-concentration in medium?

Known cultivation parameters for highest growth rate:

- medium concentration: 1.5 g/L formate
- growth rate/dilution rate: 0.25 h^-1
- residual formate at µ = 0.25 h^-1: 0.25 g L^-1
- 25% of CO2 from formate is fixed, 75% is emitted (*C. necator* GEM)

Answer:

- formate consumption rate: Q_HCOO3 = (1.5 g L^-1 - 0.25 g L^-1) * 0.25 h^-1 = 0.31 g formate L^-1 h^-1
- estimation of CO2 production rate: P_CO2 = Q_HCOO3 * MW_HCOO3 = 0.31 g L^-1 h^-1 / 48 g mol^-1 = 6.5 mmol L^-1 h^-1
- estimation of CO2 consumption rate: Q_CO2 = P_CO2 * 0.25 =  1.625 mmol L^-1 h^-1
- final CO2 production rate: P_CO2 - Q_CO2 = 4.875 mmol L^-1 h^-1

Unfortunately it is not possible to determine the steady-state concentration from the CO2 emission rate. However we can compare this rate with the growth-limiting CO2 supply for autotrophic growth (ambient air, Claassens et al., 2020).
The authors used:

- 0.04% CO2 or 10% CO2, 6.25 L/min = 375 L/h
- 0.7L culture volume
- density of CO2 = 1.836 g/L at 25*C
- CO2 supply rate = 375 L h^-1 * 0.0004 / 0.7 L culture * 1.836 g L^-1 = 0.39 g L^-1 h^-1 = 8.5 mmol L^-1 h^-1


## Fitness of central metabolism genes

In order to obtain an overview about fitness contribution of central carbon metabolism enzymes, we can select the same list of enzymes (reactions) that was used for Figure 3 of the manuscript. In this figure, absolute protein abundance, relative change in protein abundance, utilization was compared. The names of enzymatic reactions are taken from the genome scale modeland have to be mapped to gene IDs.

```{r, message = FALSE}
df_ccm_enzymes <- list(
  `CBB cycle` = c("PGK", "TPI", "GAPD", "FBA", "FBP", "TKT1", "TKT2", "TAh", "RPE", "RPI", "PRUK", "RBPC", "FDH"),
  `Entner-Doudoroff pathway` = c("PGI", "G6PDH2r", "PGL", "EDD", "EDA"),
  `Pyruvate metabolism` = c("PGM", "ENO", "PYK", "PDH1", "PDH2", "PDH3", "PC", "PPC", "ME1", "ME2", "CS"),
  `TCA cycle` = c("ACONT1", "ACONT2", "ICDHx", "ICDHyr", "AKGDH", "SUCOAS", "SUCDi", "SUCD1", "FUM", "MDH", "MALS")
) %>% enframe("model_group_short", "reaction_id") %>% 
unnest(reaction_id)

df_model_reactions <- read_csv("../data/input/model_reactions.csv", col_types = cols()) %>%
  select(reaction_id, reaction_name, genes) %>% separate_rows(genes, sep = ", ") %>%
  rename(locus_tag = genes) %>%
  right_join(df_ccm_enzymes, by = "reaction_id") %>%
  # remove 1 gene from TCA which is also listed in PYR (pdh3 / dihydrolipoamide dehydrogenase)
  filter(!(locus_tag == "H16_A1377" & model_group_short == "TCA"))
```

The beeswarm plot shows that some genes are conditionally essential, albeit not many. Trends that are consistent between time points and growth regimes are:
- depletion of `fdsABDG` aka the soluble FDH in both formate conditions (described previously in detail)
- depletion of `eda`, `edd` and `pgl` in fructose (all consecutive reactions of the ED pw), to a lower extent in formate too?
- `ppc` (PEP carbooxylase) in formate, also predicted by FBA/RBA to carry major flux in formate)
- `pdhA` (pyruvate dehydrogenase) on fructose and succinate
- `maeA` malic enzyme
- nothing from TCA cycle


```{r, fig.width = 8.2, fig.height = 3.0}
plot_ccm_bee <- df_fitness_summary %>%
  inner_join(df_model_reactions, by = "locus_tag") %>%
  ungroup %>% filter(time == 8, condition != "pulse") %>%
  mutate(gene_name = stri_extract_first(gene_name, regex = "[a-zA-Z0-9]+")) %>%

  xyplot(norm_gene_fitness_median ~ factor(substrate) | model_group_short, .,
    groups = substrate, par.settings = custom.colorblind(col = subs_col[c(1,3,5)]),
    labels = .[["gene_name"]], xlab = "", ylab = "fitness (8 generations)", 
    ylim = c(-7.2, 1.8), as.table = TRUE, between = list(x = 0.5, y = 0.5),
    scales = list(alternating = FALSE), layout = c(4, 1),
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.abline(h = 0, lwd = 1.5, col = grey(0.7), lty = 2)
      panel.abline(h = -3, lwd = 1.5, col = grey(0.7), lty = 3)
      panel.text(3, -3.5, labels = "significant", col = grey(0.6), cex = 0.65)
      coords <- panel.beeswarm(x, y, bin_y = TRUE, breaks_y = seq(-10, 4, 0.4),
        return_coords = TRUE, ...)
      panel.directlabel(coords$x, coords$y, y_boundary = c(-10, -3), cex = 0.65,
        draw_box = TRUE, box_line = TRUE, box_scale = -0.01, ...)
    }
  )

print(plot_ccm_bee)
```

```{r, include = FALSE}
svg("../figures/figure_fitness_ccm.svg", width = 8.2, height = 3.0)
print(plot_ccm_bee)
dev.off()
```

Looking at the single genes as a time series shows the same picture. Some genes however show a sudden "leap" in fitness from 8 to 16 generations even after previous depletion. This is an artifact caused by normalization of fitness by the overall read distribution. These genes have 0 reads quantified but the algorithm still determines if 0 reads can be expected for genes with low fitness and attributes this fitness to them. However if the read distribution becomes extremely skewed towards few high abundant mutants, the fitness calculation becomes impossible for low-abundant mutants. We therefore focus on the 8 generation time point.

```{r, fig.width = 8, fig.height = 3.5}
# single time series per gene
plot_ccm_fitness_select <- df_fitness %>% ungroup %>%
  left_join(select(df_ref, locus_tag, gene_name), by = "locus_tag") %>%
  inner_join(df_model_reactions, by = "locus_tag") %>%
  filter(str_detect(gene_name, "fds|pgl|pgi|edd1|eda|ppc|pdh|mae")) %>%
  mutate(condition = paste0(substrate, " - ", toupper(substr(condition, 1, 1)))) %>%
  mutate(gene_name = stri_extract_first(gene_name, regex = "[a-zA-Z0-9]+")) %>%
  
  xyplot(norm_gene_fitness ~ factor(time) | gene_name, .,
    groups = condition, as.table = TRUE,
    par.settings = custom.colorblind(col = subs_col), type = c("p", "l"),
    xlab = "generations", ylab = "fitness",
    ylim = c(-8.5, 4), layout = c(7, 2), 
    between = list(x = 0.5, y = 0.5),
    auto.key = list(columns = 3, cex = 0.7),
    scales = list(alternating = FALSE), lwd = 2,
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.errbars(x, y, ewidth = 0, ...)
    }
  )

print(plot_ccm_fitness_select)
```

## Table with conditionally essential reactions in central metabolism

```{r}
df_ccm_essential <- df_model_reactions %>%
  filter(reaction_id %in% c("FDH", "PPC", "PGL", "EDD", "EDA", "PDH1", "ME1", "ME2")) %>%
  left_join(select(df_ref, locus_tag, gene_name), by = "locus_tag") %>%
  left_join(select(df_fitness_comp, matches("locus|continuous")), by = "locus_tag") %>%
  arrange(model_group_short, reaction_id, locus_tag) %>%
  mutate(across(matches("continuous_"), round, 1)) %>%
  mutate(gene_name = str_remove(gene_name, " ?H16\\_[AB0-9]+")) %>%
  rename_with(function(x) gsub("continuous_", "", x), matches("continuous_")) %>%
  rename(Pathway = model_group_short, Reaction = reaction_id,
    `Reaction name` = reaction_name, `Gene ID` = locus_tag, `Gene name` = gene_name) %>%
  select(c(4,1,2,3,5,6,7,8))

print(df_ccm_essential)
write_csv(df_ccm_essential, "../data/output/essentiality_ccm.csv")
```

# Export gene fitness data

Save an `Rdata` file with fitness data for all conditions that finally were used in the manuscript by Jahn et al., 2021.
This is used in the [ShinyLib](https://m-jahn.shinyapps.io/ShinyLib/) web app.

```{r}
Cupriavidus_BarSeq_2021 <- df_fitness %>%
  ungroup %>% select(-ID) %>%
  unite(condition, substrate, condition, sep = " - ") %>%
  mutate(fold_change = 2^log2FC) %>%
  rename(fitness_score = norm_gene_fitness, t_stat = t, timepoint = time) %>%
  left_join(select(df_ref, locus_tag, new_locus_tag, uniprot, gene_name, protein_name, System, 
    Process, Pathway), by = "locus_tag")

save(Cupriavidus_BarSeq_2021, file = "../../ShinyLib/data/Cupriavidus_BarSeq_2021.Rdata")
```

