412 lines
14 KiB
Plaintext
412 lines
14 KiB
Plaintext
---
|
|
title: "08.compare_mce_maier"
|
|
output: github_document
|
|
date: "2024-10-02"
|
|
author: Roberto Olayo Alarcon
|
|
---
|
|
|
|
In this file we prepare the final figures from the data analysed in `08.compare_mce_maier.ipynb`.
|
|
|
|
```{r setup, include=FALSE}
|
|
library(tidyverse)
|
|
library(ggrepel)
|
|
library(readxl)
|
|
```
|
|
|
|
## Prepare directories.
|
|
|
|
```{r prep.dirs}
|
|
|
|
OUTPUT_DIR <- "../data/08.compare_mce_maier/"
|
|
|
|
```
|
|
|
|
|
|
## Read data.
|
|
|
|
Here we read the UMAP coordinates.
|
|
```{r read.data}
|
|
|
|
# UMAP based on MolE representation
|
|
mole.umap <- read_tsv(file.path(OUTPUT_DIR, "mole_joint_umap.tsv.gz"),
|
|
show_col_types = FALSE)
|
|
|
|
# UMAP based on ECFP4 representation
|
|
ecfp4.umap <- read_tsv(file.path(OUTPUT_DIR, "ecfp4_joint_umap.tsv.gz"),
|
|
show_col_types = FALSE)
|
|
|
|
|
|
# Complete prediction set
|
|
completepreds <- read_excel("../data/04.new_predictions/mole_mce_predictions_litsearch.xlsx", sheet = "mole_prediction_overview")
|
|
|
|
# Literature search data.
|
|
litsearch.mole <- read_excel("../data/04.new_predictions/mole_mce_predictions_litsearch.xlsx", sheet = "mole_over10")
|
|
|
|
mce_litexamples <- litsearch.mole %>%
|
|
filter(!is.na(`Reported Activity`),
|
|
antibiotic == "not_abx") %>%
|
|
select(`Catalog Number`, `Reported Activity`) %>%
|
|
rename("chem_id" = "Catalog Number")
|
|
|
|
mce_litexamples %>% head()
|
|
```
|
|
|
|
## Plot UMAP.
|
|
|
|
Plot the UMAP, highlighting some chemicals that are predicted to be broad-spectrum and confirmed in the literature.
|
|
|
|
```{r chems.interest}
|
|
# Chemicals of interest
|
|
|
|
coi_df <- data.frame("chem_id" = c("HY-B0183", "HY-B0021", "HY-N0829", "HY-B0723", "HY-B2136" ),
|
|
"ProductName" = c("Ellagic acid", "Doxifluridine", "Shionone", "Ospemifene", "Tannic acid"))
|
|
|
|
coi_df
|
|
```
|
|
|
|
```{r}
|
|
|
|
mce_litexamples.umap <- mce_litexamples %>%
|
|
left_join(mole.umap, by="chem_id") %>%
|
|
mutate(`Chemical Library` = "MCE (Predictions with reported activity)")
|
|
```
|
|
|
|
|
|
```{r}
|
|
coi.mole <- coi_df %>%
|
|
left_join(mole.umap, by="chem_id")
|
|
```
|
|
|
|
```{r plot.umap.mole}
|
|
mole.umap <- mole.umap %>%
|
|
mutate(keep = case_when(`Chemical Library` == "Maier et.al." ~ TRUE,
|
|
`Chemical Library` == "MCE" & chem_id %in% completepreds$`Catalog Number` ~ TRUE,
|
|
TRUE ~ FALSE)) %>%
|
|
filter(keep)
|
|
|
|
|
|
|
|
u.mole <- ggplot(mole.umap, aes(x=umap1, y=umap2, fill=`Chemical Library`)) +
|
|
|
|
|
|
geom_point(size=2, color="white", shape=21, alpha=0.75) +
|
|
geom_point(data = mce_litexamples.umap, color="white", size=2.5, shape=21, alpha=0.75) +
|
|
|
|
|
|
geom_text_repel(data = coi.mole, aes(label=ProductName),
|
|
max.overlaps = Inf, size=3, min.segment.length = 0,
|
|
color="black", fontface="plain",
|
|
box.padding = 0.5,
|
|
nudge_x = if_else(coi.mole$ProductName %in% c("Ellagic acid", "Tannic acid", "Ospemifene"), 1, 0),
|
|
|
|
nudge_y = case_when(coi.mole$ProductName %in% c("Ospemifene", "Shionone") ~ 3,
|
|
coi.mole$ProductName %in% c("Doxifluridine") ~ -2,
|
|
coi.mole$ProductName %in% c("Tannic acid") ~ 2,
|
|
TRUE ~ 0)) +
|
|
|
|
|
|
scale_fill_manual(values=c("#DE1F84", "#C5C5C5", "#1F9DBB")) +
|
|
|
|
theme_bw() +
|
|
|
|
theme(legend.position = "bottom",
|
|
axis.ticks.x = element_blank(),
|
|
axis.text.x = element_blank(),
|
|
axis.ticks.y = element_blank(),
|
|
axis.text.y = element_blank(),
|
|
panel.background = element_rect(fill = "transparent", color=NA),
|
|
plot.background = element_rect(fill = "transparent", colour = NA)) +
|
|
|
|
labs(x="UMAP 1",
|
|
y="UMAP 2")
|
|
|
|
u.mole
|
|
ggsave(filename = file.path(OUTPUT_DIR, "mole_joint_umap.pdf"), plot=u.mole, dpi = 300,
|
|
height = 15, width = 21, units="cm")
|
|
|
|
ggsave(filename = file.path(OUTPUT_DIR, "mole_joint_umap.png"), plot=u.mole, dpi = 300,
|
|
height = 15, width = 21, units="cm")
|
|
|
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
mce_metadata <- completepreds %>%
|
|
select(`Catalog Number`, ProductName, nk_total, antibiotic) %>%
|
|
rename("chem_id" = "Catalog Number") %>%
|
|
|
|
left_join(mce_litexamples, by="chem_id")
|
|
|
|
|
|
mole.umap %>%
|
|
left_join(mce_metadata, by="chem_id") %>%
|
|
select(-keep) %>%
|
|
rename("n_predicted_inhibited_strains" = "nk_total") %>%
|
|
write_tsv("../data/08.compare_mce_maier/umap_library_comparison.tsv")
|
|
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
The same but for ECFP4.
|
|
```{r plot.umap.fps}
|
|
|
|
mce_litexamples.umap <- mce_litexamples %>%
|
|
left_join(ecfp4.umap, by="chem_id") %>%
|
|
mutate(`Chemical Library` = "MCE (Predictions with reported activity)")
|
|
|
|
coi.ecfp4 <- coi_df %>%
|
|
left_join(ecfp4.umap, by="chem_id")
|
|
|
|
|
|
|
|
u.ecfp4 <- ggplot(ecfp4.umap, aes(x=umap1, y=umap2, fill=`Chemical Library`)) +
|
|
geom_point(size=1.5, color="white", shape=21, alpha=0.75) +
|
|
geom_point(data = mce_litexamples.umap, color="white", size=2.5, shape=21, alpha=0.75) +
|
|
|
|
|
|
geom_text_repel(data = coi.ecfp4, aes(label=ProductName),
|
|
max.overlaps = Inf, size=3, min.segment.length = 0,
|
|
color="black", fontface="plain",
|
|
box.padding = 0.5,
|
|
nudge_y = if_else(coi.mole$ProductName %in% c("Ospemifene", "Tannic acid"), -5, 0),
|
|
nudge_x = if_else(coi.mole$ProductName %in% c("Doxifluridine", "Ellagic acid"), -3, 0)) +
|
|
|
|
|
|
scale_fill_manual(values=c("#DE1F84", "#C5C5C5","#1F9DBB")) +
|
|
|
|
theme_bw() +
|
|
|
|
theme(axis.ticks.x = element_blank(),
|
|
axis.text.x = element_blank(),
|
|
axis.ticks.y = element_blank(),
|
|
axis.text.y = element_blank(),
|
|
panel.background = element_rect(fill = "transparent", color=NA),
|
|
plot.background = element_rect(fill = "transparent", colour = NA)) +
|
|
|
|
labs(x="UMAP 1",
|
|
y="UMAP 2")
|
|
|
|
u.ecfp4
|
|
ggsave(filename = file.path(OUTPUT_DIR, "ecfp4_joint_umap.pdf"), plot=u.ecfp4, dpi = 300,
|
|
height = 13, width = 19, units="cm")
|
|
|
|
|
|
```
|
|
|
|
## Comparing MolE and ECFP4
|
|
|
|
```{r panel.lib}
|
|
library(ggpubr)
|
|
```
|
|
|
|
|
|
```{r panel}
|
|
|
|
umap.arranged <- ggarrange(u.mole + ggtitle("MolE"),
|
|
u.ecfp4 + ggtitle("ECFP4"),
|
|
common.legend = TRUE,
|
|
legend="bottom")
|
|
|
|
umap.arranged
|
|
ggsave(filename = file.path(OUTPUT_DIR, "joint_umap_panel.pdf"), plot=umap.arranged, dpi = 300,
|
|
height = 11, width = 20, units="cm")
|
|
```
|
|
|
|
## Comparing molecular similarity.
|
|
|
|
```{r read.distance.qs}
|
|
|
|
# Read molecular similarity
|
|
mol.similarity <- read_tsv("../data/08.compare_mce_maier/ecfp4_distance_to_maier.tsv.gz", show_col_types = FALSE) %>%
|
|
# Make tidy form to ease handling
|
|
pivot_longer(cols = -selected_chem, names_to = "prestwick_ID", values_to = "jaccard_distance") %>%
|
|
|
|
# Get tanimoto similarity
|
|
mutate(tanimoto_similarity = 1-jaccard_distance) %>%
|
|
|
|
# Gather the ranking per molecule. Make sure to rank ties the same
|
|
group_by(selected_chem) %>%
|
|
mutate(similarity_ranking = rank(-tanimoto_similarity, ties.method = "min"))
|
|
|
|
# Read prestwick metadata
|
|
maier_chems_metadata = read_excel("../raw_data/maier_microbiome/chem_library_info_SF1.xlsx") %>%
|
|
select(prestwick_ID, `chemical name`, `target species`) %>%
|
|
rename("prestwick_chemname" = "chemical name",
|
|
"target_species" = "target species")
|
|
|
|
|
|
# Read screening results
|
|
maier_screening_results <- read_tsv("../data/01.prepare_training_data/maier_screening_results.tsv.gz", show_col_types = FALSE) %>%
|
|
# Gather chemicals used in training
|
|
filter(prestwick_ID %in% unique(mol.similarity$prestwick_ID)) %>%
|
|
|
|
# Count number of strains inhibited
|
|
pivot_longer(cols = -prestwick_ID, names_to = "strain", values_to = "growth_inhibition") %>%
|
|
group_by(prestwick_ID) %>%
|
|
summarise(n_inhibited_strains = sum(growth_inhibition)) %>%
|
|
ungroup()
|
|
|
|
|
|
# Join information
|
|
mol.similarity <- mol.similarity %>%
|
|
left_join(maier_chems_metadata, by="prestwick_ID") %>%
|
|
left_join(maier_screening_results, by="prestwick_ID")
|
|
|
|
```
|
|
|
|
|
|
Determine the most similar molecule percategory
|
|
```{r}
|
|
# Overall
|
|
overall.most_similar <- mol.similarity %>%
|
|
filter(similarity_ranking == 1) %>%
|
|
# Thymidine has two molecules with equal ranking
|
|
head(6) %>%
|
|
|
|
mutate(sim_category = "Overall")
|
|
|
|
# Most similar antibiotic
|
|
abx.most_similar <- mol.similarity %>%
|
|
filter(target_species == "bacteria") %>%
|
|
group_by(selected_chem) %>%
|
|
filter(similarity_ranking == min(similarity_ranking)) %>%
|
|
mutate(sim_category = "Antibiotic")
|
|
|
|
# Most similar non-antibiotic with broad spectrum activity
|
|
nonabx.bsa.most_similar <- mol.similarity %>%
|
|
filter(target_species != "bacteria",
|
|
n_inhibited_strains >= 10) %>%
|
|
group_by(selected_chem) %>%
|
|
filter(similarity_ranking == min(similarity_ranking)) %>%
|
|
mutate(sim_category = "Non-antibiotic with BSA")
|
|
|
|
|
|
most_similar.df <- bind_rows(overall.most_similar, abx.most_similar, nonabx.bsa.most_similar) %>%
|
|
select(selected_chem, prestwick_ID, tanimoto_similarity, similarity_ranking, target_species, n_inhibited_strains, sim_category) %>%
|
|
|
|
# For some compounds, the same molecule is the most similar in more than one category. Let's choose the more specific annotation.
|
|
group_by(selected_chem, prestwick_ID) %>%
|
|
slice_tail() %>%
|
|
|
|
# Prepare labelling
|
|
mutate(tansim_label = round(tanimoto_similarity, 2),
|
|
sim_category = factor(sim_category, levels=c("Overall", "Antibiotic", "Non-antibiotic with BSA")))
|
|
|
|
|
|
most_similar.df
|
|
```
|
|
|
|
|
|
```{r}
|
|
similarity.chosen <- ggplot(mol.similarity, aes(x=similarity_ranking, y=tanimoto_similarity)) +
|
|
geom_hline(yintercept = 0.3, linetype="longdash") +
|
|
|
|
geom_point(size=1.8, color="white", shape=21, alpha=1, fill="#C5C5C5") +
|
|
geom_point(data=most_similar.df, aes(fill=sim_category, shape=sim_category), size=2,
|
|
color="black", shape=21, alpha=0.75) +
|
|
|
|
scale_fill_manual(values=c("#377eb8", "#e41a1c", "#1b9e77")) +
|
|
|
|
|
|
geom_text_repel(data = most_similar.df, aes(label=tansim_label),
|
|
max.overlaps = Inf, size=3.5,
|
|
min.segment.length = 0,
|
|
color="black",
|
|
fontface="plain",
|
|
point.padding = 0.2,
|
|
box.padding = 0.2,
|
|
segment.size=0.3,
|
|
|
|
nudge_y = case_when(most_similar.df$selected_chem == "Cetrorelix" & most_similar.df$sim_category == "Non-antibiotic with BSA" ~ -0.125,
|
|
most_similar.df$selected_chem == "Cetrorelix" & most_similar.df$sim_category == "Overall" ~ 0.125,
|
|
most_similar.df$selected_chem == "Cetrorelix" & most_similar.df$sim_category == "Antibiotic" ~ -0.05,
|
|
|
|
most_similar.df$selected_chem == "Elvitegravir" & most_similar.df$sim_category == "Non-antibiotic with BSA" ~ -0.15,
|
|
most_similar.df$selected_chem == "Elvitegravir" & most_similar.df$sim_category == "Antibiotic" ~ 0.1255,
|
|
|
|
most_similar.df$selected_chem == "Opicapone" & most_similar.df$sim_category == "Antibiotic" ~ 0.03,
|
|
most_similar.df$selected_chem == "Opicapone" & most_similar.df$sim_category == "Non-antibiotic with BSA" ~ 0.15,
|
|
|
|
most_similar.df$selected_chem == "Visomitin" & most_similar.df$sim_category == "Overall" ~ 0.15,
|
|
most_similar.df$selected_chem == "Visomitin" & most_similar.df$sim_category == "Antibiotic" ~ -0.125,
|
|
|
|
most_similar.df$selected_chem == "Ebastine" & most_similar.df$sim_category == "Antibiotic" ~ -0.15,
|
|
most_similar.df$selected_chem == "Ebastine" & most_similar.df$sim_category == "Overall" ~ 0.125,
|
|
|
|
TRUE ~ 0),
|
|
|
|
nudge_x = case_when(most_similar.df$selected_chem == "Ebastine" & most_similar.df$sim_category == "Non-antibiotic with BSA" ~ 125,
|
|
most_similar.df$selected_chem == "Ebastine" & most_similar.df$sim_category == "Overall" ~ 125,
|
|
|
|
most_similar.df$selected_chem == "Thymidine" & most_similar.df$sim_category == "Non-antibiotic with BSA" ~ 150,
|
|
most_similar.df$selected_chem == "Thymidine" & most_similar.df$sim_category == "Antibiotic" ~ 150,
|
|
|
|
most_similar.df$selected_chem == "Opicapone" & most_similar.df$sim_category == "Antibiotic" ~ 150,
|
|
|
|
most_similar.df$selected_chem == "Elvitegravir" & most_similar.df$sim_category == "Antibiotic" ~ 100,
|
|
|
|
most_similar.df$selected_chem == "Visomitin" & most_similar.df$sim_category == "Non-antibiotic with BSA" ~ 180,
|
|
|
|
most_similar.df$selected_chem == "Cetrorelix" & most_similar.df$sim_category == "Antibiotic" ~ 125,
|
|
TRUE ~ 0)) +
|
|
|
|
|
|
facet_wrap(~selected_chem) +
|
|
|
|
theme_bw() +
|
|
theme(legend.position = "bottom",
|
|
axis.text.x = element_text(size=6)) +
|
|
labs(x="Ranking of training set compounds",
|
|
y="Tanimoto similarity",
|
|
fill = "Similarity category")
|
|
similarity.chosen
|
|
|
|
ggsave(plot=similarity.chosen, filename = "../data/08.compare_mce_maier/similarity_to_maier.png",
|
|
dpi = 300, width = 21, height = 12.5, units="cm")
|
|
|
|
ggsave(plot=similarity.chosen, filename = "../data/08.compare_mce_maier/similarity_to_maier.pdf",
|
|
dpi = 300, width = 21, height = 12.5, units="cm")
|
|
```
|
|
|
|
|
|
|
|
```{r}
|
|
most_similar_metadata <- most_similar.df %>%
|
|
unite("selected_prest", c(selected_chem, prestwick_ID), sep=":") %>%
|
|
select(selected_prest, sim_category)
|
|
```
|
|
|
|
|
|
|
|
|
|
```{r}
|
|
|
|
mol.similarity %>%
|
|
unite("selected_prest", c(selected_chem, prestwick_ID), sep=":", remove=FALSE) %>%
|
|
|
|
left_join(most_similar_metadata, by="selected_prest") %>%
|
|
select(-selected_prest) %>%
|
|
|
|
arrange(selected_chem, similarity_ranking) %>%
|
|
|
|
rename("measured_n_inhibited_strains" = "n_inhibited_strains",
|
|
"most_similar_category" = "sim_category") %>%
|
|
write_tsv("../data/08.compare_mce_maier/selected_prestwick_tanimoto.tsv")
|
|
|
|
|
|
|
|
```
|
|
|
|
## Session Info:
|
|
|
|
```{r}
|
|
sessionInfo()
|
|
```
|
|
|