--- title: "Analysing MCE predictions" author: "Roberto Olayo Alarcon" date: "27/03/2024" output: github_document --- In this file we analyse the predictions made in the previous step. At this point a literature search has been performed in order to validate predictions ```{r setup, message=FALSE} library(tidyverse) library(ggrepel) library(readxl) library(uwot) ``` ## Prepare directories. ```{r prep.directories} PREDICTIONS_DIR <- "../data/04.new_predictions" OUTPUT_DIR <- "../data/05.analyze_mce_predictions" # Create output dir if(!dir.exists(OUTPUT_DIR)){ dir.create(OUTPUT_DIR) } ``` ## Library Representation. In this section, we perform dimensionality reduction on the MolE representation of the MedChemExpress library for which we make predictions. ```{r read.mole} # Read Prediction Output mole_predictions <- read_excel(file.path(PREDICTIONS_DIR, "mole_mce_predictions_litsearch.xlsx"), sheet = "mole_prediction_overview") mole_predictions.over10 <- read_excel(file.path(PREDICTIONS_DIR, "mole_mce_predictions_litsearch.xlsx"), sheet = "mole_over10") %>% mutate(`Reported Activity` = if_else(is.na(`Reported Activity`), "None", `Reported Activity`)) # Read the representation mole_representation <- vroom::vroom(file.path(PREDICTIONS_DIR, "MolE_representation_medchemexpress.tsv.gz"), show_col_types = FALSE) %>% # Rename id column rename("Catalog Number" = "...1") %>% # Filter for the molecules for which we make predictions filter(`Catalog Number` %in% mole_predictions$`Catalog Number`) %>% # Set column to row.names column_to_rownames("Catalog Number") ``` ```{r read.fps} # Read Prediction Output ecfp4_predictions <- read_excel(file.path(PREDICTIONS_DIR, "ecfp4_mce_predictions_litsearch.xlsx"), sheet = "ecfp4_prediction_overview") ecfp4_predictions.over10 <- read_excel(file.path(PREDICTIONS_DIR, "ecfp4_mce_predictions_litsearch.xlsx"), sheet = "ecfp4_over10") %>% mutate(`Reported Activity` = if_else(is.na(`Reported Activity`), "None", `Reported Activity`)) # Read the representation ecfp4_representation <- vroom::vroom(file.path(PREDICTIONS_DIR, "ecfp4_representation_medchemexpress.tsv.gz"), show_col_types = FALSE) %>% # Rename id column rename("Catalog Number" = "...1") %>% # Filter for the molecules for which we make predictions filter(`Catalog Number` %in% mole_predictions$`Catalog Number`) %>% # Set column to row.names column_to_rownames("Catalog Number") ``` ### Preprocess representation. Remove constant and correlated features ```{r preprocess.func} remove_constant <- function(m.df, var.cutoff){ #' Removes constant columns from a data frame based on a variance threshold. #' #' @param m.df A data frame. Input data frame. #' @param var.cutoff A numeric. Variance threshold. Columns with variance below this threshold will be removed. #' #' @return A data frame. Data frame with constant columns removed. #' #' @examples #' remove_constant(m.df = my_data_frame, var.cutoff = 0.01) #' #' The function calculates the variance of each dimension in the input data frame. #' It then identifies columns with variance above the specified threshold and removes constant columns. #' The resulting data frame without constant columns is returned. #' # Variance of each dimension var.feats <- apply(m.df, 2, var) # Keep columns above variance thresh keep_cols <- names(var.feats[var.feats > var.cutoff]) # Filter df m.var.df <- m.df %>% select(all_of(as.character(keep_cols))) return(m.var.df) } remove_corfeats <- function(m.df, cor.thresh){ #' Removes correlated features from a data frame based on a correlation threshold. #' #' @param m.df A data frame. Input data frame. #' @param cor.thresh A numeric. Correlation threshold. #' #' @return A data frame. Data frame with correlated features removed. #' #' @examples #' remove_corfeats(m.df = my_data_frame, cor.thresh = 0.7) #' #' The function calculates the correlation matrix of the input data frame. #' It then identifies correlated columns based on the correlation threshold. #' Columns with correlations above the threshold are removed. #' The resulting data frame without correlated features is returned. #' # Correlation Matrix cor.mat <- cor(m.df) # Format triangle cor.mat[lower.tri(cor.mat, diag = TRUE)] <- 0 # Find correlated columns corr.descision <- apply(cor.mat, 2, function(x){ifelse(any(x >= cor.thresh), "remove", "remain")}) # Keep columns keep_cols <- names(corr.descision[corr.descision == "remain"]) # Filter df m.uncor <- m.df %>% select(all_of(keep_cols)) return(m.uncor) } preprocess_mole <- function(mole_df, min.var, corr.thresh){ #' Preprocesses molecular data by filtering out constant columns and correlated features. #' #' @param mole_df A data frame. Input molecular data frame. #' @param min.var A numeric. Variance threshold for removing constant columns. #' @param corr.thresh A numeric. Correlation threshold for removing correlated features. #' #' @return A data frame. Preprocessed molecular data frame. #' #' @examples #' preprocess_mole(mole_df = my_molecular_data_frame, min.var = 0.01, corr.thresh = 0.7) #' #' The function preprocesses molecular data by first removing constant columns using the specified variance threshold. #' Then, it removes correlated features based on the correlation threshold. #' The resulting data frame is the preprocessed molecular data with constant columns and correlated features removed. #' # Filter variance mole.variable <- remove_constant(mole_df, min.var) # Filter correlated mole.decorrelated <- remove_corfeats(mole.variable, corr.thresh) return(mole.decorrelated) } # Prepare data mole_prepared <- preprocess_mole(mole_representation, min.var=0.01, corr.thresh=0.90) mole_prepared %>% dim() ``` Similar procedure for ECFP4 representation ```{r preprocess.fps} hamming <- function(X) { D <- (1 - X) %*% t(X) HamDist <- D + t(D) return(HamDist / ncol(X)) } remove_rare <- function(f.df, min_chems){ # Removes columns where a feature is not present in the majority of compounds # # Parameters # ---------- # - fdf: pandas dataframe. Columns are bits of the fingerprint. Rows are compounds # - mc: int. The minimum number of molecules a feature should be present in order for it to be preserved # Number of chemicals a feature is present in n.freq <- colSums(f.df) names(n.freq) <- colnames(f.df) # Filter popular features common.features <- n.freq[n.freq >= min_chems] # Alter df f.common <- f.df %>% select(all_of(names(common.features))) return(f.common) } remove_correlated <- function(f.df, min_dist){ # Removes highly similar features # # Parameters # ---------- # - fdf: pandas dataframe. Dataframe where columns are bits in the fingerprint # - mdist: float. Is minimal distance between features in order to be different enough # # Returns # ------- # pandas dataframe # Highly correlated features are removed (one of them is preserved) # Hamming distance dist.matrix <- hamming(t(f.df)) # Format dist.matrix[lower.tri(dist.matrix, diag = TRUE)] <- 1 # Find correlated columns corr.descision <- apply(dist.matrix, 2, function(x){ifelse(any(x <= min_dist), "remove", "remain")}) # Only keep un-correlated feats keep_cols <- names(corr.descision[corr.descision == "remain"]) # Filter df f.uncorr <- f.df %>% select(all_of(as.character(keep_cols))) return(f.uncorr)} preprocess_fps <- function(fps_df, min_compounds, sim_threshold){ # Processes a fingerprint dataframe. Removes rare and highly correlated features # # Parameters # ---------- # - fps_df: pandas dataframe. Dataframe where columns are bits in the fingerprint # - min_compounds: int. The minimum number of molecules a feature should be present in order for it to be preserved # - sim_threshold: float. Is minimal distance between features in order to be different enough fps_common <- remove_rare(fps_df, min_compounds) fps.decorr <- remove_correlated(fps_common, sim_threshold) return(fps.decorr) } ecfp4_prepared <- preprocess_fps(ecfp4_representation, 10, 0.01) ecfp4_prepared %>% dim() ``` ### Peform UMAP reduction. ```{r mole.umap} set.seed(42) mole.umap <- umap(X=mole_prepared, n_neighbors = 25, n_components = 2, min_dist = 0.35, n_threads = 20, metric="cosine") mole.umap <- as.data.frame(mole.umap) %>% rownames_to_column("catalog_number") ``` ```{r mole.umap.plot} mole.umap %>% ggplot(aes(x=V1, y=V2)) + geom_point(alpha=0.9, color="white", fill="#C5C5C5", size=2, shape=21, stroke=0.2) + theme_classic() + labs(x="UMAP 1", y = "UMAP 2", title = "MolE representation of MCE") ``` ```{r ecfp4.umap} # Jaccard distance jac.dist <- dist(ecfp4_prepared, method = "binary") set.seed(1234) ecfp4.umap <- umap(X=jac.dist, n_neighbors = 20, n_components = 2, min_dist = 0.3, n_threads = 20) ecfp4.umap <- as.data.frame(ecfp4.umap) %>% rownames_to_column("catalog_number") ``` ```{r ecfp4.umap.plot} ecfp4.umap %>% ggplot(aes(x=V1, y=V2)) + geom_point(alpha=0.9, color="white", fill="#C5C5C5", size=2, shape=21, stroke=0.2) + theme_classic() + labs(x="UMAP 1", y = "UMAP 2", title = "ECFP4 representation of MCE") ``` ### Plotting predicted broad-spectrum. ```{r mole.umap.over10} # Add information of broad-spectrum activity mole.umap <- mole.umap %>% mutate(pred_activity = if_else(catalog_number %in% mole_predictions.over10$`Catalog Number`, "Antimicrobial", "Not Antimicrobial")) # Plot m.umap.over10 <- ggplot(mole.umap, aes(x=V1, y=V2)) + geom_point(data=subset(mole.umap, pred_activity == "Not Antimicrobial"), aes(color="Not Antimicrobial"), size=1.5) + geom_point(data=subset(mole.umap, pred_activity == "Antimicrobial"), aes(color="Antimicrobial"), size=1.5) + scale_color_manual(values=c("Antimicrobial" = alpha("#DE1F84", 0.7), "Not Antimicrobial" = alpha("#C5C5C5", 0.7))) + theme_void() + #coord_fixed(ratio = 0.4) + theme(legend.position = "bottom", axis.ticks.x = element_blank(), axis.text.x = element_blank(), axis.ticks.y = element_blank(), axis.text.y = element_blank(), text=element_text(size=10), panel.background = element_rect(fill = "transparent", color=NA), plot.background = element_rect(fill = "transparent", colour = NA)) + labs(x="UMAP 1", y="UMAP 2", color="Predicted Activity") m.umap.over10 ``` ```{r annotating.litchems.mole} chems_interst <- mole_predictions.over10 %>% filter(ProductName %in% c("Ospemifene", "Shionone", "Bekanamycin", "Doxifluridine", "Ellagic acid")) %>% select(`Catalog Number`, ProductName) %>% rename("catalog_number" = "Catalog Number") chems_interest_umap <- chems_interst %>% left_join(mole.umap, by="catalog_number") # Plot m.umap.over10.annot <- ggplot(mole.umap, aes(x=V1, y=V2)) + geom_point(data=subset(mole.umap, pred_activity == "Not Antimicrobial"), aes(color="Not Antimicrobial"), size=1.5) + geom_point(data=subset(mole.umap, pred_activity == "Antimicrobial"), aes(color="Antimicrobial"), size=1.5) + geom_text_repel(data = chems_interest_umap, aes(label=ProductName), max.overlaps = Inf, size=2, min.segment.length = 0, color="black", box.padding = 0.7, point.padding = 0.2, fontface="bold") + scale_color_manual(values=c("Antimicrobial" = alpha("#DE1F84", 0.7), "Not Antimicrobial" = alpha("#C5C5C5", 0.7))) + theme_void() + #coord_fixed(ratio = 0.4) + theme(legend.position = "bottom", axis.ticks.x = element_blank(), axis.text.x = element_blank(), axis.ticks.y = element_blank(), axis.text.y = element_blank(), text=element_text(size=10), panel.background = element_rect(fill = "transparent", color=NA), plot.background = element_rect(fill = "transparent", colour = NA)) + labs(x="UMAP 1", y="UMAP 2", color="Predicted Activity") m.umap.over10.annot ``` ```{r} meta_info <- mole_predictions %>% select(`Catalog Number`, nk_total, ProductName) %>% rename("catalog_number" = "Catalog Number") mole.umap %>% left_join(meta_info, by="catalog_number") %>% rename("umap1" = "V1", "umap2" = "V2", "pred_n_inhibited_total" = "nk_total") %>% write_tsv("../data/05.analyze_mce_predictions/umap_mce_predictions.tsv") ``` ```{r ecfp4.umap.over10} # Add information of broad-spectrum activity ecfp4.umap <- ecfp4.umap %>% mutate(pred_activity = if_else(catalog_number %in% ecfp4_predictions.over10$`Catalog Number`, "Antimicrobial", "Not Antimicrobial")) # Plot e.umap.over10 <- ggplot(ecfp4.umap, aes(x=V1, y=V2)) + geom_point(data=subset(ecfp4.umap, pred_activity == "Not Antimicrobial"), aes(color="Not Antimicrobial"), size=1.5) + geom_point(data=subset(ecfp4.umap, pred_activity == "Antimicrobial"), aes(color="Antimicrobial"), size=1.5) + scale_color_manual(values=c("Antimicrobial" = alpha("#DE1F84", 0.7), "Not Antimicrobial" = alpha("#C5C5C5", 0.7)), labels=c("Broad Spectrum", "Not Broad Spectrum")) + theme_void() + #coord_fixed(ratio = 0.4) + theme(legend.position = "bottom", axis.ticks.x = element_blank(), axis.text.x = element_blank(), axis.ticks.y = element_blank(), axis.text.y = element_blank(), text=element_text(size=10), panel.background = element_rect(fill = "transparent", color=NA), plot.background = element_rect(fill = "transparent", colour = NA)) + labs(x="UMAP 1", y="UMAP 2", color="Predicted Activity") e.umap.over10 ``` ```{r annotating.litchems.fps} chems_interst <- mole_predictions.over10 %>% filter(ProductName %in% c("Ospemifene", "Shionone", "Bekanamycin", "Doxifluridine", "Ellagic acid")) %>% select(`Catalog Number`, ProductName) %>% rename("catalog_number" = "Catalog Number") chems_interest_umap <- chems_interst %>% left_join(ecfp4.umap, by="catalog_number") e.umap.over10.annot <- ggplot(ecfp4.umap, aes(x=V1, y=V2)) + geom_point(data=subset(ecfp4.umap, pred_activity == "Not Antimicrobial"), aes(color="Not Antimicrobial"), size=1.5) + geom_point(data=subset(ecfp4.umap, pred_activity == "Antimicrobial"), aes(color="Antimicrobial"), size=1.5) + geom_text_repel(data = chems_interest_umap, aes(label=ProductName), max.overlaps = Inf, size=2, min.segment.length = 0, color="black", box.padding = 0.7, point.padding = 0.2, fontface="bold") + scale_color_manual(values=c("Antimicrobial" = alpha("#DE1F84", 0.7), "Not Antimicrobial" = alpha("#C5C5C5", 0.7)), labels=c("Broad Spectrum", "Not Broad Spectrum")) + theme_void() + #coord_fixed(ratio = 0.4) + theme(legend.position = "bottom", axis.ticks.x = element_blank(), axis.text.x = element_blank(), axis.ticks.y = element_blank(), axis.text.y = element_blank(), text=element_text(size=10), panel.background = element_rect(fill = "transparent", color=NA), plot.background = element_rect(fill = "transparent", colour = NA)) + labs(x="UMAP 1", y="UMAP 2", color="Predicted Activity") e.umap.over10.annot ggsave(plot=e.umap.over10.annot, filename = "../data/05.analyze_mce_predictions/ecfp4_mce_umap.pdf", width = 21, height = 15, units="cm", dpi=300) ``` ```{r} meta_info <- ecfp4_predictions %>% select(`Catalog Number`, nk_total, ProductName) %>% rename("catalog_number" = "Catalog Number") ecfp4.umap %>% left_join(meta_info, by="catalog_number") %>% rename("umap1" = "V1", "umap2" = "V2", "pred_n_inhibited_total" = "nk_total") %>% write_tsv("../data/05.analyze_mce_predictions/umap_mce_predictions_ecfp4.tsv") ``` ## Ranking molecule predictions. Here we will create plots that compare the antimicrobial activity and the number of predicted inhibited strains. ```{r plot.ap.vs.nk} # Format labels scatter.nkill.apscore <- mole_predictions %>% mutate(antibiotic = if_else(antibiotic == "abx", "Antibiotic", "Non-Antibiotic")) %>% # Create plots ggplot(aes(x=nk_total, y=apscore_total, color=antibiotic)) + geom_point(alpha=0.5, size=1) + scale_color_manual(breaks = c("Antibiotic", "Non-Antibiotic"), values=c("red", "#C5C5C5")) + geom_vline(xintercept = 10, linetype="longdash") + theme_light() + labs(x="Predicted number of inhibited strains", y = "Antimicrobial Potential", color="Compound Class") + theme(legend.position = c(0.7, 0.3), text=element_text(size=10), legend.text = element_text(size=8), legend.title = element_text(size=8)) score.vs.nkill.marginal <- ggExtra::ggMarginal(scatter.nkill.apscore, type="boxplot", groupColour = TRUE, groupFill = TRUE) score.vs.nkill.marginal ``` ```{r plot.ap.vs.nk.e} # Format labels scatter.nkill.apscore.e <- ecfp4_predictions %>% mutate(antibiotic = if_else(antibiotic == "abx", "Antibiotic", "Non-Antibiotic")) %>% # Create plots ggplot(aes(x=nk_total, y=apscore_total, color=antibiotic)) + geom_point(alpha=0.5, size=1) + scale_color_manual(breaks = c("Antibiotic", "Non-Antibiotic"), values=c("red", "#C5C5C5")) + geom_vline(xintercept = 10, linetype="longdash") + theme_light() + labs(x="Predicted number of inhibited strains", y = "Antimicrobial Potential", color="Compound Class") + theme(legend.position = c(0.7, 0.3), text=element_text(size=10), legend.text = element_text(size=8), legend.title = element_text(size=8)) score.vs.nkill.marginal.e <- ggExtra::ggMarginal(scatter.nkill.apscore.e, type="boxplot", groupColour = TRUE, groupFill = TRUE) score.vs.nkill.marginal.e ``` ## Comparing Gram stains. Here we compare the antimicrobial potential scores for gram negatives and gram positives. ```{r gneg.vs.gpos.labels} # Selected chemicals selected_chems <- c("Visomitin", "Ebastine", "Opicapone", "Cetrorelix (Acetate)", 'Thymidine', "Elvitegravir") # Uridine derivatives uridin.deriv <- c("Uridine", "Uridine 5'-monophosphate", "5-Methyluridine", "2'-Deoxyuridine", "Doxifluridine") # Other interests # Other interests other.interest <- c("Tannic acid", "Teniposide", "Mequitazine", "Dimetridazole", "Azelnipidine", "Luliconazole", "Bergenin") # Format Product Names selected.chemicals.data <- mole_predictions.over10 %>% mutate(ProductName = if_else(ProductName %in% selected_chems, ProductName, ""), ProductName = if_else(ProductName == "Cetrorelix (Acetate)", "Cetrorelix", ProductName)) uridine.deriv.data <- mole_predictions.over10 %>% mutate(ProductName = if_else(ProductName %in% uridin.deriv, ProductName, "")) other.interest.data <- mole_predictions.over10 %>% mutate(ProductName = if_else(ProductName %in% other.interest, ProductName, "")) # Plot predBS.mole <- mole_predictions.over10 %>% # Only non-antibiotics filter(antibiotic == "not_abx") %>% mutate(`Reported Activity` = if_else(`Reported Activity` %in% c("Antiplasmodium", "Insecticide"), "Antiparasitic", `Reported Activity`)) %>% ggplot(aes(x=apscore_gnegative, y=apscore_gpositive, color=`Reported Activity`)) + # Basic aes geom_point(size=1) + geom_abline(linetype="longdash", alpha=0.25) + # Add names geom_text_repel(data = selected.chemicals.data, aes(x=apscore_gnegative, y=apscore_gpositive, label=ProductName), max.overlaps = Inf, size=2, min.segment.length = 0, color="black", box.padding = 0.7, point.padding = 0.2, fontface="bold", nudge_x = if_else(selected.chemicals.data$ProductName %in% c("Cetrorelix", "Ebastine"), -1, 0)) + geom_text_repel(data = uridine.deriv.data, aes(x=apscore_gnegative, y=apscore_gpositive, label=ProductName), max.overlaps = Inf, size=1.8, min.segment.length = 0, color="black", box.padding = 0.7, point.padding = 0.2, nudge_x = if_else(uridine.deriv.data$ProductName %in% c("Uridine"), -1, 0)) + geom_text_repel(data = other.interest.data, aes(x=apscore_gnegative, y=apscore_gpositive, label=ProductName), max.overlaps = Inf, size=1.8, min.segment.length = 0, color="black", box.padding = 0.7, point.padding = 0.2) + # Color by reported activity scale_color_manual(breaks = c("Antibacterial", "Antiviral", "Antifungal", "Antiparasitic", "None"), values = c("#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#C5C5C5")) + theme_light() + theme(legend.position = "bottom", text = element_text(size=10)) + labs(x = latex2exp::TeX("Antimicrobial Potential $G^-$"), y = latex2exp::TeX("Antimicrobial Potential $G^+$")) predBS.mole ``` ```{r gneg.vs.gpos.labels.e} # Selected chemicals selected_chems <- c("Visomitin", "Ebastine", "Opicapone", "Cetrorelix (Acetate)", 'Thymidine', "Elvitegravir") # Uridine derivatives uridin.deriv <- c("Uridine", "Uridine 5'-monophosphate", "5-Methyluridine", "2'-Deoxyuridine", "Doxifluridine") # Other interests other.interest <- c("Tannic acid", "Teniposide", "Dimetridazole", "Azelnipidine", "Luliconazole", "Bergenin") # Format Product Names selected.chemicals.data <- ecfp4_predictions.over10 %>% mutate(ProductName = if_else(ProductName %in% selected_chems, ProductName, ""), ProductName = if_else(ProductName == "Cetrorelix (Acetate)", "Cetrorelix", ProductName)) uridine.deriv.data <- ecfp4_predictions.over10 %>% mutate(ProductName = if_else(ProductName %in% uridin.deriv, ProductName, "")) other.interest.data <- ecfp4_predictions.over10 %>% mutate(ProductName = if_else(ProductName %in% other.interest, ProductName, "")) # Plot predBS.fps <- ecfp4_predictions.over10 %>% # Only non-antibiotics filter(antibiotic == "not_abx") %>% mutate(`Reported Activity` = if_else(`Reported Activity` %in% c("Antiplasmodium", "Insecticide"), "Antiparasitic", `Reported Activity`)) %>% ggplot(aes(x=apscore_gnegative, y=apscore_gpositive, color=`Reported Activity`)) + # Basic aes geom_point(size=1) + geom_abline(linetype="longdash", alpha=0.25) + # Add names geom_text_repel(data = selected.chemicals.data, aes(x=apscore_gnegative, y=apscore_gpositive, label=ProductName), max.overlaps = Inf, size=2, min.segment.length = 0, color="black", box.padding = 0.7, point.padding = 0.2, fontface="bold", nudge_x = if_else(selected.chemicals.data$ProductName %in% c("Cetrorelix", "Ebastine"), -1, 0)) + geom_text_repel(data = uridine.deriv.data, aes(x=apscore_gnegative, y=apscore_gpositive, label=ProductName), max.overlaps = Inf, size=1.8, min.segment.length = 0, color="black", box.padding = 0.7, point.padding = 0.2, nudge_x = if_else(uridine.deriv.data$ProductName %in% c("Uridine"), -1, 0)) + geom_text_repel(data = other.interest.data, aes(x=apscore_gnegative, y=apscore_gpositive, label=ProductName), max.overlaps = Inf, size=1.8, min.segment.length = 0, color="black", box.padding = 0.7, point.padding = 0.2) + # Color by reported activity scale_color_manual(breaks = c("Antibacterial", "Antiviral", "Antifungal", "Antiparasitic", "None"), values = c("#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#C5C5C5")) + theme_light() + theme(legend.position = "bottom", text = element_text(size=10)) + labs(x = latex2exp::TeX("Antimicrobial Potential $G^-$"), y = latex2exp::TeX("Antimicrobial Potential $G^+$")) predBS.fps ``` ## Propotion with reported activity ```{r activity.counts} # Get the activity counts proportion.known.activity <- mole_predictions.over10 %>% # Only consider non-antibiotics filter(antibiotic == "not_abx") %>% # Consolidate antiparasitic category mutate(`Reported Activity` = if_else(`Reported Activity` %in% c("Antiplasmodium", "Insecticide"), "Antiparasitic", `Reported Activity`)) %>% count(`Reported Activity`) %>% mutate(`Reported Activity` = factor(`Reported Activity`, levels=c("None", "Antiparasitic", "Antifungal", "Antiviral","Antibacterial")), x=" ") # Gather reported.g <- ggplot(proportion.known.activity, aes(x=n, y=x, fill=`Reported Activity`)) + geom_bar(position="fill", stat="identity", color="black") + scale_fill_manual(breaks = c("Antibacterial", "Antiviral", "Antifungal", "Antiparasitic", "None"), values = c("#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#C5C5C5")) + theme_classic() + coord_flip() + theme(text=element_text(size=10), legend.position = "right", aspect.ratio = 3, legend.text = element_text(size = 8), legend.title = element_text(size=8)) + labs(x="Proportion with reported activity", y="Predicted antimicrobial", fill = "Reported Activity") reported.g ``` ```{r activity.counts.e} # Get the activity counts proportion.known.activity.e <- ecfp4_predictions.over10 %>% # Only consider non-antibiotics filter(antibiotic == "not_abx") %>% # Consolidate antiparasitic category mutate(`Reported Activity` = if_else(`Reported Activity` %in% c("Antiplasmodium", "Insecticide"), "Antiparasitic", `Reported Activity`)) %>% count(`Reported Activity`) %>% mutate(`Reported Activity` = factor(`Reported Activity`, levels=c("None", "Antiparasitic", "Antifungal", "Antiviral","Antibacterial")), x=" ") # Gather reported.g.e <- ggplot(proportion.known.activity.e, aes(x=n, y=x, fill=`Reported Activity`)) + geom_bar(position="fill", stat="identity", color="black") + scale_fill_manual(breaks = c("Antibacterial", "Antiviral", "Antifungal", "Antiparasitic", "None"), values = c("#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#C5C5C5")) + theme_classic() + coord_flip() + theme(text=element_text(size=10), legend.position = "right", aspect.ratio = 3, legend.text = element_text(size = 8), legend.title = element_text(size=8)) + labs(x="Proportion with reported activity", y="Predicted antimicrobial", fill = "Reported Activity") reported.g.e ``` ## Comparing MolE and ECFP4 predictions. ```{r global.comparison.ap} mole_complete_comp <- mole_predictions %>% select(`Catalog Number`, nk_total, apscore_total, antibiotic, ProductName) %>% rename("mole_nk_total" = "nk_total", "mole_apscore_total" = "apscore_total") ecfp4_complete_comp <- ecfp4_predictions %>% select(`Catalog Number`, nk_total, apscore_total) %>% rename("ecfp4_nk_total" = "nk_total", "ecfp4_apscore_total" = "apscore_total") complete_pred_comparison <- ecfp4_complete_comp %>% left_join(mole_complete_comp, by='Catalog Number') %>% mutate(antibiotic = if_else(antibiotic == "abx", "Antibiotic", "Non-Antibiotic")) ``` ```{r global.comparison.nk} ggplot(complete_pred_comparison, aes(x=mole_nk_total, y=ecfp4_nk_total, color=antibiotic)) + geom_point(alpha=0.5, size=1) + geom_abline(linetype="longdash", alpha=0.5, color="blue") + geom_vline(xintercept = 10,linetype="longdash", alpha=0.5, color="black") + geom_hline(yintercept = 10,linetype="longdash", alpha=0.5, color="black") + scale_color_manual(breaks = c("Antibiotic", "Non-Antibiotic"), values=c("red", "#C5C5C5")) + theme_light() + labs(title="Predicted number of inhibited strains comparison", x="MolE", y="ECFP4", color="Compound Class") + theme(text=element_text(size=10), legend.text = element_text(size=8), legend.title = element_text(size=8)) ``` Comparing predicted broad spectrum compounds. ```{r ggvenn.lib} library(ggvenn) ``` ```{r broad.spec.venn} fsize <- 3.5 # Complete comparison of broad spectrum antimicrobials broad.list <- list("MolE" = mole_predictions.over10$`Catalog Number`, "ECFP4" = ecfp4_predictions.over10$`Catalog Number`) venn.abx.comparison <- ggvenn(broad.list, fill_color = c("#DE1F84", "#1F9DBB"), set_name_size=fsize, text_size=fsize) venn.abx.comparison # Comparison of broad spectrum antimicrobials no antibiotics broad.list <- list("MolE" = mole_predictions.over10 %>% filter(antibiotic == "not_abx", `Reported Activity` != "None") %>% select(`Catalog Number`) %>% unlist(), "ECFP4" = ecfp4_predictions.over10 %>% filter(antibiotic == "not_abx", `Reported Activity` != "None") %>% select(`Catalog Number`) %>% unlist()) venn.nonabx.comparison <- ggvenn(broad.list, fill_color = c("#DE1F84", "#1F9DBB"), set_name_size=fsize, text_size=fsize) venn.nonabx.comparison ``` Now compare results from literature search. ```{r litsearch.comparison} proportion.known.activity <- proportion.known.activity %>% mutate(representation = "MolE") proportion.known.activity.e <- proportion.known.activity.e %>% mutate(representation = "ECFP4") proportion.comparison <- bind_rows(proportion.known.activity, proportion.known.activity.e) # Gather totals for n total_count = proportion.comparison %>% group_by(representation) %>% summarise(total = sum(n)) %>% mutate(tx_sum = paste("n =", total)) reported.bar.comparison <- ggplot(proportion.comparison, aes(x=n, y=representation)) + annotate("text", y=c("ECFP4", "MolE"), x=1.04, label=total_count$tx_sum,) + geom_bar(position="fill", stat="identity", color="black", aes(fill=`Reported Activity`)) + scale_fill_manual(breaks = c("Antibacterial", "Antiviral", "Antifungal", "Antiparasitic", "None"), values = c("#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#C5C5C5")) + theme_classic() + coord_flip() + theme(text=element_text(size=10), legend.position = "right", aspect.ratio = 2, legend.text = element_text(size = 8), legend.title = element_text(size=8)) + labs(y="", x="Proportion", fill = "Reported Activity") reported.bar.comparison ``` ```{r proportion.abx.broad} mole.o10.abx <- mole_predictions.over10 %>% count(antibiotic) %>% mutate(representation = "MolE") ecfp4.o10.abx <- ecfp4_predictions.over10 %>% count(antibiotic) %>% mutate(representation = "ECFP4") over10.abx <- bind_rows(mole.o10.abx, ecfp4.o10.abx) %>% mutate(antibiotic = if_else(antibiotic == "abx", "Antibiotic", "Other"), antibiotic = factor(antibiotic, levels=c("Other", "Antibiotic"))) abx.bar.comparison <- ggplot(over10.abx, aes(x=n, y=representation, fill=antibiotic)) + geom_bar(position="fill", stat="identity", color="black") + scale_fill_manual(breaks = c("Other", "Antibiotic"), values = c("#C5C5C5", "red")) + theme_classic() + coord_flip() + theme(text=element_text(size=10), legend.position = "right", aspect.ratio = 2, legend.text = element_text(size = 8), legend.title = element_text(size=8)) + labs(y="", x="Proportion", fill = "Compound Class") abx.bar.comparison ``` Comparing predicted broad spectrum. ```{r comparing.over10} mole.litsearch <- mole_predictions.over10 %>% filter(antibiotic == "not_abx") %>% select(`Catalog Number`, `Reported Activity`, ProductName) %>% # Consolidate antiparasitic category mutate(`Reported Activity` = if_else(`Reported Activity` %in% c("Antiplasmodium", "Insecticide"), "Antiparasitic", `Reported Activity`)) ecfp4.litsearch <- ecfp4_predictions.over10 %>% filter(antibiotic == "not_abx") %>% select(`Catalog Number`, `Reported Activity`, ProductName) %>% # Consolidate antiparasitic category mutate(`Reported Activity` = if_else(`Reported Activity` %in% c("Antiplasmodium", "Insecticide"), "Antiparasitic", `Reported Activity`)) combined.litsearch <- bind_rows(mole.litsearch, ecfp4.litsearch) %>% distinct() mole_o10 <- mole_predictions %>% filter(`Catalog Number` %in% combined.litsearch$`Catalog Number`) %>% select(`Catalog Number`, nk_total, apscore_total) %>% rename("mole_nk_total" = "nk_total", "mole_apscore_total" = "apscore_total") ecfp4_o10 <- ecfp4_predictions %>% filter(`Catalog Number` %in% combined.litsearch$`Catalog Number`) %>% select(`Catalog Number`, nk_total, apscore_total) %>% rename("ecfp4_nk_total" = "nk_total", "ecfp4_apscore_total" = "apscore_total") broadS.comp <- combined.litsearch %>% left_join(mole_o10, by="Catalog Number") %>% left_join(ecfp4_o10, by="Catalog Number") %>% filter(`Reported Activity` != "None") ``` ```{r broadS.comp} # Selected chemicals selected_chems <- c("Visomitin", "Ebastine", "Opicapone", "Cetrorelix (Acetate)", 'Thymidine', "Elvitegravir") # Other interests other.interest <- c("Bergenin", "Fenticonazole (Nitrate)", "Doxifluridine", "Broxyquinoline", "Triclabendazole", "Octenidine (dihydrochloride)", "Benzalkonium (chloride)", "Simeprevir") # Format Product Names selected.chemicals.data <- broadS.comp %>% mutate(ProductName = if_else(ProductName %in% selected_chems, ProductName, ""), ProductName = if_else(ProductName == "Cetrorelix (Acetate)", "Cetrorelix", ProductName)) other.interest.data <- broadS.comp %>% mutate(ProductName = if_else(ProductName %in% other.interest, ProductName, ""), ProductName = if_else(ProductName == "Fenticonazole (Nitrate)", "Fenticonazole", ProductName), ProductName = if_else(ProductName == "Octenidine (dihydrochloride)", "Octenidine", ProductName), ProductName = if_else(ProductName == "Benzalkonium (chloride)", "Benzalkonium", ProductName)) nk.comparison <- ggplot(broadS.comp, aes(x=mole_nk_total, y=ecfp4_nk_total, color=`Reported Activity`)) + geom_vline(xintercept = 10, linetype="longdash", color="black", alpha=0.5) + geom_hline(yintercept=10, linetype="longdash", color="black", alpha=0.5) + geom_point(alpha=0.5) + geom_abline(linetype="longdash", color="red", alpha=0.5) + # Add names geom_text_repel(data = selected.chemicals.data, aes(x=mole_nk_total, y=ecfp4_nk_total, label=ProductName), max.overlaps = Inf, size=2, min.segment.length = 0, color="black", box.padding = 0.7, point.padding = 0.2, fontface="bold", nudge_x = if_else(selected.chemicals.data$ProductName %in% c("Elvitegravir"), 1, 0)) + geom_text_repel(data = other.interest.data, aes(x=mole_nk_total, y=ecfp4_nk_total, label=ProductName), max.overlaps = Inf, size=2, min.segment.length = 0, color="black", box.padding = 0.7, point.padding = 0.2, nudge_y = if_else(other.interest.data$ProductName %in% c("Benzalkonium"), 1, 0)) + scale_color_manual(breaks = c("Antibacterial", "Antiviral", "Antifungal", "Antiparasitic", "None"), values = c("#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#C5C5C5")) + theme_light() + labs(x="MolE - N. inhibited strains", y="ECFP4 - N. inhibited strains") nk.comparison ``` ## Figure panels. ```{r fig.arrange} library(ggpubr) ``` ```{r fps.fig.recreate} top_row <- ggarrange(e.umap.over10 + theme(aspect.ratio = 0.4), reported.g.e, labels = c("a", "b"), widths = c(2, 1),font.label = list(face="plain")) bottom_row <- ggarrange(score.vs.nkill.marginal.e, predBS.fps + theme(legend.position = "none"), labels = c("c", "d"), font.label = list(face="plain")) pred.fps.panel <- ggarrange(top_row, bottom_row, ncol = 1, nrow = 2, heights = c(0.8, 1)) pred.fps.panel ggsave(filename = "../data/05.analyze_mce_predictions/ecfp4_mce_overview.pdf", plot=pred.fps.panel, width = 21, height = 15, units="cm", dpi=300) ggsave(filename = "../data/05.analyze_mce_predictions/ecfp4_mce_overview.svg", plot=pred.fps.panel, width = 21, height = 15, units="cm", dpi=300) ``` ```{r mole.prediction.overview} top_row <- ggarrange(m.umap.over10 + theme(aspect.ratio = 0.4), reported.g, labels = c("a", "b"), widths = c(2, 1),font.label = list(face="plain")) bottom_row <- ggarrange(score.vs.nkill.marginal, predBS.mole + theme(legend.position = "none"), labels = c("c", "d"), font.label = list(face="plain")) pred.mole.panel <- ggarrange(top_row, bottom_row, ncol = 1, nrow = 2, heights = c(0.8, 1)) pred.mole.panel ggsave(filename = "../data/05.analyze_mce_predictions/mole_mce_overview.pdf", plot=pred.mole.panel, width = 21, height = 15, units="cm", dpi=300) ``` Combine complete prediction information ```{r combine.pred.info} ecfp4_predranks <- ecfp4_predictions %>% mutate(representation = "ECFP4") mole_predranks <- mole_predictions %>% mutate(representation = "MolE") complete_predranks <- bind_rows(ecfp4_predranks, mole_predranks) ``` ```{r known.abx.hist} nk.abx.hist <- complete_predranks %>% filter(antibiotic == "abx") %>% ggplot(aes(x=nk_total, color=representation, color=representation)) + geom_histogram(binwidth = 1, aes(y=after_stat(count)), position="dodge", alpha=0.1, fill="white") + geom_density(alpha=0.2, aes(y=after_stat(count)), size=1.25, adjust=1/5) + scale_color_manual(values=c("#1F9DBB", "#DE1F84")) + theme_light() + labs(x="Predicted number of inhibited strains", y="Count", color="Representation", title = "Known antibiotics") nk.abx.hist ``` ```{r litsearch.comparison.} mole.o10 <- mole_predictions.over10 %>% filter(antibiotic == "not_abx") %>% filter(`Reported Activity` != "None") %>% select(`Catalog Number`) %>% unlist() ecfp4.o10 <- ecfp4_predictions.over10 %>% filter(antibiotic == "not_abx") %>% filter(`Reported Activity` != "None") %>% select(`Catalog Number`) %>% unlist() ecfp4.comppreds <- complete_predranks %>% filter(representation == "ECFP4", `Catalog Number` %in% ecfp4.o10) mole.comppreds <- complete_predranks %>% filter(representation == "MolE", `Catalog Number` %in% mole.o10) complete_litsearch_comparison <- bind_rows(ecfp4.comppreds, mole.comppreds) ``` ```{r litsearch.hist.nk} nk.ls.hist <- ggplot(complete_litsearch_comparison, aes(x=nk_total, color=representation)) + geom_histogram(binwidth = 1, aes(y=after_stat(count)), position="dodge", alpha=0.1, fill="white") + geom_density(alpha=0.2, aes(y=after_stat(count)), size=1.25, adjust=1/4) + scale_color_manual(values=c("#1F9DBB", "#DE1F84")) + #scale_fill_manual(values=c("#1F9DBB", "#DE1F84")) + theme_light() + labs(x="Predicted number of inhibited strains", y="Count", color="Representation", title = "Non-antibiotic antimicrobials") nk.ls.hist ``` ```{r panel.nkill.comp} panel.comp.nk <- ggarrange(nk.abx.hist, nk.ls.hist, nrow = 1, ncol = 2, labels = c("a", "b"), font.label = list(face="plain"), common.legend = TRUE, legend = "right") panel.comp.nk ``` ```{r literature.comparison} panel.comp.lit <- ggarrange(reported.bar.comparison, nk.comparison, nrow = 1, ncol = 2, labels = c("c", "d"), font.label = list(face="plain"), common.legend = TRUE, legend = "right") panel.comp.lit ``` ```{r complete.panel.comp} complete.comp.panel <- ggarrange(panel.comp.nk, NULL, panel.comp.lit, heights = c(1, 0.1, 1), nrow = 3, ncol = 1) ggsave(plot = complete.comp.panel, filename = "../data/05.analyze_mce_predictions/mce_pred_comparison.pdf", width = 21, height = 18, units="cm", dpi=300) ggsave(plot = complete.comp.panel, filename = "../data/05.analyze_mce_predictions/mce_pred_comparison.png", width = 21, height = 18, units="cm", dpi=300) complete.comp.panel ``` ## Supplementary Files. ### SF8 ```{r, eval=FALSE, include=FALSE} sf8a <- complete_predranks_wide_abx <- complete_predranks %>% filter(antibiotic=="abx") %>% pivot_wider(id_cols = c(`Catalog Number`, ProductName, antibiotic), names_from = representation, values_from = nk_total) %>% rename("ECFP4_predicted_n_inhibited_strains" = "ECFP4", "MolE_predicted_n_inhibited_strains" = "MolE") sf8bd <- broadS.comp %>% mutate(antibiotic="not_abx") %>% select(`Catalog Number`, ProductName, antibiotic, `Reported Activity`, mole_nk_total, ecfp4_nk_total) %>% rename("ECFP4_predicted_n_inhibited_strains" = "ecfp4_nk_total", "MolE_predicted_n_inhibited_strains" = "mole_nk_total") sf8c <- proportion.comparison %>% pivot_wider(id_cols = `Reported Activity`, names_from = representation, values_from = n) ``` ```{r, eval=FALSE, include=FALSE} library(xlsx) ``` ```{r, eval=FALSE, include=FALSE} write.xlsx2(sf8a, "../data/Supplementary Source Data/Supplementary Figure 8.xlsx", sheetName = "SF8a", col.names = TRUE, row.names = TRUE, append = FALSE, showNA = FALSE) write.xlsx2(sf8bd, "../data/Supplementary Source Data/Supplementary Figure 8.xlsx", sheetName = "SF8b,d", col.names = TRUE, row.names = TRUE, append = TRUE, showNA = FALSE) write.xlsx2(sf8c, "../data/Supplementary Source Data/Supplementary Figure 8.xlsx", sheetName = "SF8c", col.names = TRUE, row.names = TRUE, append = TRUE, showNA = FALSE) ``` ### SF13 ```{r} # Names chem_names <- read_tsv("../data/04.new_predictions/medchemexpress_filtered.tsv.gz", show_col_types = FALSE) # MolE scores mole_scores <- read_excel("../data/04.new_predictions/mole_mce_predictions_litsearch.xlsx", sheet = "mole_scores") %>% pivot_longer(cols = -chem_id, names_to = "strain", values_to = "MolE_predictive_probability") %>% unite("pred_id", c(chem_id, strain), sep=":", remove=FALSE) # ECFP4 scores ecfp4_scores <- read_excel("../data/04.new_predictions/ecfp4_mce_predictions_litsearch.xlsx", sheet = "ecfp4_scores") %>% pivot_longer(cols = -chem_id, names_to = "strain", values_to = "ECFP4_predictive_probability") %>% unite("pred_id", c(chem_id, strain), sep=":") # ChemDesc scores chemDesc_scores <- read_excel("../../udl_microbiome/data/chemDesc_mce_predictions.xlsx", sheet = "chemDesc_scores") %>% pivot_longer(cols = -chem_id, names_to = "strain", values_to = "chemDesc_predictive_probability") %>% unite("pred_id", c(chem_id, strain), sep=":") # Optimal thresholds optimal_thresholds <- read_tsv("../data/03.model_evaluation/optimal_thresholds.tsv.gz", show_col_types = FALSE) %>% filter(score_type == "optimized") %>% select(representation, threshold) ``` ```{r} optimal_thresholds ``` ```{r} selected.predictions <- chem_names %>% filter(ProductName %in% c("Ebastine", "Elvitegravir", "Visomitin", "Opicapone", "Thymidine", "Cetrorelix (Acetate)")) %>% select(`Catalog Number`, ProductName) %>% rename("chem_id" = "Catalog Number") %>% left_join(mole_scores, by="chem_id") %>% left_join(ecfp4_scores, by="pred_id") %>% left_join(chemDesc_scores, by="pred_id") %>% mutate("MolE_optimal_threshold" = optimal_thresholds %>% filter(representation == "MolE") %>% select(threshold) %>% unlist(), "ECPF4_optimal_threshold" = optimal_thresholds %>% filter(representation == "ecfp4") %>% select(threshold) %>% unlist(), "chemDesc_optimal_threshold" = optimal_thresholds %>% filter(representation == "chemDesc") %>% select(threshold) %>% unlist(), "MolE_predicted_growth_inhibition" = if_else(MolE_predictive_probability >= MolE_optimal_threshold, 1, 0), "ECFP4_predicted_growth_inhibition" = if_else(ECFP4_predictive_probability >= ECPF4_optimal_threshold, 1, 0), "chemDesc_predicted_growth_inhibition" = if_else(chemDesc_predictive_probability >= chemDesc_optimal_threshold, 1, 0)) %>% select(-pred_id) ``` ```{r} write.xlsx2(selected.predictions %>% filter(ProductName == "Cetrorelix (Acetate)"), "../data/Supplementary Source Data/Supplementary Figure 13.xlsx", sheetName = "SF13a", col.names = TRUE, row.names = TRUE, append = FALSE, showNA = TRUE) write.xlsx2(selected.predictions %>% filter(ProductName == "Ebastine"), "../data/Supplementary Source Data/Supplementary Figure 13.xlsx", sheetName = "SF13b", col.names = TRUE, row.names = TRUE, append = TRUE, showNA = FALSE) write.xlsx2(selected.predictions %>% filter(ProductName == "Elvitegravir"), "../data/Supplementary Source Data/Supplementary Figure 13.xlsx", sheetName = "SF13c", col.names = TRUE, row.names = TRUE, append = TRUE, showNA = FALSE) write.xlsx2(selected.predictions %>% filter(ProductName == "Opicapone"), "../data/Supplementary Source Data/Supplementary Figure 13.xlsx", sheetName = "SF13d", col.names = TRUE, row.names = TRUE, append = TRUE, showNA = FALSE) write.xlsx2(selected.predictions %>% filter(ProductName == "Thymidine"), "../data/Supplementary Source Data/Supplementary Figure 13.xlsx", sheetName = "SF13e", col.names = TRUE, row.names = TRUE, append = TRUE, showNA = FALSE) write.xlsx2(selected.predictions %>% filter(ProductName == "Visomitin"), "../data/Supplementary Source Data/Supplementary Figure 13.xlsx", sheetName = "SF13f", col.names = TRUE, row.names = TRUE, append = TRUE, showNA = FALSE) ``` ```{r} selected.predictions ``` ## Session Info ```{r session.info} sessionInfo() ```