---
title: "Analysing MCE predictions"
author: "Roberto Olayo Alarcon"
date: "27/03/2024"
output: github_document
---
  
  
In this file we analyse the predictions made in the previous step. At this point a literature search has been performed in order to validate predictions
  
  
```{r setup, message=FALSE}
library(tidyverse)
library(ggrepel)
library(readxl)
library(uwot)
```
  
  
## Prepare directories. 
  
```{r prep.directories}

PREDICTIONS_DIR <- "../data/04.new_predictions"
OUTPUT_DIR <- "../data/05.analyze_mce_predictions"

# Create output dir

if(!dir.exists(OUTPUT_DIR)){
  
  dir.create(OUTPUT_DIR)
  
}


```

  
## Library Representation. 
  
In this section, we perform dimensionality reduction on the MolE representation of the MedChemExpress library for which we make predictions. 
  
```{r read.mole}

# Read Prediction Output
mole_predictions <- read_excel(file.path(PREDICTIONS_DIR, "mole_mce_predictions_litsearch.xlsx"), sheet = "mole_prediction_overview")

mole_predictions.over10 <- read_excel(file.path(PREDICTIONS_DIR, "mole_mce_predictions_litsearch.xlsx"), sheet = "mole_over10") %>% 
  mutate(`Reported Activity` = if_else(is.na(`Reported Activity`), "None", `Reported Activity`))


# Read the representation  
mole_representation <- vroom::vroom(file.path(PREDICTIONS_DIR, "MolE_representation_medchemexpress.tsv.gz"),
                                 show_col_types = FALSE) %>% 
  
  # Rename id column
  rename("Catalog Number" = "...1") %>% 
  
  # Filter for the molecules for which we make predictions
  filter(`Catalog Number` %in% mole_predictions$`Catalog Number`) %>% 
  
  # Set column to row.names
  column_to_rownames("Catalog Number")
```


```{r read.fps}

# Read Prediction Output
ecfp4_predictions <- read_excel(file.path(PREDICTIONS_DIR, "ecfp4_mce_predictions_litsearch.xlsx"), sheet = "ecfp4_prediction_overview")

ecfp4_predictions.over10 <- read_excel(file.path(PREDICTIONS_DIR, "ecfp4_mce_predictions_litsearch.xlsx"), sheet = "ecfp4_over10") %>% 
  mutate(`Reported Activity` = if_else(is.na(`Reported Activity`), "None", `Reported Activity`))


# Read the representation  
ecfp4_representation <- vroom::vroom(file.path(PREDICTIONS_DIR, "ecfp4_representation_medchemexpress.tsv.gz"),
                                 show_col_types = FALSE) %>% 
  
  # Rename id column
  rename("Catalog Number" = "...1") %>% 
  
  # Filter for the molecules for which we make predictions
  filter(`Catalog Number` %in% mole_predictions$`Catalog Number`) %>% 
  
  # Set column to row.names
  column_to_rownames("Catalog Number")
```
  
  
### Preprocess representation. 
   
Remove constant and correlated features
   
```{r preprocess.func}


remove_constant <- function(m.df, var.cutoff){
  

  #' Removes constant columns from a data frame based on a variance threshold.
  #' 
  #' @param m.df A data frame. Input data frame.
  #' @param var.cutoff A numeric. Variance threshold. Columns with variance below this threshold will be removed.
  #' 
  #' @return A data frame. Data frame with constant columns removed.
  #' 
  #' @examples
  #' remove_constant(m.df = my_data_frame, var.cutoff = 0.01)
  #' 
  #' The function calculates the variance of each dimension in the input data frame. 
  #' It then identifies columns with variance above the specified threshold and removes constant columns. 
  #' The resulting data frame without constant columns is returned.
  #' 
  
  
  # Variance of each dimension
  var.feats <-  apply(m.df, 2, var)
  
  
  # Keep columns above variance thresh
  keep_cols <- names(var.feats[var.feats > var.cutoff])
  
  # Filter df
  m.var.df <- m.df %>% 
    select(all_of(as.character(keep_cols)))
  
  
  return(m.var.df)
}


remove_corfeats <- function(m.df, cor.thresh){
  #' Removes correlated features from a data frame based on a correlation threshold.
  #' 
  #' @param m.df A data frame. Input data frame.
  #' @param cor.thresh A numeric. Correlation threshold.
  #' 
  #' @return A data frame. Data frame with correlated features removed.
  #' 
  #' @examples
  #' remove_corfeats(m.df = my_data_frame, cor.thresh = 0.7)
  #' 
  #' The function calculates the correlation matrix of the input data frame. 
  #' It then identifies correlated columns based on the correlation threshold. 
  #' Columns with correlations above the threshold are removed. 
  #' The resulting data frame without correlated features is returned.
  #' 
  

  # Correlation Matrix
  cor.mat <-  cor(m.df)
  
  # Format triangle
  cor.mat[lower.tri(cor.mat, diag = TRUE)] <- 0
  
  # Find correlated columns
  corr.descision <- apply(cor.mat, 2, function(x){ifelse(any(x >= cor.thresh), "remove", "remain")})
  
  # Keep columns
  keep_cols <- names(corr.descision[corr.descision == "remain"])
  
  # Filter df
  
  m.uncor <- m.df %>% 
    select(all_of(keep_cols))
  
  
  return(m.uncor)
  
}

preprocess_mole <- function(mole_df, min.var, corr.thresh){
  #' Preprocesses molecular data by filtering out constant columns and correlated features.
  #' 
  #' @param mole_df A data frame. Input molecular data frame.
  #' @param min.var A numeric. Variance threshold for removing constant columns.
  #' @param corr.thresh A numeric. Correlation threshold for removing correlated features.
  #' 
  #' @return A data frame. Preprocessed molecular data frame.
  #' 
  #' @examples
  #' preprocess_mole(mole_df = my_molecular_data_frame, min.var = 0.01, corr.thresh = 0.7)
  #' 
  #' The function preprocesses molecular data by first removing constant columns using the specified variance threshold. 
  #' Then, it removes correlated features based on the correlation threshold. 
  #' The resulting data frame is the preprocessed molecular data with constant columns and correlated features removed.
  #' 
  
  
  # Filter variance
  mole.variable <- remove_constant(mole_df, min.var)
  
  
  # Filter correlated
  mole.decorrelated <- remove_corfeats(mole.variable, corr.thresh)
  
  
  return(mole.decorrelated)
  
  
}

# Prepare data

mole_prepared <- preprocess_mole(mole_representation, min.var=0.01, corr.thresh=0.90)
mole_prepared %>% dim()
```

  
Similar procedure for ECFP4 representation  
  
```{r preprocess.fps}

hamming <- function(X) {
    D <- (1 - X) %*% t(X)
    HamDist <- D + t(D)
    
    return(HamDist / ncol(X))
}

remove_rare <-  function(f.df, min_chems){
  
  # Removes columns where a feature is not present in the majority of compounds
  #   
  #   Parameters
  #   ----------
  #   - fdf: pandas dataframe. Columns are bits of the fingerprint. Rows are compounds
  #   - mc: int. The minimum number of molecules a feature should be present in order for it to be preserved
  
  
  # Number of chemicals a feature is present in
  n.freq <- colSums(f.df)
  names(n.freq) <- colnames(f.df)
  
  # Filter popular features
  common.features <- n.freq[n.freq >= min_chems]
  
  # Alter df
  
  f.common <- f.df %>% 
    select(all_of(names(common.features)))
  
  
  return(f.common)
}

remove_correlated <- function(f.df, min_dist){
  
  # Removes highly similar features
  # 
  #   Parameters
  #   ----------
  #   - fdf: pandas dataframe. Dataframe where columns are bits in the fingerprint
  #   - mdist: float. Is minimal distance between features in order to be different enough
  # 
  #   Returns
  #   -------
  #   pandas dataframe
  #   Highly correlated features are removed (one of them is preserved)
  
  # Hamming distance
  dist.matrix <-  hamming(t(f.df))
  
  # Format 
  dist.matrix[lower.tri(dist.matrix, diag = TRUE)] <- 1
  
  # Find correlated columns
  corr.descision <- apply(dist.matrix, 2, function(x){ifelse(any(x <= min_dist), "remove", "remain")})
  
  # Only keep un-correlated feats
  keep_cols <- names(corr.descision[corr.descision == "remain"])
  
  
  # Filter df
  f.uncorr <- f.df %>% 
    select(all_of(as.character(keep_cols)))
  
  
  return(f.uncorr)}


preprocess_fps <- function(fps_df, min_compounds, sim_threshold){
  

    # Processes a fingerprint dataframe. Removes rare and highly correlated features
    # 
    # Parameters
    # ----------
    # - fps_df: pandas dataframe. Dataframe where columns are bits in the fingerprint
    # - min_compounds: int. The minimum number of molecules a feature should be present in order for it to be preserved
    # - sim_threshold: float. Is minimal distance between features in order to be different enough
    
  
  fps_common <-  remove_rare(fps_df, min_compounds)
  
  fps.decorr <-  remove_correlated(fps_common, sim_threshold)
  
  return(fps.decorr)
}


ecfp4_prepared <- preprocess_fps(ecfp4_representation, 10, 0.01)
ecfp4_prepared %>% dim()
```

  
### Peform UMAP reduction. 
  
```{r mole.umap}

set.seed(42)
mole.umap <- umap(X=mole_prepared,
                 n_neighbors = 25,
                 n_components = 2,
                 min_dist = 0.35, 
                 n_threads = 20,
                 metric="cosine")


mole.umap <- as.data.frame(mole.umap) %>% 
  rownames_to_column("catalog_number")

```


```{r mole.umap.plot}
mole.umap %>% 
  
ggplot(aes(x=V1, y=V2)) +
 geom_point(alpha=0.9, color="white", fill="#C5C5C5",
             size=2, shape=21, stroke=0.2) +
  
  theme_classic() + 
  
  labs(x="UMAP 1", 
       y = "UMAP 2",
       title = "MolE representation of MCE")
```


```{r ecfp4.umap}

# Jaccard distance
jac.dist <- dist(ecfp4_prepared, method = "binary")

set.seed(1234)
ecfp4.umap <- umap(X=jac.dist,
                 n_neighbors = 20,
                 n_components = 2,
                 min_dist = 0.3, 
                 n_threads = 20)


ecfp4.umap <- as.data.frame(ecfp4.umap) %>% 
  rownames_to_column("catalog_number")

```

  
```{r ecfp4.umap.plot}
ecfp4.umap %>% 
  
ggplot(aes(x=V1, y=V2)) +
 geom_point(alpha=0.9, color="white", fill="#C5C5C5",
             size=2, shape=21, stroke=0.2) +
  
  theme_classic() + 
  
  labs(x="UMAP 1", 
       y = "UMAP 2",
       title = "ECFP4 representation of MCE")
```


### Plotting predicted broad-spectrum. 
   
```{r mole.umap.over10}
  
# Add information of broad-spectrum activity
mole.umap <- mole.umap %>% 
  mutate(pred_activity = if_else(catalog_number %in% mole_predictions.over10$`Catalog Number`, "Antimicrobial", "Not Antimicrobial"))


# Plot 
m.umap.over10 <- ggplot(mole.umap, aes(x=V1, y=V2)) +
  
  geom_point(data=subset(mole.umap, pred_activity == "Not Antimicrobial"), aes(color="Not Antimicrobial"), size=1.5) +
  geom_point(data=subset(mole.umap, pred_activity == "Antimicrobial"), aes(color="Antimicrobial"), size=1.5) +
  
  scale_color_manual(values=c("Antimicrobial" = alpha("#DE1F84", 0.7), "Not Antimicrobial" = alpha("#C5C5C5", 0.7))) +
  
  theme_void() +
  #coord_fixed(ratio = 0.4) +
  
  theme(legend.position = "bottom",
    axis.ticks.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        text=element_text(size=10),
        panel.background = element_rect(fill = "transparent", color=NA),
      plot.background = element_rect(fill = "transparent", colour = NA)) +
  
  labs(x="UMAP 1",
       y="UMAP 2",
       color="Predicted Activity")

m.umap.over10
```


```{r annotating.litchems.mole}

chems_interst <- mole_predictions.over10 %>% 
  filter(ProductName %in% c("Ospemifene", "Shionone", "Bekanamycin", "Doxifluridine", "Ellagic acid")) %>% 
  select(`Catalog Number`, ProductName) %>% 
  rename("catalog_number" = "Catalog Number")

chems_interest_umap <- chems_interst %>% 
  left_join(mole.umap, by="catalog_number")

# Plot 
m.umap.over10.annot <- ggplot(mole.umap, aes(x=V1, y=V2)) +
  
  geom_point(data=subset(mole.umap, pred_activity == "Not Antimicrobial"), aes(color="Not Antimicrobial"), size=1.5) +
  geom_point(data=subset(mole.umap, pred_activity == "Antimicrobial"), aes(color="Antimicrobial"), size=1.5) +
  geom_text_repel(data = chems_interest_umap, aes(label=ProductName), 
                  max.overlaps = Inf, size=2, min.segment.length = 0, color="black", box.padding = 0.7, point.padding = 0.2, fontface="bold") +
  scale_color_manual(values=c("Antimicrobial" = alpha("#DE1F84", 0.7), "Not Antimicrobial" = alpha("#C5C5C5", 0.7))) +
  
  theme_void() +
  #coord_fixed(ratio = 0.4) +
  
  theme(legend.position = "bottom",
    axis.ticks.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        text=element_text(size=10),
        panel.background = element_rect(fill = "transparent", color=NA),
      plot.background = element_rect(fill = "transparent", colour = NA)) +
  
  labs(x="UMAP 1",
       y="UMAP 2",
       color="Predicted Activity")

m.umap.over10.annot
```
  
```{r}
meta_info <- mole_predictions %>% 
  select(`Catalog Number`, nk_total, ProductName) %>% 
  rename("catalog_number" = "Catalog Number")

mole.umap %>% 
  left_join(meta_info, by="catalog_number") %>% 
  rename("umap1" = "V1",
         "umap2" = "V2",
         "pred_n_inhibited_total" = "nk_total") %>% 
  
  write_tsv("../data/05.analyze_mce_predictions/umap_mce_predictions.tsv")

```


```{r ecfp4.umap.over10}
  
# Add information of broad-spectrum activity
ecfp4.umap <- ecfp4.umap %>% 
  mutate(pred_activity = if_else(catalog_number %in% ecfp4_predictions.over10$`Catalog Number`, "Antimicrobial", "Not Antimicrobial"))


# Plot 
e.umap.over10 <- ggplot(ecfp4.umap, aes(x=V1, y=V2)) +
  
  geom_point(data=subset(ecfp4.umap, pred_activity == "Not Antimicrobial"), aes(color="Not Antimicrobial"), size=1.5) +
  geom_point(data=subset(ecfp4.umap, pred_activity == "Antimicrobial"), aes(color="Antimicrobial"), size=1.5) +
  
  scale_color_manual(values=c("Antimicrobial" = alpha("#DE1F84", 0.7), "Not Antimicrobial" = alpha("#C5C5C5", 0.7)),
                     labels=c("Broad Spectrum", "Not Broad Spectrum")) +
  
  theme_void() +
  #coord_fixed(ratio = 0.4) +
  
  theme(legend.position = "bottom",
    axis.ticks.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        text=element_text(size=10),
        panel.background = element_rect(fill = "transparent", color=NA),
      plot.background = element_rect(fill = "transparent", colour = NA)) +
  
  labs(x="UMAP 1",
       y="UMAP 2",
       color="Predicted Activity")

e.umap.over10
```
  
```{r annotating.litchems.fps}
chems_interst <- mole_predictions.over10 %>% 
  filter(ProductName %in% c("Ospemifene", "Shionone", "Bekanamycin", "Doxifluridine", "Ellagic acid")) %>% 
  select(`Catalog Number`, ProductName) %>% 
  rename("catalog_number" = "Catalog Number")

chems_interest_umap <- chems_interst %>% 
  left_join(ecfp4.umap, by="catalog_number")

e.umap.over10.annot <- ggplot(ecfp4.umap, aes(x=V1, y=V2)) +
  
  geom_point(data=subset(ecfp4.umap, pred_activity == "Not Antimicrobial"), aes(color="Not Antimicrobial"), size=1.5) +
  geom_point(data=subset(ecfp4.umap, pred_activity == "Antimicrobial"), aes(color="Antimicrobial"), size=1.5) +
  
  geom_text_repel(data = chems_interest_umap, aes(label=ProductName), 
                  max.overlaps = Inf, size=2, min.segment.length = 0, color="black", box.padding = 0.7, point.padding = 0.2, fontface="bold") +
  
  scale_color_manual(values=c("Antimicrobial" = alpha("#DE1F84", 0.7), "Not Antimicrobial" = alpha("#C5C5C5", 0.7)),
                     labels=c("Broad Spectrum", "Not Broad Spectrum")) +
  
  theme_void() +
  #coord_fixed(ratio = 0.4) +
  
  theme(legend.position = "bottom",
    axis.ticks.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        text=element_text(size=10),
        panel.background = element_rect(fill = "transparent", color=NA),
      plot.background = element_rect(fill = "transparent", colour = NA)) +
  
  labs(x="UMAP 1",
       y="UMAP 2",
       color="Predicted Activity")

e.umap.over10.annot
ggsave(plot=e.umap.over10.annot, 
       filename = "../data/05.analyze_mce_predictions/ecfp4_mce_umap.pdf", 
       width = 21, height = 15, units="cm", dpi=300)
```


```{r}
meta_info <- ecfp4_predictions %>% 
  select(`Catalog Number`, nk_total, ProductName) %>% 
  rename("catalog_number" = "Catalog Number")

ecfp4.umap %>% 
  left_join(meta_info, by="catalog_number") %>% 
  rename("umap1" = "V1",
         "umap2" = "V2",
         "pred_n_inhibited_total" = "nk_total") %>% 
  
  write_tsv("../data/05.analyze_mce_predictions/umap_mce_predictions_ecfp4.tsv")
```


## Ranking molecule predictions. 
  
Here we will create plots that compare the antimicrobial activity and the number of predicted inhibited strains.   
   
```{r plot.ap.vs.nk}

# Format labels
scatter.nkill.apscore <- mole_predictions %>% 
  mutate(antibiotic = if_else(antibiotic == "abx", "Antibiotic", "Non-Antibiotic")) %>% 

# Create plots
ggplot(aes(x=nk_total, y=apscore_total, color=antibiotic)) +
  geom_point(alpha=0.5, size=1) +
  
  scale_color_manual(breaks = c("Antibiotic", "Non-Antibiotic"),
                     values=c("red", "#C5C5C5")) +
  
  geom_vline(xintercept = 10, linetype="longdash") +
  
  theme_light() +
  labs(x="Predicted number of inhibited strains",
       y = "Antimicrobial Potential",
       color="Compound Class") +
  theme(legend.position = c(0.7, 0.3),
        text=element_text(size=10),
        legend.text = element_text(size=8),
        legend.title = element_text(size=8))
  

score.vs.nkill.marginal <- ggExtra::ggMarginal(scatter.nkill.apscore, type="boxplot", groupColour = TRUE, groupFill = TRUE)
score.vs.nkill.marginal
```
  
```{r plot.ap.vs.nk.e}

# Format labels
scatter.nkill.apscore.e <- ecfp4_predictions %>% 
  mutate(antibiotic = if_else(antibiotic == "abx", "Antibiotic", "Non-Antibiotic")) %>% 

# Create plots
ggplot(aes(x=nk_total, y=apscore_total, color=antibiotic)) +
  geom_point(alpha=0.5, size=1) +
  
  scale_color_manual(breaks = c("Antibiotic", "Non-Antibiotic"),
                     values=c("red", "#C5C5C5")) +
  
  geom_vline(xintercept = 10, linetype="longdash") +
  
  theme_light() +
  labs(x="Predicted number of inhibited strains",
       y = "Antimicrobial Potential",
       color="Compound Class") +
  theme(legend.position = c(0.7, 0.3),
        text=element_text(size=10),
        legend.text = element_text(size=8),
        legend.title = element_text(size=8))
  

score.vs.nkill.marginal.e <- ggExtra::ggMarginal(scatter.nkill.apscore.e, type="boxplot", groupColour = TRUE, groupFill = TRUE)
score.vs.nkill.marginal.e
```


## Comparing Gram stains. 
  
Here we compare the antimicrobial potential scores for gram negatives and gram positives. 
   
```{r gneg.vs.gpos.labels}

# Selected chemicals
selected_chems <- c("Visomitin", 
               "Ebastine", 
               "Opicapone",
               "Cetrorelix (Acetate)",
               'Thymidine',
               "Elvitegravir")

# Uridine derivatives
uridin.deriv <- c("Uridine",
                  "Uridine 5'-monophosphate",
                  "5-Methyluridine",
                  "2'-Deoxyuridine",
                  "Doxifluridine")
  
# Other interests
# Other interests
other.interest <- c("Tannic acid",
                    "Teniposide",
                    "Mequitazine",
                    "Dimetridazole",
                    "Azelnipidine",
                    "Luliconazole",
                    "Bergenin")


# Format Product Names
selected.chemicals.data <- mole_predictions.over10 %>% 
  mutate(ProductName = if_else(ProductName %in% selected_chems, ProductName, ""),
         ProductName = if_else(ProductName == "Cetrorelix (Acetate)", "Cetrorelix", ProductName))

uridine.deriv.data <- mole_predictions.over10 %>% 
   mutate(ProductName = if_else(ProductName %in% uridin.deriv, ProductName, ""))

other.interest.data <- mole_predictions.over10 %>% 
   mutate(ProductName = if_else(ProductName %in% other.interest, ProductName, ""))


# Plot
predBS.mole <- mole_predictions.over10 %>% 
  
  # Only non-antibiotics
  filter(antibiotic == "not_abx") %>% 
  mutate(`Reported Activity` = if_else(`Reported Activity` %in% c("Antiplasmodium", "Insecticide"), "Antiparasitic", `Reported Activity`)) %>% 
  
  
  ggplot(aes(x=apscore_gnegative, y=apscore_gpositive, color=`Reported Activity`)) +
  
  # Basic aes
  geom_point(size=1) +
  geom_abline(linetype="longdash", alpha=0.25) +
  
  # Add names
  geom_text_repel(data = selected.chemicals.data, aes(x=apscore_gnegative, y=apscore_gpositive, label=ProductName), 
                  max.overlaps = Inf, size=2, min.segment.length = 0, color="black", box.padding = 0.7, point.padding = 0.2, fontface="bold",
                  nudge_x = if_else(selected.chemicals.data$ProductName %in% c("Cetrorelix", "Ebastine"), -1, 0)) +
  
  geom_text_repel(data = uridine.deriv.data, aes(x=apscore_gnegative, y=apscore_gpositive, label=ProductName), 
                  max.overlaps = Inf, size=1.8, min.segment.length = 0, color="black", box.padding = 0.7, point.padding = 0.2,
                  nudge_x = if_else(uridine.deriv.data$ProductName %in% c("Uridine"), -1, 0)) +
  
  geom_text_repel(data = other.interest.data, aes(x=apscore_gnegative, y=apscore_gpositive, label=ProductName), 
                  max.overlaps = Inf, size=1.8, min.segment.length = 0, color="black", box.padding = 0.7, point.padding = 0.2) +
  
  # Color by reported activity
  scale_color_manual(breaks = c("Antibacterial", "Antiviral", "Antifungal", "Antiparasitic", "None"),
                    values = c("#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#C5C5C5")) +
  
  theme_light() +
  theme(legend.position = "bottom",
        text = element_text(size=10)) +
  labs(x = latex2exp::TeX("Antimicrobial Potential $G^-$"),
       y = latex2exp::TeX("Antimicrobial Potential $G^+$"))
  
predBS.mole
```

```{r gneg.vs.gpos.labels.e}

# Selected chemicals
selected_chems <- c("Visomitin", 
               "Ebastine", 
               "Opicapone",
               "Cetrorelix (Acetate)",
               'Thymidine',
               "Elvitegravir")

# Uridine derivatives
uridin.deriv <- c("Uridine",
                  "Uridine 5'-monophosphate",
                  "5-Methyluridine",
                  "2'-Deoxyuridine",
                  "Doxifluridine")
  
# Other interests
other.interest <- c("Tannic acid",
                    "Teniposide",
                    "Dimetridazole",
                    "Azelnipidine",
                    "Luliconazole",
                    "Bergenin")


# Format Product Names
selected.chemicals.data <- ecfp4_predictions.over10 %>% 
  mutate(ProductName = if_else(ProductName %in% selected_chems, ProductName, ""),
         ProductName = if_else(ProductName == "Cetrorelix (Acetate)", "Cetrorelix", ProductName))

uridine.deriv.data <- ecfp4_predictions.over10 %>% 
   mutate(ProductName = if_else(ProductName %in% uridin.deriv, ProductName, ""))

other.interest.data <- ecfp4_predictions.over10 %>% 
   mutate(ProductName = if_else(ProductName %in% other.interest, ProductName, ""))

# Plot
predBS.fps <- ecfp4_predictions.over10 %>% 
  
  # Only non-antibiotics
  filter(antibiotic == "not_abx") %>% 
  mutate(`Reported Activity` = if_else(`Reported Activity` %in% c("Antiplasmodium", "Insecticide"), "Antiparasitic", `Reported Activity`)) %>% 
  
  
  ggplot(aes(x=apscore_gnegative, y=apscore_gpositive, color=`Reported Activity`)) +
  
  # Basic aes
  geom_point(size=1) +
  geom_abline(linetype="longdash", alpha=0.25) +
  
  # Add names
  geom_text_repel(data = selected.chemicals.data, aes(x=apscore_gnegative, y=apscore_gpositive, label=ProductName), 
                  max.overlaps = Inf, size=2, min.segment.length = 0, color="black", box.padding = 0.7, point.padding = 0.2, fontface="bold",
                  nudge_x = if_else(selected.chemicals.data$ProductName %in% c("Cetrorelix", "Ebastine"), -1, 0)) +
  
  geom_text_repel(data = uridine.deriv.data, aes(x=apscore_gnegative, y=apscore_gpositive, label=ProductName), 
                  max.overlaps = Inf, size=1.8, min.segment.length = 0, color="black", box.padding = 0.7, point.padding = 0.2,
                  nudge_x = if_else(uridine.deriv.data$ProductName %in% c("Uridine"), -1, 0)) +
  
  geom_text_repel(data = other.interest.data, aes(x=apscore_gnegative, y=apscore_gpositive, label=ProductName), 
                  max.overlaps = Inf, size=1.8, min.segment.length = 0, color="black", box.padding = 0.7, point.padding = 0.2) +
  
  # Color by reported activity
  scale_color_manual(breaks = c("Antibacterial", "Antiviral", "Antifungal", "Antiparasitic", "None"),
                    values = c("#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#C5C5C5")) +
  
  theme_light() +
  theme(legend.position = "bottom",
        text = element_text(size=10)) +
  labs(x = latex2exp::TeX("Antimicrobial Potential $G^-$"),
       y = latex2exp::TeX("Antimicrobial Potential $G^+$"))

predBS.fps
```

  
## Propotion with reported activity
  
```{r activity.counts}


# Get the activity counts
proportion.known.activity <- mole_predictions.over10 %>% 
  
  # Only consider non-antibiotics
  filter(antibiotic == "not_abx") %>% 
  
  # Consolidate antiparasitic category
  mutate(`Reported Activity` = if_else(`Reported Activity` %in% c("Antiplasmodium", "Insecticide"), "Antiparasitic", `Reported Activity`)) %>% 
  
  count(`Reported Activity`) %>% 
  
  mutate(`Reported Activity` = factor(`Reported Activity`, levels=c("None", "Antiparasitic", "Antifungal", "Antiviral","Antibacterial")),
         x=" ")


# Gather 
reported.g <- ggplot(proportion.known.activity, aes(x=n, y=x, fill=`Reported Activity`)) +
   geom_bar(position="fill", stat="identity", color="black") +
  scale_fill_manual(breaks = c("Antibacterial", "Antiviral", "Antifungal", "Antiparasitic", "None"),
                    values = c("#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#C5C5C5")) +
  
  theme_classic() +
  
  coord_flip() +
  
  theme(text=element_text(size=10),
        legend.position = "right",
        aspect.ratio = 3,
        legend.text = element_text(size = 8),
        legend.title = element_text(size=8)) +
  labs(x="Proportion with reported activity", 
       y="Predicted antimicrobial", 
       fill = "Reported Activity")

reported.g

```
    
```{r activity.counts.e}


# Get the activity counts
proportion.known.activity.e <- ecfp4_predictions.over10 %>% 
  
  # Only consider non-antibiotics
  filter(antibiotic == "not_abx") %>% 
  
  # Consolidate antiparasitic category
  mutate(`Reported Activity` = if_else(`Reported Activity` %in% c("Antiplasmodium", "Insecticide"), "Antiparasitic", `Reported Activity`)) %>% 
  
  count(`Reported Activity`) %>% 
  
  mutate(`Reported Activity` = factor(`Reported Activity`, levels=c("None", "Antiparasitic", "Antifungal", "Antiviral","Antibacterial")),
         x=" ")


# Gather 
reported.g.e <- ggplot(proportion.known.activity.e, aes(x=n, y=x, fill=`Reported Activity`)) +
   geom_bar(position="fill", stat="identity", color="black") +
  scale_fill_manual(breaks = c("Antibacterial", "Antiviral", "Antifungal", "Antiparasitic", "None"),
                    values = c("#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#C5C5C5")) +
  
  theme_classic() +
  
  coord_flip() +
  
  theme(text=element_text(size=10),
        legend.position = "right",
        aspect.ratio = 3,
        legend.text = element_text(size = 8),
        legend.title = element_text(size=8)) +
  labs(x="Proportion with reported activity", 
       y="Predicted antimicrobial", 
       fill = "Reported Activity")

reported.g.e

```

  
## Comparing MolE and ECFP4 predictions. 
  
```{r global.comparison.ap}

mole_complete_comp <- mole_predictions %>% 
  select(`Catalog Number`, nk_total, apscore_total, antibiotic, ProductName) %>% 
  rename("mole_nk_total" = "nk_total",
         "mole_apscore_total" = "apscore_total")


ecfp4_complete_comp <- ecfp4_predictions %>% 
  select(`Catalog Number`, nk_total, apscore_total) %>% 
  rename("ecfp4_nk_total" = "nk_total",
         "ecfp4_apscore_total" = "apscore_total")


complete_pred_comparison <- ecfp4_complete_comp %>% 
  left_join(mole_complete_comp, by='Catalog Number') %>% 
   mutate(antibiotic = if_else(antibiotic == "abx", "Antibiotic", "Non-Antibiotic"))

```

```{r global.comparison.nk}
ggplot(complete_pred_comparison, aes(x=mole_nk_total, y=ecfp4_nk_total, color=antibiotic)) +
  
  geom_point(alpha=0.5, size=1) +
  geom_abline(linetype="longdash", alpha=0.5, color="blue") +
  
  geom_vline(xintercept = 10,linetype="longdash", alpha=0.5, color="black") +
  geom_hline(yintercept = 10,linetype="longdash", alpha=0.5, color="black") +
  
  scale_color_manual(breaks = c("Antibiotic", "Non-Antibiotic"),
                     values=c("red", "#C5C5C5")) +
  
  theme_light() +
  
  labs(title="Predicted number of inhibited strains comparison",
       x="MolE",
       y="ECFP4",
       color="Compound Class") +
  
  theme(text=element_text(size=10),
        legend.text = element_text(size=8),
        legend.title = element_text(size=8))
```
  
Comparing predicted broad spectrum compounds. 
  
```{r ggvenn.lib}
library(ggvenn)
```


```{r broad.spec.venn}
fsize <- 3.5

# Complete comparison of broad spectrum antimicrobials
broad.list <- list("MolE" = mole_predictions.over10$`Catalog Number`,
                   "ECFP4" = ecfp4_predictions.over10$`Catalog Number`)

venn.abx.comparison <- ggvenn(broad.list,
       fill_color = c("#DE1F84", "#1F9DBB"),
       set_name_size=fsize,
       text_size=fsize)

venn.abx.comparison

# Comparison of broad spectrum antimicrobials no antibiotics
broad.list <- list("MolE" = mole_predictions.over10 %>% filter(antibiotic == "not_abx",
                                                               `Reported Activity` != "None") %>% select(`Catalog Number`) %>% unlist(),
                   "ECFP4" = ecfp4_predictions.over10 %>% filter(antibiotic == "not_abx",
                                                                 `Reported Activity` != "None") %>% select(`Catalog Number`) %>% unlist())

venn.nonabx.comparison <- ggvenn(broad.list,
       fill_color = c("#DE1F84", "#1F9DBB"),
       set_name_size=fsize,
       text_size=fsize)
venn.nonabx.comparison
```


Now compare results from literature search. 
  
```{r litsearch.comparison}


proportion.known.activity <- proportion.known.activity %>% 
  mutate(representation = "MolE")


proportion.known.activity.e <- proportion.known.activity.e %>% 
  mutate(representation = "ECFP4")

proportion.comparison <- bind_rows(proportion.known.activity, 
                                   proportion.known.activity.e)

# Gather totals for n
total_count = proportion.comparison %>% 
  group_by(representation) %>% 
  summarise(total = sum(n)) %>% 
  mutate(tx_sum = paste("n =", total))

reported.bar.comparison <- ggplot(proportion.comparison, aes(x=n, y=representation)) +
  
  annotate("text", y=c("ECFP4", "MolE"), x=1.04, label=total_count$tx_sum,) +
  
  geom_bar(position="fill", stat="identity", color="black", aes(fill=`Reported Activity`)) +
  scale_fill_manual(breaks = c("Antibacterial", "Antiviral", "Antifungal", "Antiparasitic", "None"),
                    values = c("#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#C5C5C5")) +
  
  theme_classic() +
  
  coord_flip() +
  
  theme(text=element_text(size=10),
        legend.position = "right",
        aspect.ratio = 2,
        legend.text = element_text(size = 8),
        legend.title = element_text(size=8)) +
  
  
  labs(y="", 
       x="Proportion", 
       fill = "Reported Activity")

reported.bar.comparison

```

```{r proportion.abx.broad}

mole.o10.abx <- mole_predictions.over10 %>% 
  count(antibiotic) %>% 
  mutate(representation = "MolE")

ecfp4.o10.abx <- ecfp4_predictions.over10 %>% 
  count(antibiotic) %>% 
  mutate(representation = "ECFP4")
  

over10.abx <- bind_rows(mole.o10.abx, ecfp4.o10.abx) %>% 
  mutate(antibiotic = if_else(antibiotic == "abx", "Antibiotic", "Other"),
         antibiotic = factor(antibiotic, levels=c("Other", "Antibiotic")))


abx.bar.comparison <- ggplot(over10.abx, aes(x=n, y=representation, fill=antibiotic)) +
  
   geom_bar(position="fill", stat="identity", color="black") +
  scale_fill_manual(breaks = c("Other", "Antibiotic"),
                    values = c("#C5C5C5", "red")) +
  
  theme_classic() +
  
  coord_flip() +
  
  theme(text=element_text(size=10),
        legend.position = "right",
        aspect.ratio = 2,
        legend.text = element_text(size = 8),
        legend.title = element_text(size=8)) +
  
  
  labs(y="", 
       x="Proportion", 
       fill = "Compound Class")

abx.bar.comparison
```
  
Comparing predicted broad spectrum. 
  
```{r comparing.over10}

mole.litsearch <- mole_predictions.over10 %>% 
  filter(antibiotic == "not_abx") %>% 
  select(`Catalog Number`, `Reported Activity`, ProductName) %>% 
  
  # Consolidate antiparasitic category
  mutate(`Reported Activity` = if_else(`Reported Activity` %in% c("Antiplasmodium", "Insecticide"), "Antiparasitic", `Reported Activity`))


ecfp4.litsearch <- ecfp4_predictions.over10 %>% 
  filter(antibiotic == "not_abx") %>% 
  select(`Catalog Number`, `Reported Activity`, ProductName) %>% 
  
  # Consolidate antiparasitic category
  mutate(`Reported Activity` = if_else(`Reported Activity` %in% c("Antiplasmodium", "Insecticide"), "Antiparasitic", `Reported Activity`))


combined.litsearch <- bind_rows(mole.litsearch, ecfp4.litsearch) %>% 
  distinct()


mole_o10 <- mole_predictions %>% 
  filter(`Catalog Number` %in% combined.litsearch$`Catalog Number`) %>% 
  select(`Catalog Number`, nk_total, apscore_total) %>% 
  
   rename("mole_nk_total" = "nk_total",
         "mole_apscore_total" = "apscore_total")


ecfp4_o10 <- ecfp4_predictions %>% 
  filter(`Catalog Number` %in% combined.litsearch$`Catalog Number`) %>% 
  select(`Catalog Number`, nk_total, apscore_total) %>% 
  
   rename("ecfp4_nk_total" = "nk_total",
         "ecfp4_apscore_total" = "apscore_total")


broadS.comp <- combined.litsearch %>% 
  left_join(mole_o10, by="Catalog Number") %>% 
  left_join(ecfp4_o10, by="Catalog Number") %>% 
  filter(`Reported Activity` != "None")
```


```{r broadS.comp}


# Selected chemicals
selected_chems <- c("Visomitin", 
               "Ebastine", 
               "Opicapone",
               "Cetrorelix (Acetate)",
               'Thymidine',
               "Elvitegravir")
  
# Other interests
other.interest <- c("Bergenin",
                    "Fenticonazole (Nitrate)",
                    "Doxifluridine",
                    "Broxyquinoline",
                    "Triclabendazole",
                    "Octenidine (dihydrochloride)",
                    "Benzalkonium (chloride)",
                    "Simeprevir")


# Format Product Names
selected.chemicals.data <- broadS.comp %>% 
  mutate(ProductName = if_else(ProductName %in% selected_chems, ProductName, ""),
         ProductName = if_else(ProductName == "Cetrorelix (Acetate)", "Cetrorelix", ProductName))

other.interest.data <- broadS.comp %>% 
   mutate(ProductName = if_else(ProductName %in% other.interest, ProductName, ""),
          ProductName = if_else(ProductName == "Fenticonazole (Nitrate)", "Fenticonazole", ProductName),
          ProductName = if_else(ProductName == "Octenidine (dihydrochloride)", "Octenidine", ProductName),
          ProductName = if_else(ProductName == "Benzalkonium (chloride)", "Benzalkonium", ProductName))


nk.comparison <- ggplot(broadS.comp, aes(x=mole_nk_total, y=ecfp4_nk_total, color=`Reported Activity`))  +
  
  geom_vline(xintercept = 10, linetype="longdash", color="black", alpha=0.5) +
  geom_hline(yintercept=10, linetype="longdash", color="black", alpha=0.5) + 
  
  geom_point(alpha=0.5) +
  geom_abline(linetype="longdash", color="red", alpha=0.5) +

   # Add names
  geom_text_repel(data = selected.chemicals.data, aes(x=mole_nk_total, y=ecfp4_nk_total, label=ProductName), 
                  max.overlaps = Inf, size=2, min.segment.length = 0, color="black", box.padding = 0.7, point.padding = 0.2, fontface="bold",
                  nudge_x = if_else(selected.chemicals.data$ProductName %in% c("Elvitegravir"), 1, 0)) +
  
  geom_text_repel(data = other.interest.data, aes(x=mole_nk_total, y=ecfp4_nk_total, label=ProductName), 
                  max.overlaps = Inf, size=2, min.segment.length = 0, color="black", box.padding = 0.7, point.padding = 0.2,
                  nudge_y = if_else(other.interest.data$ProductName %in% c("Benzalkonium"), 1, 0)) +
  
  
  scale_color_manual(breaks = c("Antibacterial", "Antiviral", "Antifungal", "Antiparasitic", "None"),
                    values = c("#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#C5C5C5")) +
  
  theme_light() +
  
  labs(x="MolE - N. inhibited strains",
       y="ECFP4 - N. inhibited strains")

nk.comparison

```
  

## Figure panels. 

```{r fig.arrange}
library(ggpubr)
```

 
```{r fps.fig.recreate}

top_row <- ggarrange(e.umap.over10 + theme(aspect.ratio = 0.4), reported.g.e, labels = c("a", "b"), 
          widths = c(2, 1),font.label = list(face="plain"))

bottom_row <- ggarrange(score.vs.nkill.marginal.e, predBS.fps + theme(legend.position = "none"),
          labels = c("c", "d"), font.label = list(face="plain"))

pred.fps.panel <- ggarrange(top_row, bottom_row, ncol = 1, nrow = 2, heights = c(0.8, 1))
pred.fps.panel
ggsave(filename = "../data/05.analyze_mce_predictions/ecfp4_mce_overview.pdf", 
       plot=pred.fps.panel,
       width = 21, height = 15, units="cm", dpi=300)

ggsave(filename = "../data/05.analyze_mce_predictions/ecfp4_mce_overview.svg", 
       plot=pred.fps.panel,
       width = 21, height = 15, units="cm", dpi=300)
```

```{r mole.prediction.overview}

top_row <- ggarrange(m.umap.over10 + theme(aspect.ratio = 0.4), reported.g, labels = c("a", "b"), 
          widths = c(2, 1),font.label = list(face="plain"))

bottom_row <- ggarrange(score.vs.nkill.marginal, predBS.mole + theme(legend.position = "none"),
          labels = c("c", "d"), font.label = list(face="plain"))

pred.mole.panel <- ggarrange(top_row, bottom_row, ncol = 1, nrow = 2, heights = c(0.8, 1))
pred.mole.panel
ggsave(filename = "../data/05.analyze_mce_predictions/mole_mce_overview.pdf", 
       plot=pred.mole.panel,
       width = 21, height = 15, units="cm", dpi=300)

```
  
Combine complete prediction information

```{r combine.pred.info}
ecfp4_predranks <- ecfp4_predictions %>% 
  mutate(representation = "ECFP4")

mole_predranks <- mole_predictions %>% 
  mutate(representation = "MolE")

complete_predranks <- bind_rows(ecfp4_predranks, mole_predranks)
```

```{r known.abx.hist}
nk.abx.hist <- complete_predranks %>% 
  filter(antibiotic == "abx") %>% 
  
  ggplot(aes(x=nk_total, color=representation, color=representation)) +
  geom_histogram(binwidth = 1, aes(y=after_stat(count)), position="dodge", alpha=0.1, fill="white") +
  geom_density(alpha=0.2, aes(y=after_stat(count)), size=1.25, adjust=1/5) +
  
  scale_color_manual(values=c("#1F9DBB", "#DE1F84")) +

  theme_light() +
  
  labs(x="Predicted number of inhibited strains",
       y="Count",
       color="Representation",
       title = "Known antibiotics")

nk.abx.hist
```


```{r litsearch.comparison.}
mole.o10 <- mole_predictions.over10 %>% 
  filter(antibiotic == "not_abx") %>% 
  filter(`Reported Activity` != "None") %>% 
  select(`Catalog Number`) %>% 
  unlist()

ecfp4.o10 <- ecfp4_predictions.over10 %>% 
  filter(antibiotic == "not_abx") %>% 
  filter(`Reported Activity` != "None") %>% 
  select(`Catalog Number`) %>% 
  unlist()


ecfp4.comppreds <- complete_predranks %>% 
  filter(representation == "ECFP4",
         `Catalog Number` %in% ecfp4.o10)

mole.comppreds <- complete_predranks %>% 
  filter(representation == "MolE",
         `Catalog Number` %in% mole.o10)


complete_litsearch_comparison <- bind_rows(ecfp4.comppreds, mole.comppreds)

```


```{r litsearch.hist.nk}

nk.ls.hist <- ggplot(complete_litsearch_comparison, aes(x=nk_total, color=representation)) +
  geom_histogram(binwidth = 1, aes(y=after_stat(count)), position="dodge", alpha=0.1, fill="white") +
  geom_density(alpha=0.2, aes(y=after_stat(count)), size=1.25, adjust=1/4) +
  
  scale_color_manual(values=c("#1F9DBB", "#DE1F84")) +
  #scale_fill_manual(values=c("#1F9DBB", "#DE1F84")) +
  
  theme_light() +
   labs(x="Predicted number of inhibited strains",
       y="Count",
       color="Representation",
       title = "Non-antibiotic antimicrobials")


nk.ls.hist
```


```{r panel.nkill.comp}

panel.comp.nk <- ggarrange(nk.abx.hist, nk.ls.hist,
          nrow = 1, ncol = 2,
          labels = c("a", "b"),
          font.label = list(face="plain"),
          common.legend = TRUE,
          legend = "right")
panel.comp.nk
```


```{r literature.comparison}
panel.comp.lit <- ggarrange(reported.bar.comparison, nk.comparison,
          nrow = 1, ncol = 2,
          labels = c("c", "d"),
          font.label = list(face="plain"),
          common.legend = TRUE,
          legend = "right")

panel.comp.lit
```


```{r complete.panel.comp}

complete.comp.panel <- ggarrange(panel.comp.nk, 
                                 NULL,
                                 panel.comp.lit,
                                 heights = c(1, 0.1, 1),
          nrow = 3, ncol = 1)


ggsave(plot = complete.comp.panel, 
       filename = "../data/05.analyze_mce_predictions/mce_pred_comparison.pdf",
       width = 21, height = 18, units="cm", dpi=300)


ggsave(plot = complete.comp.panel, 
       filename = "../data/05.analyze_mce_predictions/mce_pred_comparison.png",
       width = 21, height = 18, units="cm", dpi=300)


complete.comp.panel
```


## Supplementary Files. 
    
    
### SF8

```{r, eval=FALSE, include=FALSE}

sf8a <- complete_predranks_wide_abx <- complete_predranks %>% 
  filter(antibiotic=="abx") %>% 
  pivot_wider(id_cols = c(`Catalog Number`, ProductName, antibiotic), names_from = representation, values_from = nk_total) %>% 
  
  rename("ECFP4_predicted_n_inhibited_strains" = "ECFP4",
         "MolE_predicted_n_inhibited_strains" = "MolE")


sf8bd <- broadS.comp %>% 
  mutate(antibiotic="not_abx") %>% 
  select(`Catalog Number`, ProductName, antibiotic, `Reported Activity`, mole_nk_total, ecfp4_nk_total) %>% 
  rename("ECFP4_predicted_n_inhibited_strains" = "ecfp4_nk_total",
         "MolE_predicted_n_inhibited_strains" = "mole_nk_total")

sf8c <- proportion.comparison %>% 
  pivot_wider(id_cols = `Reported Activity`, names_from = representation, values_from = n)


```

```{r, eval=FALSE, include=FALSE}
library(xlsx)
```


```{r, eval=FALSE, include=FALSE}
write.xlsx2(sf8a, "../data/Supplementary Source Data/Supplementary Figure 8.xlsx",
           sheetName = "SF8a",
           col.names = TRUE, 
           row.names = TRUE, 
           append = FALSE,
           showNA = FALSE)

write.xlsx2(sf8bd, "../data/Supplementary Source Data/Supplementary Figure 8.xlsx",
           sheetName = "SF8b,d",
           col.names = TRUE, 
           row.names = TRUE, 
           append = TRUE,
           showNA = FALSE)

write.xlsx2(sf8c, "../data/Supplementary Source Data/Supplementary Figure 8.xlsx",
           sheetName = "SF8c",
           col.names = TRUE, 
           row.names = TRUE, 
           append = TRUE,
           showNA = FALSE)
```

### SF13

```{r}
# Names
chem_names <- read_tsv("../data/04.new_predictions/medchemexpress_filtered.tsv.gz", show_col_types = FALSE)

# MolE scores
mole_scores <- read_excel("../data/04.new_predictions/mole_mce_predictions_litsearch.xlsx", sheet = "mole_scores") %>% 
  pivot_longer(cols = -chem_id, names_to = "strain", values_to = "MolE_predictive_probability") %>% 
  
  unite("pred_id", c(chem_id, strain), sep=":", remove=FALSE)

# ECFP4 scores
ecfp4_scores <- read_excel("../data/04.new_predictions/ecfp4_mce_predictions_litsearch.xlsx", sheet = "ecfp4_scores") %>% 
  pivot_longer(cols = -chem_id, names_to = "strain", values_to = "ECFP4_predictive_probability") %>% 
  unite("pred_id", c(chem_id, strain), sep=":")

# ChemDesc scores
chemDesc_scores <- read_excel("../../udl_microbiome/data/chemDesc_mce_predictions.xlsx", sheet = "chemDesc_scores") %>% 
  pivot_longer(cols = -chem_id, names_to = "strain", values_to = "chemDesc_predictive_probability") %>% 
  unite("pred_id", c(chem_id, strain), sep=":")


# Optimal thresholds
optimal_thresholds <- read_tsv("../data/03.model_evaluation/optimal_thresholds.tsv.gz", show_col_types = FALSE) %>% 
  filter(score_type == "optimized") %>% 
  select(representation, threshold)


```


```{r}
optimal_thresholds
```


```{r}
selected.predictions <- chem_names %>% 
  filter(ProductName %in% c("Ebastine", "Elvitegravir", "Visomitin", "Opicapone", "Thymidine", "Cetrorelix (Acetate)")) %>% 
  select(`Catalog Number`, ProductName) %>% 
  rename("chem_id" = "Catalog Number") %>% 
  
  left_join(mole_scores, by="chem_id") %>% 
  left_join(ecfp4_scores, by="pred_id") %>% 
  left_join(chemDesc_scores, by="pred_id") %>% 
  
  mutate("MolE_optimal_threshold" = optimal_thresholds %>% filter(representation == "MolE") %>% select(threshold) %>% unlist(),
         "ECPF4_optimal_threshold" = optimal_thresholds %>% filter(representation == "ecfp4") %>% select(threshold) %>% unlist(),
         "chemDesc_optimal_threshold" = optimal_thresholds %>% filter(representation == "chemDesc") %>% select(threshold) %>% unlist(),
         
         
         "MolE_predicted_growth_inhibition" = if_else(MolE_predictive_probability >= MolE_optimal_threshold, 1, 0),
         "ECFP4_predicted_growth_inhibition" = if_else(ECFP4_predictive_probability >= ECPF4_optimal_threshold, 1, 0),
         "chemDesc_predicted_growth_inhibition" = if_else(chemDesc_predictive_probability >= chemDesc_optimal_threshold, 1, 0)) %>% 
    
    select(-pred_id)
```

```{r}
write.xlsx2(selected.predictions %>% filter(ProductName == "Cetrorelix (Acetate)"), 
            "../data/Supplementary Source Data/Supplementary Figure 13.xlsx",
           sheetName = "SF13a",
           col.names = TRUE, 
           row.names = TRUE, 
           append = FALSE,
           showNA = TRUE)

write.xlsx2(selected.predictions %>% filter(ProductName == "Ebastine"), 
            "../data/Supplementary Source Data/Supplementary Figure 13.xlsx",
           sheetName = "SF13b",
           col.names = TRUE, 
           row.names = TRUE, 
           append = TRUE,
           showNA = FALSE)

write.xlsx2(selected.predictions %>% filter(ProductName == "Elvitegravir"), 
            "../data/Supplementary Source Data/Supplementary Figure 13.xlsx",
           sheetName = "SF13c",
           col.names = TRUE, 
           row.names = TRUE, 
           append = TRUE,
           showNA = FALSE)

write.xlsx2(selected.predictions %>% filter(ProductName == "Opicapone"), 
            "../data/Supplementary Source Data/Supplementary Figure 13.xlsx",
           sheetName = "SF13d",
           col.names = TRUE, 
           row.names = TRUE, 
           append = TRUE,
           showNA = FALSE)


write.xlsx2(selected.predictions %>% filter(ProductName == "Thymidine"), 
            "../data/Supplementary Source Data/Supplementary Figure 13.xlsx",
           sheetName = "SF13e",
           col.names = TRUE, 
           row.names = TRUE, 
           append = TRUE,
           showNA = FALSE)

write.xlsx2(selected.predictions %>% filter(ProductName == "Visomitin"), 
            "../data/Supplementary Source Data/Supplementary Figure 13.xlsx",
           sheetName = "SF13f",
           col.names = TRUE, 
           row.names = TRUE, 
           append = TRUE,
           showNA = FALSE)
```


```{r}
selected.predictions 
```


## Session Info

```{r session.info}
sessionInfo()
```