mole_broad_spectrum_parallel/workflow/01.prepare_training_data.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prepare Training Data  \n",
    "\n",
    "### Author: Roberto Olayo-Alarcon\n",
    "  \n",
    "In this step we read the data from [Maier, L., et al (2018)](https://www.nature.com/articles/nature25979). And prepare it to be used for training, testing and validation of the XGBoost model. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read Libraries\n",
    "import os\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from tqdm import tqdm\n",
    "import pubchempy as pcp\n",
    "\n",
    "from dataset.dataset_representation import process_dataset\n",
    "\n",
    "\n",
    "from rdkit import Chem\n",
    "from rdkit.Chem import Descriptors\n",
    "from rdkit.Chem.SaltRemover import SaltRemover"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Global variables  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Raw data dir\n",
    "RAW_DATA_DIR = \"../raw_data/\"\n",
    "\n",
    "# Directory from which to read the raw data\n",
    "INPUT_DIR = os.path.join(RAW_DATA_DIR, \"maier_microbiome\")\n",
    "\n",
    "# Create the output directory\n",
    "OUTPUT_DIR = \"../data/01.prepare_training_data\"\n",
    "os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
    "\n",
    "# Pvalue cutoff for label determination\n",
    "PVAL_CUTOFF = 0.05"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read data and binarize\n",
    "  \n",
    "Here we read in Supplementary Table 3 from the study."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Akkermansia muciniphila (NT5021)</th>\n",
       "      <th>Bacteroides caccae (NT5050)</th>\n",
       "      <th>Bacteroides fragilis (ET) (NT5033)</th>\n",
       "      <th>Bacteroides fragilis (NT) (NT5003)</th>\n",
       "      <th>Bacteroides ovatus (NT5054)</th>\n",
       "      <th>Bacteroides thetaiotaomicron (NT5004)</th>\n",
       "      <th>Bacteroides uniformis (NT5002)</th>\n",
       "      <th>Bacteroides vulgatus (NT5001)</th>\n",
       "      <th>Bacteroides xylanisolvens (NT5064)</th>\n",
       "      <th>Bifidobacterium adolescentis (NT5022)</th>\n",
       "      <th>...</th>\n",
       "      <th>Parabacteroides merdae (NT5071)</th>\n",
       "      <th>Prevotella copri (NT5019)</th>\n",
       "      <th>Roseburia hominis (NT5079)</th>\n",
       "      <th>Roseburia intestinalis (NT5011)</th>\n",
       "      <th>Ruminococcus bromii (NT5045)</th>\n",
       "      <th>Ruminococcus gnavus (NT5046)</th>\n",
       "      <th>Ruminococcus torques (NT5047)</th>\n",
       "      <th>Streptococcus parasanguinis (NT5072)</th>\n",
       "      <th>Streptococcus salivarius (NT5038)</th>\n",
       "      <th>Veillonella parvula (NT5017)</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>prestwick_ID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Prestw-1109</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Prestw-1399</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Prestw-145</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Prestw-1464</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Prestw-31</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 40 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              Akkermansia muciniphila (NT5021)  Bacteroides caccae (NT5050)  \\\n",
       "prestwick_ID                                                                  \n",
       "Prestw-1109                                  1                            1   \n",
       "Prestw-1399                                  1                            1   \n",
       "Prestw-145                                   1                            1   \n",
       "Prestw-1464                                  1                            1   \n",
       "Prestw-31                                    1                            1   \n",
       "\n",
       "              Bacteroides fragilis (ET) (NT5033)  \\\n",
       "prestwick_ID                                       \n",
       "Prestw-1109                                    1   \n",
       "Prestw-1399                                    1   \n",
       "Prestw-145                                     1   \n",
       "Prestw-1464                                    1   \n",
       "Prestw-31                                      1   \n",
       "\n",
       "              Bacteroides fragilis (NT) (NT5003)  Bacteroides ovatus (NT5054)  \\\n",
       "prestwick_ID                                                                    \n",
       "Prestw-1109                                    1                            1   \n",
       "Prestw-1399                                    1                            1   \n",
       "Prestw-145                                     1                            1   \n",
       "Prestw-1464                                    1                            1   \n",
       "Prestw-31                                      1                            1   \n",
       "\n",
       "              Bacteroides thetaiotaomicron (NT5004)  \\\n",
       "prestwick_ID                                          \n",
       "Prestw-1109                                       1   \n",
       "Prestw-1399                                       1   \n",
       "Prestw-145                                        1   \n",
       "Prestw-1464                                       1   \n",
       "Prestw-31                                         1   \n",
       "\n",
       "              Bacteroides uniformis (NT5002)  Bacteroides vulgatus (NT5001)  \\\n",
       "prestwick_ID                                                                  \n",
       "Prestw-1109                                1                              1   \n",
       "Prestw-1399                                1                              1   \n",
       "Prestw-145                                 1                              1   \n",
       "Prestw-1464                                1                              1   \n",
       "Prestw-31                                  1                              1   \n",
       "\n",
       "              Bacteroides xylanisolvens (NT5064)  \\\n",
       "prestwick_ID                                       \n",
       "Prestw-1109                                    1   \n",
       "Prestw-1399                                    1   \n",
       "Prestw-145                                     1   \n",
       "Prestw-1464                                    1   \n",
       "Prestw-31                                      1   \n",
       "\n",
       "              Bifidobacterium adolescentis (NT5022)  ...  \\\n",
       "prestwick_ID                                         ...   \n",
       "Prestw-1109                                       1  ...   \n",
       "Prestw-1399                                       1  ...   \n",
       "Prestw-145                                        1  ...   \n",
       "Prestw-1464                                       1  ...   \n",
       "Prestw-31                                         1  ...   \n",
       "\n",
       "              Parabacteroides merdae (NT5071)  Prevotella copri (NT5019)  \\\n",
       "prestwick_ID                                                               \n",
       "Prestw-1109                                 1                          1   \n",
       "Prestw-1399                                 1                          1   \n",
       "Prestw-145                                  1                          1   \n",
       "Prestw-1464                                 1                          1   \n",
       "Prestw-31                                   1                          1   \n",
       "\n",
       "              Roseburia hominis (NT5079)  Roseburia intestinalis (NT5011)  \\\n",
       "prestwick_ID                                                                \n",
       "Prestw-1109                            1                                1   \n",
       "Prestw-1399                            1                                1   \n",
       "Prestw-145                             1                                1   \n",
       "Prestw-1464                            1                                1   \n",
       "Prestw-31                              1                                1   \n",
       "\n",
       "              Ruminococcus bromii (NT5045)  Ruminococcus gnavus (NT5046)  \\\n",
       "prestwick_ID                                                               \n",
       "Prestw-1109                              1                             1   \n",
       "Prestw-1399                              1                             1   \n",
       "Prestw-145                               1                             1   \n",
       "Prestw-1464                              1                             1   \n",
       "Prestw-31                                1                             1   \n",
       "\n",
       "              Ruminococcus torques (NT5047)  \\\n",
       "prestwick_ID                                  \n",
       "Prestw-1109                               1   \n",
       "Prestw-1399                               1   \n",
       "Prestw-145                                1   \n",
       "Prestw-1464                               1   \n",
       "Prestw-31                                 1   \n",
       "\n",
       "              Streptococcus parasanguinis (NT5072)  \\\n",
       "prestwick_ID                                         \n",
       "Prestw-1109                                      1   \n",
       "Prestw-1399                                      1   \n",
       "Prestw-145                                       1   \n",
       "Prestw-1464                                      1   \n",
       "Prestw-31                                        1   \n",
       "\n",
       "              Streptococcus salivarius (NT5038)  Veillonella parvula (NT5017)  \n",
       "prestwick_ID                                                                   \n",
       "Prestw-1109                                   1                             1  \n",
       "Prestw-1399                                   1                             1  \n",
       "Prestw-145                                    1                             1  \n",
       "Prestw-1464                                   1                             1  \n",
       "Prestw-31                                     1                             1  \n",
       "\n",
       "[5 rows x 40 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the raw datas\n",
    "screen_df = pd.read_excel(os.path.join(INPUT_DIR, \"screen_results_info_SF3.xlsx\"))\n",
    "\n",
    "# Clean the data\n",
    "screen_df.drop(columns=[\"chemical_name\", \"drug_class\", \"n_hit\"], inplace=True)\n",
    "screen_df.set_index(\"prestwick_ID\", inplace=True)\n",
    "\n",
    "# Convert the data to binary\n",
    "screen_df = screen_df <= PVAL_CUTOFF\n",
    "screen_df = screen_df.astype(int)\n",
    "screen_df.head()\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Gather SMILES\n",
    "  \n",
    "We use the chemical names to gather the SMILES from PubChem using [PubChemPy](https://pubchempy.readthedocs.io/en/latest/)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# FUNCTIONS FOR PROCESSING OBTAINING SMILES\n",
    "\n",
    "def clean_names_chemlibrary(original_name):\n",
    "    \n",
    "    # Remove additional information from name \n",
    "    name = original_name.split(\" (\")[0]\n",
    "    name = name.split(\" [\")[0]\n",
    "    name = name.rstrip()\n",
    "    \n",
    "    return name\n",
    "\n",
    "def get_pubchemid(name):\n",
    "    \n",
    "    \"\"\"\n",
    "    Retrieve PubChem compound information based on the given name.\n",
    "\n",
    "    Parameters:\n",
    "    - name (str): The name of the compound to search for.\n",
    "\n",
    "    Returns:\n",
    "    - result_df (pandas.DataFrame): DataFrame containing PubChem compound information.\n",
    "        Columns:\n",
    "            - 'name' (str): The name of the compound.\n",
    "            - 'cid' (str): PubChem Compound ID. 'not_found' if the compound is not found.\n",
    "            - 'pchem_canonical_smile' (str): Canonical SMILES representation of the compound.\n",
    "                'not_found' if the compound is not found.\n",
    "            - 'pchem_isomeric_smile' (str): Isomeric SMILES representation of the compound.\n",
    "                'not_found' if the compound is not found.\n",
    "            - 'pchem_inchi' (str): InChI representation of the compound.\n",
    "                'not_found' if the compound is not found.\n",
    "            - 'pchem_inchikey' (str): InChIKey representation of the compound.\n",
    "                'not_found' if the compound is not found.\n",
    "    \"\"\"\n",
    "    \n",
    "    # Attempt to find result with search name\n",
    "    results = pcp.get_compounds(name, 'name')\n",
    "    result_dict = {}\n",
    "\n",
    "    # If that did not work, use the clean name\n",
    "    if len(results) == 0:\n",
    "        clean_name = clean_names_chemlibrary(name)\n",
    "        results = pcp.get_compounds(clean_name, \"name\")\n",
    "    \n",
    "    # Now prepare the output\n",
    "    if len(results) > 0:\n",
    "        result_dict[\"name\"] = name \n",
    "        result_dict[\"cid\"] = results[0].cid\n",
    "        result_dict[\"pchem_canonical_smile\"] = results[0].canonical_smiles\n",
    "        result_dict[\"pchem_isomeric_smile\"] = results[0].isomeric_smiles\n",
    "        result_dict[\"pchem_inchi\"] = results[0].inchi\n",
    "        result_dict[\"pchem_inchikey\"] = results[0].inchikey\n",
    "        \n",
    "    else:\n",
    "        result_dict[\"name\"] = name\n",
    "        result_dict[\"cid\"] = \"not_found\"\n",
    "        result_dict[\"pchem_canonical_smile\"] = \"not_found\"\n",
    "        result_dict[\"pchem_isomeric_smile\"] = \"not_found\"\n",
    "        result_dict[\"pchem_inchi\"] = \"not_found\"\n",
    "        result_dict[\"pchem_inchikey\"] = \"not_found\"\n",
    "        \n",
    "    result_df = pd.DataFrame(result_dict, index=[0])\n",
    "    \n",
    "    return result_df\n",
    "\n",
    "\n",
    "def cid_info(cid, df):\n",
    "    \"\"\"\n",
    "    Retrieve information about a compound from PubChem using its CID (Compound ID).\n",
    "\n",
    "    Parameters:\n",
    "    - cid (str): PubChem Compound ID of the compound to query.\n",
    "    - df (pandas.DataFrame): DataFrame containing compound information with 'cid' column.\n",
    "\n",
    "    Returns:\n",
    "    - result_df (pandas.DataFrame): DataFrame containing PubChem compound information.\n",
    "        Columns:\n",
    "            - 'name' (str): The name of the compound.\n",
    "            - 'cid' (str): PubChem Compound ID.\n",
    "            - 'pchem_canonical_smile' (str): Canonical SMILES representation of the compound.\n",
    "            - 'pchem_isomeric_smile' (str): Isomeric SMILES representation of the compound.\n",
    "            - 'pchem_inchi' (str): InChI representation of the compound.\n",
    "            - 'pchem_inchikey' (str): InChIKey representation of the compound.\n",
    "    \"\"\"\n",
    "    \n",
    "    # Query Pubchem\n",
    "    results = pcp.Compound.from_cid(cid)\n",
    "    \n",
    "    # Init dictionary\n",
    "    result_dict = {}\n",
    "    \n",
    "    # Prepare output\n",
    "    result_dict[\"name\"] = df.loc[df[\"cid\"]==cid, \"name\"].values[0]\n",
    "    result_dict[\"cid\"] = cid\n",
    "    result_dict[\"pchem_canonical_smile\"] = results.canonical_smiles\n",
    "    result_dict[\"pchem_isomeric_smile\"] = results.isomeric_smiles\n",
    "    result_dict[\"pchem_inchi\"] = results.inchi\n",
    "    result_dict[\"pchem_inchikey\"] = results.inchikey\n",
    "    \n",
    "    return pd.DataFrame(result_dict, index=[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# READ THE DATA\n",
    "\n",
    "maier_chemicals = pd.read_excel(os.path.join(INPUT_DIR, \"chem_library_info_SF1.xlsx\"))\n",
    "maier_chemicals.set_index(\"prestwick_ID\", inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "I will use the **chemical_name** field to query PubChem (via pubchempy) to find their SMILES and other relevant information (this might take some time)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1200/1200 [10:43<00:00,  1.87it/s]\n",
      "100%|██████████| 9/9 [00:04<00:00,  2.14it/s]\n"
     ]
    }
   ],
   "source": [
    "# Iterate over the unique names in the dataset\n",
    "cid_search = pd.concat([get_pubchemid(name) for name in tqdm(maier_chemicals[\"chemical name\"].unique())])\n",
    "\n",
    "# Couldn't find these chemicals using PCP, so we will manually add them\n",
    "manual_cid = pd.DataFrame([[\"(-)-Eseroline fumarate salt\", 16219298], [\"Clonixin Lysinate\", 3080836], [\"Ziprasidone  Hydrochloride\", 219099],\n",
    "             [\"Clavulanate potassium salt\", 23665591], [\"Oxibendazol\", 4622], \n",
    "             [\"Morpholinoethylamino-3-benzocyclohepta-(5,6-c)-pyridazine dihydrochloride\", 195164],\n",
    "             [\"Gabazine bromide\", 71316800], [\"Colistin sulfate\", 91885449], [\"Bacitracin\", 11980094]], \n",
    "             columns=[\"name\", \"cid\"])\n",
    "\n",
    "manual_try = pd.concat([cid_info(c, df=manual_cid) for c in tqdm(manual_cid.cid.to_list())])\n",
    "\n",
    "# Concatenate the results\n",
    "chem_smiles = pd.concat([cid_search[cid_search[\"cid\"] != \"not_found\"], manual_try]).drop_duplicates()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can now get the RDKit versions of these SMILES"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Obtiain canonical smiles\n",
    "chem_smiles[\"rdkit_canonical_smile\"] = chem_smiles[\"pchem_canonical_smile\"].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x),\n",
    "                                                                                          canonical=True, \n",
    "                                                                                          isomericSmiles=False))\n",
    "\n",
    "# Obtain isomeric smiles\n",
    "chem_smiles[\"rdkit_isomeric_smile\"] = chem_smiles[\"pchem_canonical_smile\"].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x),\n",
    "                                                                                          canonical=True, \n",
    "                                                                                          isomericSmiles=True))\n",
    "\n",
    "# Obtain chemical metadata\n",
    "chem_smiles[\"n_atoms\"] = chem_smiles[\"pchem_canonical_smile\"].apply(lambda x: Chem.MolFromSmiles(x).GetNumAtoms())\n",
    "chem_smiles[\"n_bonds\"] = chem_smiles[\"pchem_canonical_smile\"].apply(lambda x: Chem.MolFromSmiles(x).GetNumBonds())\n",
    "chem_smiles[\"ExactMolWt\"] = chem_smiles[\"pchem_canonical_smile\"].apply(lambda x: Descriptors.ExactMolWt(Chem.MolFromSmiles(x)))\n",
    "\n",
    "\n",
    "# Remove the salts\n",
    "remover = SaltRemover()\n",
    "chem_smiles[\"rdkit_no_salt\"] = chem_smiles[\"rdkit_canonical_smile\"].apply(lambda x: Chem.MolToSmiles(remover.StripMol(Chem.MolFromSmiles(x))))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we can combine this information with the provided data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>prestwick_ID</th>\n",
       "      <th>STITCH4 id</th>\n",
       "      <th>ATC codes</th>\n",
       "      <th>target species</th>\n",
       "      <th>dose (µmol)</th>\n",
       "      <th>estimated intestine concentration (µM)</th>\n",
       "      <th>plasma concentration (µM)</th>\n",
       "      <th>source for plasma concentration</th>\n",
       "      <th>fraction excreted in feces</th>\n",
       "      <th>fraction excreted in urine</th>\n",
       "      <th>...</th>\n",
       "      <th>pchem_canonical_smile</th>\n",
       "      <th>pchem_isomeric_smile</th>\n",
       "      <th>pchem_inchi</th>\n",
       "      <th>pchem_inchikey</th>\n",
       "      <th>rdkit_canonical_smile</th>\n",
       "      <th>rdkit_isomeric_smile</th>\n",
       "      <th>n_atoms</th>\n",
       "      <th>n_bonds</th>\n",
       "      <th>ExactMolWt</th>\n",
       "      <th>rdkit_no_salt</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>chemical name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Azaguanine-8</th>\n",
       "      <td>Prestw-1</td>\n",
       "      <td>CID100008646</td>\n",
       "      <td>L01BB</td>\n",
       "      <td>human</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>C12=NNN=C1N=C(NC2=O)N</td>\n",
       "      <td>C12=NNN=C1N=C(NC2=O)N</td>\n",
       "      <td>InChI=1S/C4H4N6O/c5-4-6-2-1(3(11)7-4)8-10-9-2/...</td>\n",
       "      <td>LPXQRXLUHJKZIE-UHFFFAOYSA-N</td>\n",
       "      <td>Nc1nc2n[nH]nc2c(=O)[nH]1</td>\n",
       "      <td>Nc1nc2n[nH]nc2c(=O)[nH]1</td>\n",
       "      <td>11.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>152.044659</td>\n",
       "      <td>Nc1nc2n[nH]nc2c(=O)[nH]1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Sulfaguanidine</th>\n",
       "      <td>Prestw-10</td>\n",
       "      <td>CID100005324</td>\n",
       "      <td>A07AB03</td>\n",
       "      <td>bacteria</td>\n",
       "      <td>18670.009221</td>\n",
       "      <td>6223.336407</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>C1=CC(=CC=C1N)S(=O)(=O)N=C(N)N</td>\n",
       "      <td>C1=CC(=CC=C1N)S(=O)(=O)N=C(N)N</td>\n",
       "      <td>InChI=1S/C7H10N4O2S/c8-5-1-3-6(4-2-5)14(12,13)...</td>\n",
       "      <td>BRBKOPJOKNSWSG-UHFFFAOYSA-N</td>\n",
       "      <td>NC(N)=NS(=O)(=O)c1ccc(N)cc1</td>\n",
       "      <td>NC(N)=NS(=O)(=O)c1ccc(N)cc1</td>\n",
       "      <td>14.0</td>\n",
       "      <td>14.0</td>\n",
       "      <td>214.052447</td>\n",
       "      <td>NC(N)=NS(=O)(=O)c1ccc(N)cc1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Nocodazole</th>\n",
       "      <td>Prestw-100</td>\n",
       "      <td>CID100004122</td>\n",
       "      <td>L01XX</td>\n",
       "      <td>human</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>COC(=O)NC1=NC2=C(N1)C=C(C=C2)C(=O)C3=CC=CS3</td>\n",
       "      <td>COC(=O)NC1=NC2=C(N1)C=C(C=C2)C(=O)C3=CC=CS3</td>\n",
       "      <td>InChI=1S/C14H11N3O3S/c1-20-14(19)17-13-15-9-5-...</td>\n",
       "      <td>KYRVNWMVYQXFEU-UHFFFAOYSA-N</td>\n",
       "      <td>COC(=O)Nc1nc2ccc(C(=O)c3cccs3)cc2[nH]1</td>\n",
       "      <td>COC(=O)Nc1nc2ccc(C(=O)c3cccs3)cc2[nH]1</td>\n",
       "      <td>21.0</td>\n",
       "      <td>23.0</td>\n",
       "      <td>301.052112</td>\n",
       "      <td>COC(=O)Nc1nc2ccc(C(=O)c3cccs3)cc2[nH]1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Lymecycline</th>\n",
       "      <td>Prestw-1000</td>\n",
       "      <td>CID154687131</td>\n",
       "      <td>J01AA04</td>\n",
       "      <td>bacteria</td>\n",
       "      <td>995.607694</td>\n",
       "      <td>331.869231</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>CC1(C2CC3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4...</td>\n",
       "      <td>C[C@@]1([C@H]2C[C@H]3[C@@H](C(=O)C(=C([C@]3(C(...</td>\n",
       "      <td>InChI=1S/C29H38N4O10/c1-28(42)13-7-6-9-17(34)1...</td>\n",
       "      <td>PZTCVADFMACKLU-UEPZRUIBSA-N</td>\n",
       "      <td>CN(C)C1C(=O)C(C(=O)NCNCCCCC(N)C(=O)O)=C(O)C2(O...</td>\n",
       "      <td>CN(C)C1C(=O)C(C(=O)NCNCCCCC(N)C(=O)O)=C(O)C2(O...</td>\n",
       "      <td>43.0</td>\n",
       "      <td>46.0</td>\n",
       "      <td>602.258793</td>\n",
       "      <td>CN(C)C1C(=O)C(C(=O)NCNCCCCC(N)C(=O)O)=C(O)C2(O...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Alfadolone acetate</th>\n",
       "      <td>Prestw-1001</td>\n",
       "      <td>CID100024733</td>\n",
       "      <td>N01AX</td>\n",
       "      <td>human</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>CC(=O)OCC(=O)C1CCC2C1(CC(=O)C3C2CCC4C3(CCC(C4)...</td>\n",
       "      <td>CC(=O)OCC(=O)[C@H]1CC[C@@H]2[C@@]1(CC(=O)[C@H]...</td>\n",
       "      <td>InChI=1S/C23H34O5/c1-13(24)28-12-20(27)18-7-6-...</td>\n",
       "      <td>QRJOQYLXZPQQMX-FWROMSNXSA-N</td>\n",
       "      <td>CC(=O)OCC(=O)C1CCC2C3CCC4CC(O)CCC4(C)C3C(=O)CC12C</td>\n",
       "      <td>CC(=O)OCC(=O)C1CCC2C3CCC4CC(O)CCC4(C)C3C(=O)CC12C</td>\n",
       "      <td>28.0</td>\n",
       "      <td>31.0</td>\n",
       "      <td>390.240624</td>\n",
       "      <td>CC(=O)OCC(=O)C1CCC2C3CCC4CC(O)CCC4(C)C3C(=O)CC12C</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 29 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                   prestwick_ID    STITCH4 id ATC codes target species  \\\n",
       "chemical name                                                            \n",
       "Azaguanine-8           Prestw-1  CID100008646     L01BB          human   \n",
       "Sulfaguanidine        Prestw-10  CID100005324   A07AB03       bacteria   \n",
       "Nocodazole           Prestw-100  CID100004122     L01XX          human   \n",
       "Lymecycline         Prestw-1000  CID154687131   J01AA04       bacteria   \n",
       "Alfadolone acetate  Prestw-1001  CID100024733     N01AX          human   \n",
       "\n",
       "                     dose (µmol)  estimated intestine concentration (µM)  \\\n",
       "chemical name                                                              \n",
       "Azaguanine-8                 NaN                                     NaN   \n",
       "Sulfaguanidine      18670.009221                             6223.336407   \n",
       "Nocodazole                   NaN                                     NaN   \n",
       "Lymecycline           995.607694                              331.869231   \n",
       "Alfadolone acetate           NaN                                     NaN   \n",
       "\n",
       "                    plasma concentration (µM) source for plasma concentration  \\\n",
       "chemical name                                                                   \n",
       "Azaguanine-8                              NaN                             NaN   \n",
       "Sulfaguanidine                            NaN                             NaN   \n",
       "Nocodazole                                NaN                             NaN   \n",
       "Lymecycline                               NaN                             NaN   \n",
       "Alfadolone acetate                        NaN                             NaN   \n",
       "\n",
       "                    fraction excreted in feces  fraction excreted in urine  \\\n",
       "chemical name                                                                \n",
       "Azaguanine-8                               NaN                         NaN   \n",
       "Sulfaguanidine                             NaN                         NaN   \n",
       "Nocodazole                                 NaN                         NaN   \n",
       "Lymecycline                                NaN                         NaN   \n",
       "Alfadolone acetate                         NaN                         NaN   \n",
       "\n",
       "                    ...                              pchem_canonical_smile  \\\n",
       "chemical name       ...                                                      \n",
       "Azaguanine-8        ...                              C12=NNN=C1N=C(NC2=O)N   \n",
       "Sulfaguanidine      ...                     C1=CC(=CC=C1N)S(=O)(=O)N=C(N)N   \n",
       "Nocodazole          ...        COC(=O)NC1=NC2=C(N1)C=C(C=C2)C(=O)C3=CC=CS3   \n",
       "Lymecycline         ...  CC1(C2CC3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4...   \n",
       "Alfadolone acetate  ...  CC(=O)OCC(=O)C1CCC2C1(CC(=O)C3C2CCC4C3(CCC(C4)...   \n",
       "\n",
       "                                                 pchem_isomeric_smile  \\\n",
       "chemical name                                                           \n",
       "Azaguanine-8                                    C12=NNN=C1N=C(NC2=O)N   \n",
       "Sulfaguanidine                         C1=CC(=CC=C1N)S(=O)(=O)N=C(N)N   \n",
       "Nocodazole                COC(=O)NC1=NC2=C(N1)C=C(C=C2)C(=O)C3=CC=CS3   \n",
       "Lymecycline         C[C@@]1([C@H]2C[C@H]3[C@@H](C(=O)C(=C([C@]3(C(...   \n",
       "Alfadolone acetate  CC(=O)OCC(=O)[C@H]1CC[C@@H]2[C@@]1(CC(=O)[C@H]...   \n",
       "\n",
       "                                                          pchem_inchi  \\\n",
       "chemical name                                                           \n",
       "Azaguanine-8        InChI=1S/C4H4N6O/c5-4-6-2-1(3(11)7-4)8-10-9-2/...   \n",
       "Sulfaguanidine      InChI=1S/C7H10N4O2S/c8-5-1-3-6(4-2-5)14(12,13)...   \n",
       "Nocodazole          InChI=1S/C14H11N3O3S/c1-20-14(19)17-13-15-9-5-...   \n",
       "Lymecycline         InChI=1S/C29H38N4O10/c1-28(42)13-7-6-9-17(34)1...   \n",
       "Alfadolone acetate  InChI=1S/C23H34O5/c1-13(24)28-12-20(27)18-7-6-...   \n",
       "\n",
       "                                 pchem_inchikey  \\\n",
       "chemical name                                     \n",
       "Azaguanine-8        LPXQRXLUHJKZIE-UHFFFAOYSA-N   \n",
       "Sulfaguanidine      BRBKOPJOKNSWSG-UHFFFAOYSA-N   \n",
       "Nocodazole          KYRVNWMVYQXFEU-UHFFFAOYSA-N   \n",
       "Lymecycline         PZTCVADFMACKLU-UEPZRUIBSA-N   \n",
       "Alfadolone acetate  QRJOQYLXZPQQMX-FWROMSNXSA-N   \n",
       "\n",
       "                                                rdkit_canonical_smile  \\\n",
       "chemical name                                                           \n",
       "Azaguanine-8                                 Nc1nc2n[nH]nc2c(=O)[nH]1   \n",
       "Sulfaguanidine                            NC(N)=NS(=O)(=O)c1ccc(N)cc1   \n",
       "Nocodazole                     COC(=O)Nc1nc2ccc(C(=O)c3cccs3)cc2[nH]1   \n",
       "Lymecycline         CN(C)C1C(=O)C(C(=O)NCNCCCCC(N)C(=O)O)=C(O)C2(O...   \n",
       "Alfadolone acetate  CC(=O)OCC(=O)C1CCC2C3CCC4CC(O)CCC4(C)C3C(=O)CC12C   \n",
       "\n",
       "                                                 rdkit_isomeric_smile  \\\n",
       "chemical name                                                           \n",
       "Azaguanine-8                                 Nc1nc2n[nH]nc2c(=O)[nH]1   \n",
       "Sulfaguanidine                            NC(N)=NS(=O)(=O)c1ccc(N)cc1   \n",
       "Nocodazole                     COC(=O)Nc1nc2ccc(C(=O)c3cccs3)cc2[nH]1   \n",
       "Lymecycline         CN(C)C1C(=O)C(C(=O)NCNCCCCC(N)C(=O)O)=C(O)C2(O...   \n",
       "Alfadolone acetate  CC(=O)OCC(=O)C1CCC2C3CCC4CC(O)CCC4(C)C3C(=O)CC12C   \n",
       "\n",
       "                    n_atoms  n_bonds  ExactMolWt  \\\n",
       "chemical name                                      \n",
       "Azaguanine-8           11.0     12.0  152.044659   \n",
       "Sulfaguanidine         14.0     14.0  214.052447   \n",
       "Nocodazole             21.0     23.0  301.052112   \n",
       "Lymecycline            43.0     46.0  602.258793   \n",
       "Alfadolone acetate     28.0     31.0  390.240624   \n",
       "\n",
       "                                                        rdkit_no_salt  \n",
       "chemical name                                                          \n",
       "Azaguanine-8                                 Nc1nc2n[nH]nc2c(=O)[nH]1  \n",
       "Sulfaguanidine                            NC(N)=NS(=O)(=O)c1ccc(N)cc1  \n",
       "Nocodazole                     COC(=O)Nc1nc2ccc(C(=O)c3cccs3)cc2[nH]1  \n",
       "Lymecycline         CN(C)C1C(=O)C(C(=O)NCNCCCCC(N)C(=O)O)=C(O)C2(O...  \n",
       "Alfadolone acetate  CC(=O)OCC(=O)C1CCC2C3CCC4CC(O)CCC4(C)C3C(=O)CC12C  \n",
       "\n",
       "[5 rows x 29 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the joined table without mutating `maier_chemicals` or `chem_smiles`,\n",
    "# so the cell can be re-run safely (an in-place `set_index(\"name\")` fails on a\n",
    "# second run because the column has already been moved into the index).\n",
    "chemical_metadata = (\n",
    "    maier_chemicals\n",
    "    .reset_index()\n",
    "    .set_index(\"chemical name\")\n",
    "    .join(chem_smiles.set_index(\"name\"))\n",
    ")\n",
    "chemical_metadata.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "With that, we can write the full chemical metadata table to disk."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the joined chemical metadata as a tab-separated file\n",
    "prestwick_library_path = os.path.join(OUTPUT_DIR, \"prestwick_library.tsv.gz\")\n",
    "chemical_metadata.to_csv(prestwick_library_path, sep='\\t')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Molecular representation and data splitting. \n",
    "  \n",
    "Now that we have determined the labels, we can represent the screened chemical library using MolE, ECFP4, and chemical descriptors. In the same step as the MolE representation, we also split the dataset using scaffold splitting."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the compounds whose Prestwick ID appears in the index of `screen_df`\n",
    "is_screened = chemical_metadata[\"prestwick_ID\"].isin(screen_df.index)\n",
    "chemical_metadata_screened = chemical_metadata.loc[is_screened]\n",
    "\n",
    "# Save the subset; the representation cells below read this file\n",
    "screened_library_path = os.path.join(OUTPUT_DIR, \"prestwick_library_screened.tsv.gz\")\n",
    "chemical_metadata_screened.to_csv(screened_library_path, sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "About to generate scaffolds\n",
      "About to sort in scaffold sets\n",
      "Representation dimension (1000) - Embedding dimension (8000)\n",
      "../pretrained_model/model_ginconcat_btwin_100k_d8000_l0.0001/model.pth\n",
      "x_embedding1.weight\n",
      "x_embedding2.weight\n",
      "gnns.0.mlp.0.weight\n",
      "gnns.0.mlp.0.bias\n",
      "gnns.0.mlp.1.weight\n",
      "gnns.0.mlp.1.bias\n",
      "gnns.0.mlp.1.running_mean\n",
      "gnns.0.mlp.1.running_var\n",
      "gnns.0.mlp.1.num_batches_tracked\n",
      "gnns.0.mlp.3.weight\n",
      "gnns.0.mlp.3.bias\n",
      "gnns.0.edge_embedding1.weight\n",
      "gnns.0.edge_embedding2.weight\n",
      "gnns.1.mlp.0.weight\n",
      "gnns.1.mlp.0.bias\n",
      "gnns.1.mlp.1.weight\n",
      "gnns.1.mlp.1.bias\n",
      "gnns.1.mlp.1.running_mean\n",
      "gnns.1.mlp.1.running_var\n",
      "gnns.1.mlp.1.num_batches_tracked\n",
      "gnns.1.mlp.3.weight\n",
      "gnns.1.mlp.3.bias\n",
      "gnns.1.edge_embedding1.weight\n",
      "gnns.1.edge_embedding2.weight\n",
      "gnns.2.mlp.0.weight\n",
      "gnns.2.mlp.0.bias\n",
      "gnns.2.mlp.1.weight\n",
      "gnns.2.mlp.1.bias\n",
      "gnns.2.mlp.1.running_mean\n",
      "gnns.2.mlp.1.running_var\n",
      "gnns.2.mlp.1.num_batches_tracked\n",
      "gnns.2.mlp.3.weight\n",
      "gnns.2.mlp.3.bias\n",
      "gnns.2.edge_embedding1.weight\n",
      "gnns.2.edge_embedding2.weight\n",
      "gnns.3.mlp.0.weight\n",
      "gnns.3.mlp.0.bias\n",
      "gnns.3.mlp.1.weight\n",
      "gnns.3.mlp.1.bias\n",
      "gnns.3.mlp.1.running_mean\n",
      "gnns.3.mlp.1.running_var\n",
      "gnns.3.mlp.1.num_batches_tracked\n",
      "gnns.3.mlp.3.weight\n",
      "gnns.3.mlp.3.bias\n",
      "gnns.3.edge_embedding1.weight\n",
      "gnns.3.edge_embedding2.weight\n",
      "gnns.4.mlp.0.weight\n",
      "gnns.4.mlp.0.bias\n",
      "gnns.4.mlp.1.weight\n",
      "gnns.4.mlp.1.bias\n",
      "gnns.4.mlp.1.running_mean\n",
      "gnns.4.mlp.1.running_var\n",
      "gnns.4.mlp.1.num_batches_tracked\n",
      "gnns.4.mlp.3.weight\n",
      "gnns.4.mlp.3.bias\n",
      "gnns.4.edge_embedding1.weight\n",
      "gnns.4.edge_embedding2.weight\n",
      "batch_norms.0.weight\n",
      "batch_norms.0.bias\n",
      "batch_norms.0.running_mean\n",
      "batch_norms.0.running_var\n",
      "batch_norms.0.num_batches_tracked\n",
      "batch_norms.1.weight\n",
      "batch_norms.1.bias\n",
      "batch_norms.1.running_mean\n",
      "batch_norms.1.running_var\n",
      "batch_norms.1.num_batches_tracked\n",
      "batch_norms.2.weight\n",
      "batch_norms.2.bias\n",
      "batch_norms.2.running_mean\n",
      "batch_norms.2.running_var\n",
      "batch_norms.2.num_batches_tracked\n",
      "batch_norms.3.weight\n",
      "batch_norms.3.bias\n",
      "batch_norms.3.running_mean\n",
      "batch_norms.3.running_var\n",
      "batch_norms.3.num_batches_tracked\n",
      "batch_norms.4.weight\n",
      "batch_norms.4.bias\n",
      "batch_norms.4.running_mean\n",
      "batch_norms.4.running_var\n",
      "batch_norms.4.num_batches_tracked\n",
      "feat_lin.weight\n",
      "feat_lin.bias\n",
      "out_lin.0.weight\n",
      "out_lin.0.bias\n",
      "out_lin.1.weight\n",
      "out_lin.1.bias\n",
      "out_lin.1.running_mean\n",
      "out_lin.1.running_var\n",
      "out_lin.1.num_batches_tracked\n",
      "out_lin.3.weight\n",
      "out_lin.3.bias\n",
      "out_lin.4.weight\n",
      "out_lin.4.bias\n",
      "out_lin.4.running_mean\n",
      "out_lin.4.running_var\n",
      "out_lin.4.num_batches_tracked\n",
      "out_lin.6.weight\n",
      "out_lin.6.bias\n"
     ]
    }
   ],
   "source": [
    "# MolE representation of the screened library, plus a scaffold-based split\n",
    "# (validation_proportion and test_proportion are both 0.1)\n",
    "maier_scaffold_split, mole_representation = process_dataset(\n",
    "    dataset_path=os.path.join(OUTPUT_DIR, \"prestwick_library_screened.tsv.gz\"),\n",
    "    pretrain_architecture=\"gin_concat\",\n",
    "    pretrained_model=\"model_ginconcat_btwin_100k_d8000_l0.0001\",\n",
    "    split_approach=\"scaffold\",\n",
    "    validation_proportion=0.1,\n",
    "    test_proportion=0.1,\n",
    "    smile_column_str=\"rdkit_no_salt\",\n",
    "    id_column_str=\"prestwick_ID\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ECFP4 representation of the screened library (no dataset split requested)\n",
    "ecfp4_representation = process_dataset(\n",
    "    dataset_path=os.path.join(OUTPUT_DIR, \"prestwick_library_screened.tsv.gz\"),\n",
    "    pretrained_model=\"ECFP4\",\n",
    "    dataset_split=False,\n",
    "    smile_column_str=\"rdkit_no_salt\",\n",
    "    id_column_str=\"prestwick_ID\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Could not compute descriptors for Prestw-919\n"
     ]
    }
   ],
   "source": [
    "# Chemical-descriptor representation of the screened library (no dataset split requested)\n",
    "# NOTE(review): this call reads the `pchem_isomeric_smile` column, whereas the MolE and\n",
    "# ECFP4 cells read `rdkit_no_salt` - confirm that the difference is intentional.\n",
    "chemdesc_representation = process_dataset(\n",
    "    dataset_path=os.path.join(OUTPUT_DIR, \"prestwick_library_screened.tsv.gz\"),\n",
    "    pretrained_model=\"ChemDesc\",\n",
    "    dataset_split=False,\n",
    "    smile_column_str=\"pchem_isomeric_smile\",\n",
    "    id_column_str=\"prestwick_ID\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare representations for 100K molecules from PubChem  \n",
    "  \n",
    "Here I prepare the MolE and ECFP4 representations of 100K randomly selected molecules from PubChem.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ECFP4 representation of the PubChem molecule set (no dataset split requested)\n",
    "ecfp4_pubchem = process_dataset(\n",
    "    dataset_path=os.path.join(RAW_DATA_DIR, \"pubchem_random\", \"pubchem_100k_selected_smiles.tsv.gz\"),\n",
    "    pretrained_model=\"ECFP4\",\n",
    "    dataset_split=False,\n",
    "    smile_column_str=\"smiles\",\n",
    "    id_column_str=\"chem_id\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Representation dimension (1000) - Embedding dimension (8000)\n",
      "../pretrained_model/model_ginconcat_btwin_100k_d8000_l0.0001/model.pth\n",
      "x_embedding1.weight\n",
      "x_embedding2.weight\n",
      "gnns.0.mlp.0.weight\n",
      "gnns.0.mlp.0.bias\n",
      "gnns.0.mlp.1.weight\n",
      "gnns.0.mlp.1.bias\n",
      "gnns.0.mlp.1.running_mean\n",
      "gnns.0.mlp.1.running_var\n",
      "gnns.0.mlp.1.num_batches_tracked\n",
      "gnns.0.mlp.3.weight\n",
      "gnns.0.mlp.3.bias\n",
      "gnns.0.edge_embedding1.weight\n",
      "gnns.0.edge_embedding2.weight\n",
      "gnns.1.mlp.0.weight\n",
      "gnns.1.mlp.0.bias\n",
      "gnns.1.mlp.1.weight\n",
      "gnns.1.mlp.1.bias\n",
      "gnns.1.mlp.1.running_mean\n",
      "gnns.1.mlp.1.running_var\n",
      "gnns.1.mlp.1.num_batches_tracked\n",
      "gnns.1.mlp.3.weight\n",
      "gnns.1.mlp.3.bias\n",
      "gnns.1.edge_embedding1.weight\n",
      "gnns.1.edge_embedding2.weight\n",
      "gnns.2.mlp.0.weight\n",
      "gnns.2.mlp.0.bias\n",
      "gnns.2.mlp.1.weight\n",
      "gnns.2.mlp.1.bias\n",
      "gnns.2.mlp.1.running_mean\n",
      "gnns.2.mlp.1.running_var\n",
      "gnns.2.mlp.1.num_batches_tracked\n",
      "gnns.2.mlp.3.weight\n",
      "gnns.2.mlp.3.bias\n",
      "gnns.2.edge_embedding1.weight\n",
      "gnns.2.edge_embedding2.weight\n",
      "gnns.3.mlp.0.weight\n",
      "gnns.3.mlp.0.bias\n",
      "gnns.3.mlp.1.weight\n",
      "gnns.3.mlp.1.bias\n",
      "gnns.3.mlp.1.running_mean\n",
      "gnns.3.mlp.1.running_var\n",
      "gnns.3.mlp.1.num_batches_tracked\n",
      "gnns.3.mlp.3.weight\n",
      "gnns.3.mlp.3.bias\n",
      "gnns.3.edge_embedding1.weight\n",
      "gnns.3.edge_embedding2.weight\n",
      "gnns.4.mlp.0.weight\n",
      "gnns.4.mlp.0.bias\n",
      "gnns.4.mlp.1.weight\n",
      "gnns.4.mlp.1.bias\n",
      "gnns.4.mlp.1.running_mean\n",
      "gnns.4.mlp.1.running_var\n",
      "gnns.4.mlp.1.num_batches_tracked\n",
      "gnns.4.mlp.3.weight\n",
      "gnns.4.mlp.3.bias\n",
      "gnns.4.edge_embedding1.weight\n",
      "gnns.4.edge_embedding2.weight\n",
      "batch_norms.0.weight\n",
      "batch_norms.0.bias\n",
      "batch_norms.0.running_mean\n",
      "batch_norms.0.running_var\n",
      "batch_norms.0.num_batches_tracked\n",
      "batch_norms.1.weight\n",
      "batch_norms.1.bias\n",
      "batch_norms.1.running_mean\n",
      "batch_norms.1.running_var\n",
      "batch_norms.1.num_batches_tracked\n",
      "batch_norms.2.weight\n",
      "batch_norms.2.bias\n",
      "batch_norms.2.running_mean\n",
      "batch_norms.2.running_var\n",
      "batch_norms.2.num_batches_tracked\n",
      "batch_norms.3.weight\n",
      "batch_norms.3.bias\n",
      "batch_norms.3.running_mean\n",
      "batch_norms.3.running_var\n",
      "batch_norms.3.num_batches_tracked\n",
      "batch_norms.4.weight\n",
      "batch_norms.4.bias\n",
      "batch_norms.4.running_mean\n",
      "batch_norms.4.running_var\n",
      "batch_norms.4.num_batches_tracked\n",
      "feat_lin.weight\n",
      "feat_lin.bias\n",
      "out_lin.0.weight\n",
      "out_lin.0.bias\n",
      "out_lin.1.weight\n",
      "out_lin.1.bias\n",
      "out_lin.1.running_mean\n",
      "out_lin.1.running_var\n",
      "out_lin.1.num_batches_tracked\n",
      "out_lin.3.weight\n",
      "out_lin.3.bias\n",
      "out_lin.4.weight\n",
      "out_lin.4.bias\n",
      "out_lin.4.running_mean\n",
      "out_lin.4.running_var\n",
      "out_lin.4.num_batches_tracked\n",
      "out_lin.6.weight\n",
      "out_lin.6.bias\n"
     ]
    }
   ],
   "source": [
    "# MolE representation of the PubChem molecule set (no dataset split requested)\n",
    "mole_pubchem = process_dataset(\n",
    "    dataset_path=os.path.join(RAW_DATA_DIR, \"pubchem_random\", \"pubchem_100k_selected_smiles.tsv.gz\"),\n",
    "    pretrain_architecture=\"gin_concat\",\n",
    "    pretrained_model=\"model_ginconcat_btwin_100k_d8000_l0.0001\",\n",
    "    dataset_split=False,\n",
    "    smile_column_str=\"smiles\",\n",
    "    id_column_str=\"chem_id\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "#maier_scaffold_split.to_csv(os.path.join(OUTPUT_DIR, \"maier_scaffold_split.tsv.gz\"), sep='\\t')\n",
    "#mole_representation.to_csv(os.path.join(OUTPUT_DIR, \"maier_mole_representation.tsv.gz\"), sep='\\t')\n",
    "#ecfp4_representation.to_csv(os.path.join(OUTPUT_DIR, \"maier_ecfp4_representation.tsv.gz\"), sep='\\t')\n",
    "#chemdesc_representation.to_csv(os.path.join(OUTPUT_DIR, \"maier_chemdesc_representation.tsv.gz\"), sep='\\t', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Break up the MolE PubChem representation into four parts to fit the required file size."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "#mole_pubchem.iloc[0:25_000, :].to_csv(\"../data/01.prepare_training_data/pubchem_mole_representation_p1.tsv.gz\", sep='\\t')\n",
    "#mole_pubchem.iloc[25_000:50_000, :].to_csv(\"../data/01.prepare_training_data/pubchem_mole_representation_p2.tsv.gz\", sep='\\t')\n",
    "#mole_pubchem.iloc[50_000:75_000, :].to_csv(\"../data/01.prepare_training_data/pubchem_mole_representation_p3.tsv.gz\", sep='\\t')\n",
    "#mole_pubchem.iloc[75_000:, :].to_csv(\"../data/01.prepare_training_data/pubchem_mole_representation_p4.tsv.gz\", sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "#ecfp4_pubchem.to_csv(os.path.join(OUTPUT_DIR, \"pubchem_ecfp4_representation.tsv.gz\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Written files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['pubchem_mole_representation_p1.tsv.gz',\n",
       " 'maier_ecfp4_representation.tsv.gz',\n",
       " 'prestwick_library.tsv.gz',\n",
       " 'pubchem_mole_representation_p3.tsv.gz',\n",
       " 'prestwick_library_screened.tsv.gz',\n",
       " 'maier_scaffold_split.tsv.gz',\n",
       " 'maier_chemdesc_representation.tsv.gz',\n",
       " 'pubchem_mole_representation_p4.tsv.gz',\n",
       " 'maier_mole_representation.tsv.gz',\n",
       " 'pubchem_mole_representation_p2.tsv.gz',\n",
       " 'pubchem_ecfp4_representation.tsv.gz',\n",
       " 'maier_screening_results.tsv.gz']"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show every file currently present in the output directory\n",
    "output_files = os.listdir(OUTPUT_DIR)\n",
    "output_files"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mole_test",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.20"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}