1386 lines
55 KiB
Plaintext
1386 lines
55 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Prepare Training Data \n",
|
||
"\n",
|
||
"### Author: Roberto Olayo-Alarcon\n",
|
||
" \n",
|
||
"In this step we read the data from [Maier, L., et al. (2018)](https://www.nature.com/articles/nature25979) and prepare it to be used for training, testing and validation of the XGBoost model. "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Read Libraries\n",
|
||
"import os\n",
|
||
"\n",
|
||
"import numpy as np\n",
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"from tqdm import tqdm\n",
|
||
"import pubchempy as pcp\n",
|
||
"\n",
|
||
"from dataset.dataset_representation import process_dataset\n",
|
||
"\n",
|
||
"\n",
|
||
"from rdkit import Chem\n",
|
||
"from rdkit.Chem import Descriptors\n",
|
||
"from rdkit.Chem.SaltRemover import SaltRemover"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Global variables "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Raw data dir\n",
|
||
"RAW_DATA_DIR = \"../raw_data/\"\n",
|
||
"\n",
|
||
"# Directory from which to read the raw data\n",
|
||
"INPUT_DIR = os.path.join(RAW_DATA_DIR, \"maier_microbiome\")\n",
|
||
"\n",
|
||
"# Create the output directory\n",
|
||
"OUTPUT_DIR = \"../data/01.prepare_training_data\"\n",
|
||
"os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
|
||
"\n",
|
||
"# Pvalue cutoff for label determination\n",
|
||
"PVAL_CUTOFF = 0.05"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Read data and binarize\n",
|
||
" \n",
|
||
"Here we read in Supplementary Table 3 from the study."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Akkermansia muciniphila (NT5021)</th>\n",
|
||
" <th>Bacteroides caccae (NT5050)</th>\n",
|
||
" <th>Bacteroides fragilis (ET) (NT5033)</th>\n",
|
||
" <th>Bacteroides fragilis (NT) (NT5003)</th>\n",
|
||
" <th>Bacteroides ovatus (NT5054)</th>\n",
|
||
" <th>Bacteroides thetaiotaomicron (NT5004)</th>\n",
|
||
" <th>Bacteroides uniformis (NT5002)</th>\n",
|
||
" <th>Bacteroides vulgatus (NT5001)</th>\n",
|
||
" <th>Bacteroides xylanisolvens (NT5064)</th>\n",
|
||
" <th>Bifidobacterium adolescentis (NT5022)</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>Parabacteroides merdae (NT5071)</th>\n",
|
||
" <th>Prevotella copri (NT5019)</th>\n",
|
||
" <th>Roseburia hominis (NT5079)</th>\n",
|
||
" <th>Roseburia intestinalis (NT5011)</th>\n",
|
||
" <th>Ruminococcus bromii (NT5045)</th>\n",
|
||
" <th>Ruminococcus gnavus (NT5046)</th>\n",
|
||
" <th>Ruminococcus torques (NT5047)</th>\n",
|
||
" <th>Streptococcus parasanguinis (NT5072)</th>\n",
|
||
" <th>Streptococcus salivarius (NT5038)</th>\n",
|
||
" <th>Veillonella parvula (NT5017)</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>prestwick_ID</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>Prestw-1109</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Prestw-1399</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Prestw-145</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Prestw-1464</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Prestw-31</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 40 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Akkermansia muciniphila (NT5021) Bacteroides caccae (NT5050) \\\n",
|
||
"prestwick_ID \n",
|
||
"Prestw-1109 1 1 \n",
|
||
"Prestw-1399 1 1 \n",
|
||
"Prestw-145 1 1 \n",
|
||
"Prestw-1464 1 1 \n",
|
||
"Prestw-31 1 1 \n",
|
||
"\n",
|
||
" Bacteroides fragilis (ET) (NT5033) \\\n",
|
||
"prestwick_ID \n",
|
||
"Prestw-1109 1 \n",
|
||
"Prestw-1399 1 \n",
|
||
"Prestw-145 1 \n",
|
||
"Prestw-1464 1 \n",
|
||
"Prestw-31 1 \n",
|
||
"\n",
|
||
" Bacteroides fragilis (NT) (NT5003) Bacteroides ovatus (NT5054) \\\n",
|
||
"prestwick_ID \n",
|
||
"Prestw-1109 1 1 \n",
|
||
"Prestw-1399 1 1 \n",
|
||
"Prestw-145 1 1 \n",
|
||
"Prestw-1464 1 1 \n",
|
||
"Prestw-31 1 1 \n",
|
||
"\n",
|
||
" Bacteroides thetaiotaomicron (NT5004) \\\n",
|
||
"prestwick_ID \n",
|
||
"Prestw-1109 1 \n",
|
||
"Prestw-1399 1 \n",
|
||
"Prestw-145 1 \n",
|
||
"Prestw-1464 1 \n",
|
||
"Prestw-31 1 \n",
|
||
"\n",
|
||
" Bacteroides uniformis (NT5002) Bacteroides vulgatus (NT5001) \\\n",
|
||
"prestwick_ID \n",
|
||
"Prestw-1109 1 1 \n",
|
||
"Prestw-1399 1 1 \n",
|
||
"Prestw-145 1 1 \n",
|
||
"Prestw-1464 1 1 \n",
|
||
"Prestw-31 1 1 \n",
|
||
"\n",
|
||
" Bacteroides xylanisolvens (NT5064) \\\n",
|
||
"prestwick_ID \n",
|
||
"Prestw-1109 1 \n",
|
||
"Prestw-1399 1 \n",
|
||
"Prestw-145 1 \n",
|
||
"Prestw-1464 1 \n",
|
||
"Prestw-31 1 \n",
|
||
"\n",
|
||
" Bifidobacterium adolescentis (NT5022) ... \\\n",
|
||
"prestwick_ID ... \n",
|
||
"Prestw-1109 1 ... \n",
|
||
"Prestw-1399 1 ... \n",
|
||
"Prestw-145 1 ... \n",
|
||
"Prestw-1464 1 ... \n",
|
||
"Prestw-31 1 ... \n",
|
||
"\n",
|
||
" Parabacteroides merdae (NT5071) Prevotella copri (NT5019) \\\n",
|
||
"prestwick_ID \n",
|
||
"Prestw-1109 1 1 \n",
|
||
"Prestw-1399 1 1 \n",
|
||
"Prestw-145 1 1 \n",
|
||
"Prestw-1464 1 1 \n",
|
||
"Prestw-31 1 1 \n",
|
||
"\n",
|
||
" Roseburia hominis (NT5079) Roseburia intestinalis (NT5011) \\\n",
|
||
"prestwick_ID \n",
|
||
"Prestw-1109 1 1 \n",
|
||
"Prestw-1399 1 1 \n",
|
||
"Prestw-145 1 1 \n",
|
||
"Prestw-1464 1 1 \n",
|
||
"Prestw-31 1 1 \n",
|
||
"\n",
|
||
" Ruminococcus bromii (NT5045) Ruminococcus gnavus (NT5046) \\\n",
|
||
"prestwick_ID \n",
|
||
"Prestw-1109 1 1 \n",
|
||
"Prestw-1399 1 1 \n",
|
||
"Prestw-145 1 1 \n",
|
||
"Prestw-1464 1 1 \n",
|
||
"Prestw-31 1 1 \n",
|
||
"\n",
|
||
" Ruminococcus torques (NT5047) \\\n",
|
||
"prestwick_ID \n",
|
||
"Prestw-1109 1 \n",
|
||
"Prestw-1399 1 \n",
|
||
"Prestw-145 1 \n",
|
||
"Prestw-1464 1 \n",
|
||
"Prestw-31 1 \n",
|
||
"\n",
|
||
" Streptococcus parasanguinis (NT5072) \\\n",
|
||
"prestwick_ID \n",
|
||
"Prestw-1109 1 \n",
|
||
"Prestw-1399 1 \n",
|
||
"Prestw-145 1 \n",
|
||
"Prestw-1464 1 \n",
|
||
"Prestw-31 1 \n",
|
||
"\n",
|
||
" Streptococcus salivarius (NT5038) Veillonella parvula (NT5017) \n",
|
||
"prestwick_ID \n",
|
||
"Prestw-1109 1 1 \n",
|
||
"Prestw-1399 1 1 \n",
|
||
"Prestw-145 1 1 \n",
|
||
"Prestw-1464 1 1 \n",
|
||
"Prestw-31 1 1 \n",
|
||
"\n",
|
||
"[5 rows x 40 columns]"
|
||
]
|
||
},
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Read the raw data\n",
|
||
"screen_df = pd.read_excel(os.path.join(INPUT_DIR, \"screen_results_info_SF3.xlsx\"))\n",
|
||
"\n",
|
||
"# Clean the data\n",
|
||
"screen_df.drop(columns=[\"chemical_name\", \"drug_class\", \"n_hit\"], inplace=True)\n",
|
||
"screen_df.set_index(\"prestwick_ID\", inplace=True)\n",
|
||
"\n",
|
||
"# Convert the data to binary\n",
|
||
"screen_df = screen_df <= PVAL_CUTOFF\n",
|
||
"screen_df = screen_df.astype(int)\n",
|
||
"screen_df.head()\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Gather SMILES\n",
|
||
" \n",
|
||
"We use the chemical names to gather the SMILES from PubChem using [PubChemPy](https://pubchempy.readthedocs.io/en/latest/)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# FUNCTIONS FOR OBTAINING AND PROCESSING SMILES\n",
|
||
"\n",
|
||
"def clean_names_chemlibrary(original_name):\n",
|
||
" \n",
|
||
" # Remove additional information from name \n",
|
||
" name = original_name.split(\" (\")[0]\n",
|
||
" name = name.split(\" [\")[0]\n",
|
||
" name = name.rstrip()\n",
|
||
" \n",
|
||
" return name\n",
|
||
"\n",
|
||
"def get_pubchemid(name):\n",
|
||
" \n",
|
||
" \"\"\"\n",
|
||
" Retrieve PubChem compound information based on the given name.\n",
|
||
"\n",
|
||
" Parameters:\n",
|
||
" - name (str): The name of the compound to search for.\n",
|
||
"\n",
|
||
" Returns:\n",
|
||
" - result_df (pandas.DataFrame): DataFrame containing PubChem compound information.\n",
|
||
" Columns:\n",
|
||
" - 'name' (str): The name of the compound.\n",
|
||
" - 'cid' (str): PubChem Compound ID. 'not_found' if the compound is not found.\n",
|
||
" - 'pchem_canonical_smile' (str): Canonical SMILES representation of the compound.\n",
|
||
" 'not_found' if the compound is not found.\n",
|
||
" - 'pchem_isomeric_smile' (str): Isomeric SMILES representation of the compound.\n",
|
||
" 'not_found' if the compound is not found.\n",
|
||
" - 'pchem_inchi' (str): InChI representation of the compound.\n",
|
||
" 'not_found' if the compound is not found.\n",
|
||
" - 'pchem_inchikey' (str): InChIKey representation of the compound.\n",
|
||
" 'not_found' if the compound is not found.\n",
|
||
" \"\"\"\n",
|
||
" \n",
|
||
" # Attempt to find result with search name\n",
|
||
" results = pcp.get_compounds(name, 'name')\n",
|
||
" result_dict = {}\n",
|
||
"\n",
|
||
" # If that did not work, use the clean name\n",
|
||
" if len(results) == 0:\n",
|
||
" clean_name = clean_names_chemlibrary(name)\n",
|
||
" results = pcp.get_compounds(clean_name, \"name\")\n",
|
||
" \n",
|
||
" # Now prepare the output\n",
|
||
" if len(results) > 0:\n",
|
||
" result_dict[\"name\"] = name \n",
|
||
" result_dict[\"cid\"] = results[0].cid\n",
|
||
" result_dict[\"pchem_canonical_smile\"] = results[0].canonical_smiles\n",
|
||
" result_dict[\"pchem_isomeric_smile\"] = results[0].isomeric_smiles\n",
|
||
" result_dict[\"pchem_inchi\"] = results[0].inchi\n",
|
||
" result_dict[\"pchem_inchikey\"] = results[0].inchikey\n",
|
||
" \n",
|
||
" else:\n",
|
||
" result_dict[\"name\"] = name\n",
|
||
" result_dict[\"cid\"] = \"not_found\"\n",
|
||
" result_dict[\"pchem_canonical_smile\"] = \"not_found\"\n",
|
||
" result_dict[\"pchem_isomeric_smile\"] = \"not_found\"\n",
|
||
" result_dict[\"pchem_inchi\"] = \"not_found\"\n",
|
||
" result_dict[\"pchem_inchikey\"] = \"not_found\"\n",
|
||
" \n",
|
||
" result_df = pd.DataFrame(result_dict, index=[0])\n",
|
||
" \n",
|
||
" return result_df\n",
|
||
"\n",
|
||
"\n",
|
||
"def cid_info(cid, df):\n",
|
||
" \"\"\"\n",
|
||
" Retrieve information about a compound from PubChem using its CID (Compound ID).\n",
|
||
"\n",
|
||
" Parameters:\n",
|
||
" - cid (str): PubChem Compound ID of the compound to query.\n",
|
||
" - df (pandas.DataFrame): DataFrame containing compound information with 'cid' column.\n",
|
||
"\n",
|
||
" Returns:\n",
|
||
" - result_df (pandas.DataFrame): DataFrame containing PubChem compound information.\n",
|
||
" Columns:\n",
|
||
" - 'name' (str): The name of the compound.\n",
|
||
" - 'cid' (str): PubChem Compound ID.\n",
|
||
" - 'pchem_canonical_smile' (str): Canonical SMILES representation of the compound.\n",
|
||
" - 'pchem_isomeric_smile' (str): Isomeric SMILES representation of the compound.\n",
|
||
" - 'pchem_inchi' (str): InChI representation of the compound.\n",
|
||
" - 'pchem_inchikey' (str): InChIKey representation of the compound.\n",
|
||
" \"\"\"\n",
|
||
" \n",
|
||
" # Query Pubchem\n",
|
||
" results = pcp.Compound.from_cid(cid)\n",
|
||
" \n",
|
||
" # Init dictionary\n",
|
||
" result_dict = {}\n",
|
||
" \n",
|
||
" # Prepare output\n",
|
||
" result_dict[\"name\"] = df.loc[df[\"cid\"]==cid, \"name\"].values[0]\n",
|
||
" result_dict[\"cid\"] = cid\n",
|
||
" result_dict[\"pchem_canonical_smile\"] = results.canonical_smiles\n",
|
||
" result_dict[\"pchem_isomeric_smile\"] = results.isomeric_smiles\n",
|
||
" result_dict[\"pchem_inchi\"] = results.inchi\n",
|
||
" result_dict[\"pchem_inchikey\"] = results.inchikey\n",
|
||
" \n",
|
||
" return pd.DataFrame(result_dict, index=[0])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# READ THE DATA\n",
|
||
"\n",
|
||
"maier_chemicals = pd.read_excel(os.path.join(INPUT_DIR, \"chem_library_info_SF1.xlsx\"))\n",
|
||
"maier_chemicals.set_index(\"prestwick_ID\", inplace=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"I will use the **chemical name** column to query PubChem (via pubchempy) and retrieve each compound's SMILES and other relevant information (this might take some time)."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"100%|██████████| 1200/1200 [10:43<00:00, 1.87it/s]\n",
|
||
"100%|██████████| 9/9 [00:04<00:00, 2.14it/s]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Iterate over the unique names in the dataset\n",
|
||
"cid_search = pd.concat([get_pubchemid(name) for name in tqdm(maier_chemicals[\"chemical name\"].unique())])\n",
|
||
"\n",
|
||
"# Couldn't find these chemicals using PCP, so we will manually add them\n",
|
||
"manual_cid = pd.DataFrame([[\"(-)-Eseroline fumarate salt\", 16219298], [\"Clonixin Lysinate\", 3080836], [\"Ziprasidone Hydrochloride\", 219099],\n",
|
||
" [\"Clavulanate potassium salt\", 23665591], [\"Oxibendazol\", 4622], \n",
|
||
" [\"Morpholinoethylamino-3-benzocyclohepta-(5,6-c)-pyridazine dihydrochloride\", 195164],\n",
|
||
" [\"Gabazine bromide\", 71316800], [\"Colistin sulfate\", 91885449], [\"Bacitracin\", 11980094]], \n",
|
||
" columns=[\"name\", \"cid\"])\n",
|
||
"\n",
|
||
"manual_try = pd.concat([cid_info(c, df=manual_cid) for c in tqdm(manual_cid.cid.to_list())])\n",
|
||
"\n",
|
||
"# Concatenate the results\n",
|
||
"chem_smiles = pd.concat([cid_search[cid_search[\"cid\"] != \"not_found\"], manual_try]).drop_duplicates()\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"We can now get the RDKit versions of these SMILES"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Obtain canonical smiles\n",
|
||
"chem_smiles[\"rdkit_canonical_smile\"] = chem_smiles[\"pchem_canonical_smile\"].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x),\n",
|
||
" canonical=True, \n",
|
||
" isomericSmiles=False))\n",
|
||
"\n",
|
||
"# Obtain isomeric smiles\n",
|
||
"chem_smiles[\"rdkit_isomeric_smile\"] = chem_smiles[\"pchem_canonical_smile\"].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x),\n",
|
||
" canonical=True, \n",
|
||
" isomericSmiles=True))\n",
|
||
"\n",
|
||
"# Obtain chemical metadata\n",
|
||
"chem_smiles[\"n_atoms\"] = chem_smiles[\"pchem_canonical_smile\"].apply(lambda x: Chem.MolFromSmiles(x).GetNumAtoms())\n",
|
||
"chem_smiles[\"n_bonds\"] = chem_smiles[\"pchem_canonical_smile\"].apply(lambda x: Chem.MolFromSmiles(x).GetNumBonds())\n",
|
||
"chem_smiles[\"ExactMolWt\"] = chem_smiles[\"pchem_canonical_smile\"].apply(lambda x: Descriptors.ExactMolWt(Chem.MolFromSmiles(x)))\n",
|
||
"\n",
|
||
"\n",
|
||
"# Remove the salts\n",
|
||
"remover = SaltRemover()\n",
|
||
"chem_smiles[\"rdkit_no_salt\"] = chem_smiles[\"rdkit_canonical_smile\"].apply(lambda x: Chem.MolToSmiles(remover.StripMol(Chem.MolFromSmiles(x))))\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Now we can combine this information with the provided data"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>prestwick_ID</th>\n",
|
||
" <th>STITCH4 id</th>\n",
|
||
" <th>ATC codes</th>\n",
|
||
" <th>target species</th>\n",
|
||
" <th>dose (µmol)</th>\n",
|
||
" <th>estimated intestine concentration (µM)</th>\n",
|
||
" <th>plasma concentration (µM)</th>\n",
|
||
" <th>source for plasma concentration</th>\n",
|
||
" <th>fraction excreted in feces</th>\n",
|
||
" <th>fraction excreted in urine</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>pchem_canonical_smile</th>\n",
|
||
" <th>pchem_isomeric_smile</th>\n",
|
||
" <th>pchem_inchi</th>\n",
|
||
" <th>pchem_inchikey</th>\n",
|
||
" <th>rdkit_canonical_smile</th>\n",
|
||
" <th>rdkit_isomeric_smile</th>\n",
|
||
" <th>n_atoms</th>\n",
|
||
" <th>n_bonds</th>\n",
|
||
" <th>ExactMolWt</th>\n",
|
||
" <th>rdkit_no_salt</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>chemical name</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>Azaguanine-8</th>\n",
|
||
" <td>Prestw-1</td>\n",
|
||
" <td>CID100008646</td>\n",
|
||
" <td>L01BB</td>\n",
|
||
" <td>human</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>C12=NNN=C1N=C(NC2=O)N</td>\n",
|
||
" <td>C12=NNN=C1N=C(NC2=O)N</td>\n",
|
||
" <td>InChI=1S/C4H4N6O/c5-4-6-2-1(3(11)7-4)8-10-9-2/...</td>\n",
|
||
" <td>LPXQRXLUHJKZIE-UHFFFAOYSA-N</td>\n",
|
||
" <td>Nc1nc2n[nH]nc2c(=O)[nH]1</td>\n",
|
||
" <td>Nc1nc2n[nH]nc2c(=O)[nH]1</td>\n",
|
||
" <td>11.0</td>\n",
|
||
" <td>12.0</td>\n",
|
||
" <td>152.044659</td>\n",
|
||
" <td>Nc1nc2n[nH]nc2c(=O)[nH]1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Sulfaguanidine</th>\n",
|
||
" <td>Prestw-10</td>\n",
|
||
" <td>CID100005324</td>\n",
|
||
" <td>A07AB03</td>\n",
|
||
" <td>bacteria</td>\n",
|
||
" <td>18670.009221</td>\n",
|
||
" <td>6223.336407</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>C1=CC(=CC=C1N)S(=O)(=O)N=C(N)N</td>\n",
|
||
" <td>C1=CC(=CC=C1N)S(=O)(=O)N=C(N)N</td>\n",
|
||
" <td>InChI=1S/C7H10N4O2S/c8-5-1-3-6(4-2-5)14(12,13)...</td>\n",
|
||
" <td>BRBKOPJOKNSWSG-UHFFFAOYSA-N</td>\n",
|
||
" <td>NC(N)=NS(=O)(=O)c1ccc(N)cc1</td>\n",
|
||
" <td>NC(N)=NS(=O)(=O)c1ccc(N)cc1</td>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>214.052447</td>\n",
|
||
" <td>NC(N)=NS(=O)(=O)c1ccc(N)cc1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Nocodazole</th>\n",
|
||
" <td>Prestw-100</td>\n",
|
||
" <td>CID100004122</td>\n",
|
||
" <td>L01XX</td>\n",
|
||
" <td>human</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>COC(=O)NC1=NC2=C(N1)C=C(C=C2)C(=O)C3=CC=CS3</td>\n",
|
||
" <td>COC(=O)NC1=NC2=C(N1)C=C(C=C2)C(=O)C3=CC=CS3</td>\n",
|
||
" <td>InChI=1S/C14H11N3O3S/c1-20-14(19)17-13-15-9-5-...</td>\n",
|
||
" <td>KYRVNWMVYQXFEU-UHFFFAOYSA-N</td>\n",
|
||
" <td>COC(=O)Nc1nc2ccc(C(=O)c3cccs3)cc2[nH]1</td>\n",
|
||
" <td>COC(=O)Nc1nc2ccc(C(=O)c3cccs3)cc2[nH]1</td>\n",
|
||
" <td>21.0</td>\n",
|
||
" <td>23.0</td>\n",
|
||
" <td>301.052112</td>\n",
|
||
" <td>COC(=O)Nc1nc2ccc(C(=O)c3cccs3)cc2[nH]1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Lymecycline</th>\n",
|
||
" <td>Prestw-1000</td>\n",
|
||
" <td>CID154687131</td>\n",
|
||
" <td>J01AA04</td>\n",
|
||
" <td>bacteria</td>\n",
|
||
" <td>995.607694</td>\n",
|
||
" <td>331.869231</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>CC1(C2CC3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4...</td>\n",
|
||
" <td>C[C@@]1([C@H]2C[C@H]3[C@@H](C(=O)C(=C([C@]3(C(...</td>\n",
|
||
" <td>InChI=1S/C29H38N4O10/c1-28(42)13-7-6-9-17(34)1...</td>\n",
|
||
" <td>PZTCVADFMACKLU-UEPZRUIBSA-N</td>\n",
|
||
" <td>CN(C)C1C(=O)C(C(=O)NCNCCCCC(N)C(=O)O)=C(O)C2(O...</td>\n",
|
||
" <td>CN(C)C1C(=O)C(C(=O)NCNCCCCC(N)C(=O)O)=C(O)C2(O...</td>\n",
|
||
" <td>43.0</td>\n",
|
||
" <td>46.0</td>\n",
|
||
" <td>602.258793</td>\n",
|
||
" <td>CN(C)C1C(=O)C(C(=O)NCNCCCCC(N)C(=O)O)=C(O)C2(O...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Alfadolone acetate</th>\n",
|
||
" <td>Prestw-1001</td>\n",
|
||
" <td>CID100024733</td>\n",
|
||
" <td>N01AX</td>\n",
|
||
" <td>human</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>CC(=O)OCC(=O)C1CCC2C1(CC(=O)C3C2CCC4C3(CCC(C4)...</td>\n",
|
||
" <td>CC(=O)OCC(=O)[C@H]1CC[C@@H]2[C@@]1(CC(=O)[C@H]...</td>\n",
|
||
" <td>InChI=1S/C23H34O5/c1-13(24)28-12-20(27)18-7-6-...</td>\n",
|
||
" <td>QRJOQYLXZPQQMX-FWROMSNXSA-N</td>\n",
|
||
" <td>CC(=O)OCC(=O)C1CCC2C3CCC4CC(O)CCC4(C)C3C(=O)CC12C</td>\n",
|
||
" <td>CC(=O)OCC(=O)C1CCC2C3CCC4CC(O)CCC4(C)C3C(=O)CC12C</td>\n",
|
||
" <td>28.0</td>\n",
|
||
" <td>31.0</td>\n",
|
||
" <td>390.240624</td>\n",
|
||
" <td>CC(=O)OCC(=O)C1CCC2C3CCC4CC(O)CCC4(C)C3C(=O)CC12C</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 29 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" prestwick_ID STITCH4 id ATC codes target species \\\n",
|
||
"chemical name \n",
|
||
"Azaguanine-8 Prestw-1 CID100008646 L01BB human \n",
|
||
"Sulfaguanidine Prestw-10 CID100005324 A07AB03 bacteria \n",
|
||
"Nocodazole Prestw-100 CID100004122 L01XX human \n",
|
||
"Lymecycline Prestw-1000 CID154687131 J01AA04 bacteria \n",
|
||
"Alfadolone acetate Prestw-1001 CID100024733 N01AX human \n",
|
||
"\n",
|
||
" dose (µmol) estimated intestine concentration (µM) \\\n",
|
||
"chemical name \n",
|
||
"Azaguanine-8 NaN NaN \n",
|
||
"Sulfaguanidine 18670.009221 6223.336407 \n",
|
||
"Nocodazole NaN NaN \n",
|
||
"Lymecycline 995.607694 331.869231 \n",
|
||
"Alfadolone acetate NaN NaN \n",
|
||
"\n",
|
||
" plasma concentration (µM) source for plasma concentration \\\n",
|
||
"chemical name \n",
|
||
"Azaguanine-8 NaN NaN \n",
|
||
"Sulfaguanidine NaN NaN \n",
|
||
"Nocodazole NaN NaN \n",
|
||
"Lymecycline NaN NaN \n",
|
||
"Alfadolone acetate NaN NaN \n",
|
||
"\n",
|
||
" fraction excreted in feces fraction excreted in urine \\\n",
|
||
"chemical name \n",
|
||
"Azaguanine-8 NaN NaN \n",
|
||
"Sulfaguanidine NaN NaN \n",
|
||
"Nocodazole NaN NaN \n",
|
||
"Lymecycline NaN NaN \n",
|
||
"Alfadolone acetate NaN NaN \n",
|
||
"\n",
|
||
" ... pchem_canonical_smile \\\n",
|
||
"chemical name ... \n",
|
||
"Azaguanine-8 ... C12=NNN=C1N=C(NC2=O)N \n",
|
||
"Sulfaguanidine ... C1=CC(=CC=C1N)S(=O)(=O)N=C(N)N \n",
|
||
"Nocodazole ... COC(=O)NC1=NC2=C(N1)C=C(C=C2)C(=O)C3=CC=CS3 \n",
|
||
"Lymecycline ... CC1(C2CC3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4... \n",
|
||
"Alfadolone acetate ... CC(=O)OCC(=O)C1CCC2C1(CC(=O)C3C2CCC4C3(CCC(C4)... \n",
|
||
"\n",
|
||
" pchem_isomeric_smile \\\n",
|
||
"chemical name \n",
|
||
"Azaguanine-8 C12=NNN=C1N=C(NC2=O)N \n",
|
||
"Sulfaguanidine C1=CC(=CC=C1N)S(=O)(=O)N=C(N)N \n",
|
||
"Nocodazole COC(=O)NC1=NC2=C(N1)C=C(C=C2)C(=O)C3=CC=CS3 \n",
|
||
"Lymecycline C[C@@]1([C@H]2C[C@H]3[C@@H](C(=O)C(=C([C@]3(C(... \n",
|
||
"Alfadolone acetate CC(=O)OCC(=O)[C@H]1CC[C@@H]2[C@@]1(CC(=O)[C@H]... \n",
|
||
"\n",
|
||
" pchem_inchi \\\n",
|
||
"chemical name \n",
|
||
"Azaguanine-8 InChI=1S/C4H4N6O/c5-4-6-2-1(3(11)7-4)8-10-9-2/... \n",
|
||
"Sulfaguanidine InChI=1S/C7H10N4O2S/c8-5-1-3-6(4-2-5)14(12,13)... \n",
|
||
"Nocodazole InChI=1S/C14H11N3O3S/c1-20-14(19)17-13-15-9-5-... \n",
|
||
"Lymecycline InChI=1S/C29H38N4O10/c1-28(42)13-7-6-9-17(34)1... \n",
|
||
"Alfadolone acetate InChI=1S/C23H34O5/c1-13(24)28-12-20(27)18-7-6-... \n",
|
||
"\n",
|
||
" pchem_inchikey \\\n",
|
||
"chemical name \n",
|
||
"Azaguanine-8 LPXQRXLUHJKZIE-UHFFFAOYSA-N \n",
|
||
"Sulfaguanidine BRBKOPJOKNSWSG-UHFFFAOYSA-N \n",
|
||
"Nocodazole KYRVNWMVYQXFEU-UHFFFAOYSA-N \n",
|
||
"Lymecycline PZTCVADFMACKLU-UEPZRUIBSA-N \n",
|
||
"Alfadolone acetate QRJOQYLXZPQQMX-FWROMSNXSA-N \n",
|
||
"\n",
|
||
" rdkit_canonical_smile \\\n",
|
||
"chemical name \n",
|
||
"Azaguanine-8 Nc1nc2n[nH]nc2c(=O)[nH]1 \n",
|
||
"Sulfaguanidine NC(N)=NS(=O)(=O)c1ccc(N)cc1 \n",
|
||
"Nocodazole COC(=O)Nc1nc2ccc(C(=O)c3cccs3)cc2[nH]1 \n",
|
||
"Lymecycline CN(C)C1C(=O)C(C(=O)NCNCCCCC(N)C(=O)O)=C(O)C2(O... \n",
|
||
"Alfadolone acetate CC(=O)OCC(=O)C1CCC2C3CCC4CC(O)CCC4(C)C3C(=O)CC12C \n",
|
||
"\n",
|
||
" rdkit_isomeric_smile \\\n",
|
||
"chemical name \n",
|
||
"Azaguanine-8 Nc1nc2n[nH]nc2c(=O)[nH]1 \n",
|
||
"Sulfaguanidine NC(N)=NS(=O)(=O)c1ccc(N)cc1 \n",
|
||
"Nocodazole COC(=O)Nc1nc2ccc(C(=O)c3cccs3)cc2[nH]1 \n",
|
||
"Lymecycline CN(C)C1C(=O)C(C(=O)NCNCCCCC(N)C(=O)O)=C(O)C2(O... \n",
|
||
"Alfadolone acetate CC(=O)OCC(=O)C1CCC2C3CCC4CC(O)CCC4(C)C3C(=O)CC12C \n",
|
||
"\n",
|
||
" n_atoms n_bonds ExactMolWt \\\n",
|
||
"chemical name \n",
|
||
"Azaguanine-8 11.0 12.0 152.044659 \n",
|
||
"Sulfaguanidine 14.0 14.0 214.052447 \n",
|
||
"Nocodazole 21.0 23.0 301.052112 \n",
|
||
"Lymecycline 43.0 46.0 602.258793 \n",
|
||
"Alfadolone acetate 28.0 31.0 390.240624 \n",
|
||
"\n",
|
||
" rdkit_no_salt \n",
|
||
"chemical name \n",
|
||
"Azaguanine-8 Nc1nc2n[nH]nc2c(=O)[nH]1 \n",
|
||
"Sulfaguanidine NC(N)=NS(=O)(=O)c1ccc(N)cc1 \n",
|
||
"Nocodazole COC(=O)Nc1nc2ccc(C(=O)c3cccs3)cc2[nH]1 \n",
|
||
"Lymecycline CN(C)C1C(=O)C(C(=O)NCNCCCCC(N)C(=O)O)=C(O)C2(O... \n",
|
||
"Alfadolone acetate CC(=O)OCC(=O)C1CCC2C3CCC4CC(O)CCC4(C)C3C(=O)CC12C \n",
|
||
"\n",
|
||
"[5 rows x 29 columns]"
|
||
]
|
||
},
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"maier_chemicals.reset_index(inplace=True)\n",
|
||
"maier_chemicals.set_index(\"chemical name\", inplace=True)\n",
|
||
"\n",
|
||
"chem_smiles.set_index(\"name\", inplace=True)\n",
|
||
"\n",
|
||
"chemical_metadata = maier_chemicals.join(chem_smiles)\n",
|
||
"chemical_metadata.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"With that, we can write the final output"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"chemical_metadata.to_csv(os.path.join(OUTPUT_DIR, \"prestwick_library.tsv.gz\"), sep='\\t')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Molecular representation and data splitting. \n",
|
||
" \n",
|
||
"Now that we have determined the labels, we can represent the chemical library using MolE, ECFP4 and Chemical Descriptors. At the same time, we can split the dataset using scaffold splitting."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"chemical_metadata_screened = chemical_metadata.loc[chemical_metadata[\"prestwick_ID\"].isin(screen_df.index)]\n",
|
||
"chemical_metadata_screened.to_csv(os.path.join(OUTPUT_DIR, \"prestwick_library_screened.tsv.gz\"), sep='\\t')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"About to generate scaffolds\n",
|
||
"About to sort in scaffold sets\n",
|
||
"Representation dimension (1000) - Embedding dimension (8000)\n",
|
||
"../pretrained_model/model_ginconcat_btwin_100k_d8000_l0.0001/model.pth\n",
|
||
"x_embedding1.weight\n",
|
||
"x_embedding2.weight\n",
|
||
"gnns.0.mlp.0.weight\n",
|
||
"gnns.0.mlp.0.bias\n",
|
||
"gnns.0.mlp.1.weight\n",
|
||
"gnns.0.mlp.1.bias\n",
|
||
"gnns.0.mlp.1.running_mean\n",
|
||
"gnns.0.mlp.1.running_var\n",
|
||
"gnns.0.mlp.1.num_batches_tracked\n",
|
||
"gnns.0.mlp.3.weight\n",
|
||
"gnns.0.mlp.3.bias\n",
|
||
"gnns.0.edge_embedding1.weight\n",
|
||
"gnns.0.edge_embedding2.weight\n",
|
||
"gnns.1.mlp.0.weight\n",
|
||
"gnns.1.mlp.0.bias\n",
|
||
"gnns.1.mlp.1.weight\n",
|
||
"gnns.1.mlp.1.bias\n",
|
||
"gnns.1.mlp.1.running_mean\n",
|
||
"gnns.1.mlp.1.running_var\n",
|
||
"gnns.1.mlp.1.num_batches_tracked\n",
|
||
"gnns.1.mlp.3.weight\n",
|
||
"gnns.1.mlp.3.bias\n",
|
||
"gnns.1.edge_embedding1.weight\n",
|
||
"gnns.1.edge_embedding2.weight\n",
|
||
"gnns.2.mlp.0.weight\n",
|
||
"gnns.2.mlp.0.bias\n",
|
||
"gnns.2.mlp.1.weight\n",
|
||
"gnns.2.mlp.1.bias\n",
|
||
"gnns.2.mlp.1.running_mean\n",
|
||
"gnns.2.mlp.1.running_var\n",
|
||
"gnns.2.mlp.1.num_batches_tracked\n",
|
||
"gnns.2.mlp.3.weight\n",
|
||
"gnns.2.mlp.3.bias\n",
|
||
"gnns.2.edge_embedding1.weight\n",
|
||
"gnns.2.edge_embedding2.weight\n",
|
||
"gnns.3.mlp.0.weight\n",
|
||
"gnns.3.mlp.0.bias\n",
|
||
"gnns.3.mlp.1.weight\n",
|
||
"gnns.3.mlp.1.bias\n",
|
||
"gnns.3.mlp.1.running_mean\n",
|
||
"gnns.3.mlp.1.running_var\n",
|
||
"gnns.3.mlp.1.num_batches_tracked\n",
|
||
"gnns.3.mlp.3.weight\n",
|
||
"gnns.3.mlp.3.bias\n",
|
||
"gnns.3.edge_embedding1.weight\n",
|
||
"gnns.3.edge_embedding2.weight\n",
|
||
"gnns.4.mlp.0.weight\n",
|
||
"gnns.4.mlp.0.bias\n",
|
||
"gnns.4.mlp.1.weight\n",
|
||
"gnns.4.mlp.1.bias\n",
|
||
"gnns.4.mlp.1.running_mean\n",
|
||
"gnns.4.mlp.1.running_var\n",
|
||
"gnns.4.mlp.1.num_batches_tracked\n",
|
||
"gnns.4.mlp.3.weight\n",
|
||
"gnns.4.mlp.3.bias\n",
|
||
"gnns.4.edge_embedding1.weight\n",
|
||
"gnns.4.edge_embedding2.weight\n",
|
||
"batch_norms.0.weight\n",
|
||
"batch_norms.0.bias\n",
|
||
"batch_norms.0.running_mean\n",
|
||
"batch_norms.0.running_var\n",
|
||
"batch_norms.0.num_batches_tracked\n",
|
||
"batch_norms.1.weight\n",
|
||
"batch_norms.1.bias\n",
|
||
"batch_norms.1.running_mean\n",
|
||
"batch_norms.1.running_var\n",
|
||
"batch_norms.1.num_batches_tracked\n",
|
||
"batch_norms.2.weight\n",
|
||
"batch_norms.2.bias\n",
|
||
"batch_norms.2.running_mean\n",
|
||
"batch_norms.2.running_var\n",
|
||
"batch_norms.2.num_batches_tracked\n",
|
||
"batch_norms.3.weight\n",
|
||
"batch_norms.3.bias\n",
|
||
"batch_norms.3.running_mean\n",
|
||
"batch_norms.3.running_var\n",
|
||
"batch_norms.3.num_batches_tracked\n",
|
||
"batch_norms.4.weight\n",
|
||
"batch_norms.4.bias\n",
|
||
"batch_norms.4.running_mean\n",
|
||
"batch_norms.4.running_var\n",
|
||
"batch_norms.4.num_batches_tracked\n",
|
||
"feat_lin.weight\n",
|
||
"feat_lin.bias\n",
|
||
"out_lin.0.weight\n",
|
||
"out_lin.0.bias\n",
|
||
"out_lin.1.weight\n",
|
||
"out_lin.1.bias\n",
|
||
"out_lin.1.running_mean\n",
|
||
"out_lin.1.running_var\n",
|
||
"out_lin.1.num_batches_tracked\n",
|
||
"out_lin.3.weight\n",
|
||
"out_lin.3.bias\n",
|
||
"out_lin.4.weight\n",
|
||
"out_lin.4.bias\n",
|
||
"out_lin.4.running_mean\n",
|
||
"out_lin.4.running_var\n",
|
||
"out_lin.4.num_batches_tracked\n",
|
||
"out_lin.6.weight\n",
|
||
"out_lin.6.bias\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# MOLE REPRESENTATION\n",
|
||
"\n",
|
||
"maier_scaffold_split, mole_representation = process_dataset(dataset_path = os.path.join(OUTPUT_DIR, \"prestwick_library_screened.tsv.gz\"), \n",
|
||
" pretrain_architecture = \"gin_concat\", \n",
|
||
" pretrained_model = \"model_ginconcat_btwin_100k_d8000_l0.0001\", \n",
|
||
" \n",
|
||
" split_approach = \"scaffold\", \n",
|
||
" validation_proportion = 0.1, \n",
|
||
" test_proportion = 0.1, \n",
|
||
" \n",
|
||
" smile_column_str = \"rdkit_no_salt\", \n",
|
||
" id_column_str = \"prestwick_ID\") "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# ECFP4\n",
|
||
"\n",
|
||
"ecfp4_representation = process_dataset(dataset_path = os.path.join(OUTPUT_DIR, \"prestwick_library_screened.tsv.gz\"), \n",
|
||
" \n",
|
||
" pretrained_model = \"ECFP4\", \n",
|
||
" dataset_split=False,\n",
|
||
"\n",
|
||
" smile_column_str = \"rdkit_no_salt\", \n",
|
||
" id_column_str = \"prestwick_ID\") "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Could not compute descriptors for Prestw-919\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# CHEMICAL DESCRIPTORS\n",
|
||
"chemdesc_representation = process_dataset(dataset_path = os.path.join(OUTPUT_DIR, \"prestwick_library_screened.tsv.gz\"), \n",
|
||
" pretrained_model = \"ChemDesc\", \n",
|
||
" dataset_split=False,\n",
|
||
"\n",
|
||
" smile_column_str = \"pchem_isomeric_smile\", \n",
|
||
" id_column_str = \"prestwick_ID\")\n",
|
||
" \n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Prepare representations for 100K molecules from PubChem \n",
|
||
" \n",
|
||
"Here I prepare the MolE and ECFP4 representations of 100K randomly selected molecules from PubChem.\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"ecfp4_pubchem = process_dataset(dataset_path = os.path.join(RAW_DATA_DIR, \"pubchem_random\", \"pubchem_100k_selected_smiles.tsv.gz\"), \n",
|
||
" \n",
|
||
" pretrained_model = \"ECFP4\", \n",
|
||
" dataset_split=False,\n",
|
||
"\n",
|
||
" smile_column_str = \"smiles\", \n",
|
||
" id_column_str = \"chem_id\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Representation dimension (1000) - Embedding dimension (8000)\n",
|
||
"../pretrained_model/model_ginconcat_btwin_100k_d8000_l0.0001/model.pth\n",
|
||
"x_embedding1.weight\n",
|
||
"x_embedding2.weight\n",
|
||
"gnns.0.mlp.0.weight\n",
|
||
"gnns.0.mlp.0.bias\n",
|
||
"gnns.0.mlp.1.weight\n",
|
||
"gnns.0.mlp.1.bias\n",
|
||
"gnns.0.mlp.1.running_mean\n",
|
||
"gnns.0.mlp.1.running_var\n",
|
||
"gnns.0.mlp.1.num_batches_tracked\n",
|
||
"gnns.0.mlp.3.weight\n",
|
||
"gnns.0.mlp.3.bias\n",
|
||
"gnns.0.edge_embedding1.weight\n",
|
||
"gnns.0.edge_embedding2.weight\n",
|
||
"gnns.1.mlp.0.weight\n",
|
||
"gnns.1.mlp.0.bias\n",
|
||
"gnns.1.mlp.1.weight\n",
|
||
"gnns.1.mlp.1.bias\n",
|
||
"gnns.1.mlp.1.running_mean\n",
|
||
"gnns.1.mlp.1.running_var\n",
|
||
"gnns.1.mlp.1.num_batches_tracked\n",
|
||
"gnns.1.mlp.3.weight\n",
|
||
"gnns.1.mlp.3.bias\n",
|
||
"gnns.1.edge_embedding1.weight\n",
|
||
"gnns.1.edge_embedding2.weight\n",
|
||
"gnns.2.mlp.0.weight\n",
|
||
"gnns.2.mlp.0.bias\n",
|
||
"gnns.2.mlp.1.weight\n",
|
||
"gnns.2.mlp.1.bias\n",
|
||
"gnns.2.mlp.1.running_mean\n",
|
||
"gnns.2.mlp.1.running_var\n",
|
||
"gnns.2.mlp.1.num_batches_tracked\n",
|
||
"gnns.2.mlp.3.weight\n",
|
||
"gnns.2.mlp.3.bias\n",
|
||
"gnns.2.edge_embedding1.weight\n",
|
||
"gnns.2.edge_embedding2.weight\n",
|
||
"gnns.3.mlp.0.weight\n",
|
||
"gnns.3.mlp.0.bias\n",
|
||
"gnns.3.mlp.1.weight\n",
|
||
"gnns.3.mlp.1.bias\n",
|
||
"gnns.3.mlp.1.running_mean\n",
|
||
"gnns.3.mlp.1.running_var\n",
|
||
"gnns.3.mlp.1.num_batches_tracked\n",
|
||
"gnns.3.mlp.3.weight\n",
|
||
"gnns.3.mlp.3.bias\n",
|
||
"gnns.3.edge_embedding1.weight\n",
|
||
"gnns.3.edge_embedding2.weight\n",
|
||
"gnns.4.mlp.0.weight\n",
|
||
"gnns.4.mlp.0.bias\n",
|
||
"gnns.4.mlp.1.weight\n",
|
||
"gnns.4.mlp.1.bias\n",
|
||
"gnns.4.mlp.1.running_mean\n",
|
||
"gnns.4.mlp.1.running_var\n",
|
||
"gnns.4.mlp.1.num_batches_tracked\n",
|
||
"gnns.4.mlp.3.weight\n",
|
||
"gnns.4.mlp.3.bias\n",
|
||
"gnns.4.edge_embedding1.weight\n",
|
||
"gnns.4.edge_embedding2.weight\n",
|
||
"batch_norms.0.weight\n",
|
||
"batch_norms.0.bias\n",
|
||
"batch_norms.0.running_mean\n",
|
||
"batch_norms.0.running_var\n",
|
||
"batch_norms.0.num_batches_tracked\n",
|
||
"batch_norms.1.weight\n",
|
||
"batch_norms.1.bias\n",
|
||
"batch_norms.1.running_mean\n",
|
||
"batch_norms.1.running_var\n",
|
||
"batch_norms.1.num_batches_tracked\n",
|
||
"batch_norms.2.weight\n",
|
||
"batch_norms.2.bias\n",
|
||
"batch_norms.2.running_mean\n",
|
||
"batch_norms.2.running_var\n",
|
||
"batch_norms.2.num_batches_tracked\n",
|
||
"batch_norms.3.weight\n",
|
||
"batch_norms.3.bias\n",
|
||
"batch_norms.3.running_mean\n",
|
||
"batch_norms.3.running_var\n",
|
||
"batch_norms.3.num_batches_tracked\n",
|
||
"batch_norms.4.weight\n",
|
||
"batch_norms.4.bias\n",
|
||
"batch_norms.4.running_mean\n",
|
||
"batch_norms.4.running_var\n",
|
||
"batch_norms.4.num_batches_tracked\n",
|
||
"feat_lin.weight\n",
|
||
"feat_lin.bias\n",
|
||
"out_lin.0.weight\n",
|
||
"out_lin.0.bias\n",
|
||
"out_lin.1.weight\n",
|
||
"out_lin.1.bias\n",
|
||
"out_lin.1.running_mean\n",
|
||
"out_lin.1.running_var\n",
|
||
"out_lin.1.num_batches_tracked\n",
|
||
"out_lin.3.weight\n",
|
||
"out_lin.3.bias\n",
|
||
"out_lin.4.weight\n",
|
||
"out_lin.4.bias\n",
|
||
"out_lin.4.running_mean\n",
|
||
"out_lin.4.running_var\n",
|
||
"out_lin.4.num_batches_tracked\n",
|
||
"out_lin.6.weight\n",
|
||
"out_lin.6.bias\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"mole_pubchem = process_dataset(dataset_path = os.path.join(RAW_DATA_DIR, \"pubchem_random\", \"pubchem_100k_selected_smiles.tsv.gz\"), \n",
|
||
" \n",
|
||
" pretrain_architecture = \"gin_concat\", \n",
|
||
" pretrained_model = \"model_ginconcat_btwin_100k_d8000_l0.0001\",\n",
|
||
" \n",
|
||
" dataset_split=False,\n",
|
||
"\n",
|
||
" smile_column_str = \"smiles\", \n",
|
||
" id_column_str = \"chem_id\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Write files"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"#maier_scaffold_split.to_csv(os.path.join(OUTPUT_DIR, \"maier_scaffold_split.tsv.gz\"), sep='\\t')\n",
|
||
"#mole_representation.to_csv(os.path.join(OUTPUT_DIR, \"maier_mole_representation.tsv.gz\"), sep='\\t')\n",
|
||
"#ecfp4_representation.to_csv(os.path.join(OUTPUT_DIR, \"maier_ecfp4_representation.tsv.gz\"), sep='\\t')\n",
|
||
"#chemdesc_representation.to_csv(os.path.join(OUTPUT_DIR, \"maier_chemdesc_representation.tsv.gz\"), sep='\\t', index=False)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Break up the MolE representation of the PubChem molecules into four parts to fit the required file size."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"#mole_pubchem.iloc[0:25_000, :].to_csv(\"../data/01.prepare_training_data/pubchem_mole_representation_p1.tsv.gz\", sep='\\t')\n",
|
||
"#mole_pubchem.iloc[25_000:50_000, :].to_csv(\"../data/01.prepare_training_data/pubchem_mole_representation_p2.tsv.gz\", sep='\\t')\n",
|
||
"#mole_pubchem.iloc[50_000:75_000, :].to_csv(\"../data/01.prepare_training_data/pubchem_mole_representation_p3.tsv.gz\", sep='\\t')\n",
|
||
"#mole_pubchem.iloc[75_000:, :].to_csv(\"../data/01.prepare_training_data/pubchem_mole_representation_p4.tsv.gz\", sep='\\t')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"#ecfp4_pubchem.to_csv(os.path.join(OUTPUT_DIR, \"pubchem_ecfp4_representation.tsv.gz\"))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Written files"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 25,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"['pubchem_mole_representation_p1.tsv.gz',\n",
|
||
" 'maier_ecfp4_representation.tsv.gz',\n",
|
||
" 'prestwick_library.tsv.gz',\n",
|
||
" 'pubchem_mole_representation_p3.tsv.gz',\n",
|
||
" 'prestwick_library_screened.tsv.gz',\n",
|
||
" 'maier_scaffold_split.tsv.gz',\n",
|
||
" 'maier_chemdesc_representation.tsv.gz',\n",
|
||
" 'pubchem_mole_representation_p4.tsv.gz',\n",
|
||
" 'maier_mole_representation.tsv.gz',\n",
|
||
" 'pubchem_mole_representation_p2.tsv.gz',\n",
|
||
" 'pubchem_ecfp4_representation.tsv.gz',\n",
|
||
" 'maier_screening_results.tsv.gz']"
|
||
]
|
||
},
|
||
"execution_count": 25,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"os.listdir(OUTPUT_DIR)"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "mole_test",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.8.20"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|