{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Prepare Training Data \n", "\n", "### Author: Roberto Olayo-Alarcon\n", " \n", "In this step we read the data from [Maier, L., et al. (2018)](https://www.nature.com/articles/nature25979) and prepare it for training, testing, and validation of the XGBoost model." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Read Libraries\n", "import os\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "from tqdm import tqdm\n", "import pubchempy as pcp\n", "\n", "from dataset.dataset_representation import process_dataset\n", "\n", "\n", "from rdkit import Chem\n", "from rdkit.Chem import Descriptors\n", "from rdkit.Chem.SaltRemover import SaltRemover" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Global variables " ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# Raw data dir\n", "RAW_DATA_DIR = \"../raw_data/\"\n", "\n", "# Directory from which to read the raw data\n", "INPUT_DIR = os.path.join(RAW_DATA_DIR, \"maier_microbiome\")\n", "\n", "# Create the output directory\n", "OUTPUT_DIR = \"../data/01.prepare_training_data\"\n", "os.makedirs(OUTPUT_DIR, exist_ok=True)\n", "\n", "# Pvalue cutoff for label determination\n", "PVAL_CUTOFF = 0.05" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read data and binarize\n", " \n", "Here we read in Supplementary Table 3 from the study." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | Akkermansia muciniphila (NT5021) | \n", "Bacteroides caccae (NT5050) | \n", "Bacteroides fragilis (ET) (NT5033) | \n", "Bacteroides fragilis (NT) (NT5003) | \n", "Bacteroides ovatus (NT5054) | \n", "Bacteroides thetaiotaomicron (NT5004) | \n", "Bacteroides uniformis (NT5002) | \n", "Bacteroides vulgatus (NT5001) | \n", "Bacteroides xylanisolvens (NT5064) | \n", "Bifidobacterium adolescentis (NT5022) | \n", "... | \n", "Parabacteroides merdae (NT5071) | \n", "Prevotella copri (NT5019) | \n", "Roseburia hominis (NT5079) | \n", "Roseburia intestinalis (NT5011) | \n", "Ruminococcus bromii (NT5045) | \n", "Ruminococcus gnavus (NT5046) | \n", "Ruminococcus torques (NT5047) | \n", "Streptococcus parasanguinis (NT5072) | \n", "Streptococcus salivarius (NT5038) | \n", "Veillonella parvula (NT5017) | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| prestwick_ID | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
| Prestw-1109 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "... | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "
| Prestw-1399 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "... | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "
| Prestw-145 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "... | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "
| Prestw-1464 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "... | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "
| Prestw-31 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "... | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "
5 rows × 40 columns
\n", "| \n", " | prestwick_ID | \n", "STITCH4 id | \n", "ATC codes | \n", "target species | \n", "dose (µmol) | \n", "estimated intestine concentration (µM) | \n", "plasma concentration (µM) | \n", "source for plasma concentration | \n", "fraction excreted in feces | \n", "fraction excreted in urine | \n", "... | \n", "pchem_canonical_smile | \n", "pchem_isomeric_smile | \n", "pchem_inchi | \n", "pchem_inchikey | \n", "rdkit_canonical_smile | \n", "rdkit_isomeric_smile | \n", "n_atoms | \n", "n_bonds | \n", "ExactMolWt | \n", "rdkit_no_salt | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| chemical name | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
| Azaguanine-8 | \n", "Prestw-1 | \n", "CID100008646 | \n", "L01BB | \n", "human | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "C12=NNN=C1N=C(NC2=O)N | \n", "C12=NNN=C1N=C(NC2=O)N | \n", "InChI=1S/C4H4N6O/c5-4-6-2-1(3(11)7-4)8-10-9-2/... | \n", "LPXQRXLUHJKZIE-UHFFFAOYSA-N | \n", "Nc1nc2n[nH]nc2c(=O)[nH]1 | \n", "Nc1nc2n[nH]nc2c(=O)[nH]1 | \n", "11.0 | \n", "12.0 | \n", "152.044659 | \n", "Nc1nc2n[nH]nc2c(=O)[nH]1 | \n", "
| Sulfaguanidine | \n", "Prestw-10 | \n", "CID100005324 | \n", "A07AB03 | \n", "bacteria | \n", "18670.009221 | \n", "6223.336407 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "C1=CC(=CC=C1N)S(=O)(=O)N=C(N)N | \n", "C1=CC(=CC=C1N)S(=O)(=O)N=C(N)N | \n", "InChI=1S/C7H10N4O2S/c8-5-1-3-6(4-2-5)14(12,13)... | \n", "BRBKOPJOKNSWSG-UHFFFAOYSA-N | \n", "NC(N)=NS(=O)(=O)c1ccc(N)cc1 | \n", "NC(N)=NS(=O)(=O)c1ccc(N)cc1 | \n", "14.0 | \n", "14.0 | \n", "214.052447 | \n", "NC(N)=NS(=O)(=O)c1ccc(N)cc1 | \n", "
| Nocodazole | \n", "Prestw-100 | \n", "CID100004122 | \n", "L01XX | \n", "human | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "COC(=O)NC1=NC2=C(N1)C=C(C=C2)C(=O)C3=CC=CS3 | \n", "COC(=O)NC1=NC2=C(N1)C=C(C=C2)C(=O)C3=CC=CS3 | \n", "InChI=1S/C14H11N3O3S/c1-20-14(19)17-13-15-9-5-... | \n", "KYRVNWMVYQXFEU-UHFFFAOYSA-N | \n", "COC(=O)Nc1nc2ccc(C(=O)c3cccs3)cc2[nH]1 | \n", "COC(=O)Nc1nc2ccc(C(=O)c3cccs3)cc2[nH]1 | \n", "21.0 | \n", "23.0 | \n", "301.052112 | \n", "COC(=O)Nc1nc2ccc(C(=O)c3cccs3)cc2[nH]1 | \n", "
| Lymecycline | \n", "Prestw-1000 | \n", "CID154687131 | \n", "J01AA04 | \n", "bacteria | \n", "995.607694 | \n", "331.869231 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "CC1(C2CC3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4... | \n", "C[C@@]1([C@H]2C[C@H]3[C@@H](C(=O)C(=C([C@]3(C(... | \n", "InChI=1S/C29H38N4O10/c1-28(42)13-7-6-9-17(34)1... | \n", "PZTCVADFMACKLU-UEPZRUIBSA-N | \n", "CN(C)C1C(=O)C(C(=O)NCNCCCCC(N)C(=O)O)=C(O)C2(O... | \n", "CN(C)C1C(=O)C(C(=O)NCNCCCCC(N)C(=O)O)=C(O)C2(O... | \n", "43.0 | \n", "46.0 | \n", "602.258793 | \n", "CN(C)C1C(=O)C(C(=O)NCNCCCCC(N)C(=O)O)=C(O)C2(O... | \n", "
| Alfadolone acetate | \n", "Prestw-1001 | \n", "CID100024733 | \n", "N01AX | \n", "human | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "CC(=O)OCC(=O)C1CCC2C1(CC(=O)C3C2CCC4C3(CCC(C4)... | \n", "CC(=O)OCC(=O)[C@H]1CC[C@@H]2[C@@]1(CC(=O)[C@H]... | \n", "InChI=1S/C23H34O5/c1-13(24)28-12-20(27)18-7-6-... | \n", "QRJOQYLXZPQQMX-FWROMSNXSA-N | \n", "CC(=O)OCC(=O)C1CCC2C3CCC4CC(O)CCC4(C)C3C(=O)CC12C | \n", "CC(=O)OCC(=O)C1CCC2C3CCC4CC(O)CCC4(C)C3C(=O)CC12C | \n", "28.0 | \n", "31.0 | \n", "390.240624 | \n", "CC(=O)OCC(=O)C1CCC2C3CCC4CC(O)CCC4(C)C3C(=O)CC12C | \n", "
5 rows × 29 columns
\n", "