{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", "import pandas as pd\n", "\n", "# Locations of the two input CSV files.\n", "data_file = Path('../../data/Macro16_SIME_Synthesis/fixed_macrolides_2025.csv')\n", "macrolactondb_file = Path('../../data/MacrolactoneDB/ring16/temp.csv')\n", "\n", "# Load each table and label every row with its source in a 'synthesis' column.\n", "synthesis_df = pd.read_csv(data_file).assign(synthesis='SIME')\n", "macrolactondb_df = pd.read_csv(macrolactondb_file).assign(synthesis='MacrolactonDB')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | IDs | \n", "molecule_pref_name | \n", "max_pChEMBL | \n", "max_pChEMBL_target | \n", "# Target Organisms | \n", "Target Organisms | \n", "# Known Targets | \n", "Known Targets | \n", "target_pref_name | \n", "smiles | \n", "synthesis | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "CHEMBL94657 | \n", "PATUPILONE | \n", "10.67 | \n", "CHEMBL1075590 | \n", "695 | \n", "Sus scrofa, Mus musculus, None, Plasmodium fal... | \n", "695 | \n", "CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... | \n", "AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... | \n", "C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[... | \n", "MacrolactonDB | \n", "
| 1 | \n", "CHEMBL94657 | \n", "PATUPILONE | \n", "10.67 | \n", "CHEMBL1075590 | \n", "695 | \n", "Sus scrofa, Mus musculus, None, Plasmodium fal... | \n", "695 | \n", "CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... | \n", "AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... | \n", "C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[... | \n", "MacrolactonDB | \n", "
| 2 | \n", "CHEMBL1554 | \n", "DACTINOMYCIN | \n", "10.10 | \n", "CHEMBL614533 | \n", "177 | \n", "Giardia intestinalis, Trypanosoma cruzi, Equus... | \n", "177 | \n", "CHEMBL388, CHEMBL614151, CHEMBL3577, CHEMBL551... | \n", "HT-29, CCRF-CEM, WIL2-NS, Unchecked, Caspase-7... | \n", "Cc1c2oc3c(C)ccc(C(=O)N[C@@H]4C(=O)N[C@H](C(C)C... | \n", "MacrolactonDB | \n", "
| 3 | \n", "CHEMBL1173445 | \n", "LARGAZOLE | \n", "8.80 | \n", "CHEMBL612545 | \n", "45 | \n", "Homo sapiens, None | \n", "45 | \n", "CHEMBL392, CHEMBL3192, CHEMBL3524, CHEMBL5103,... | \n", "Histone deacetylase 9, Ubiquitin-like modifier... | \n", "CCCCCCCC(=O)SCC/C=C/[C@@H]1CC(=O)NCc2nc(cs2)C2... | \n", "MacrolactonDB | \n", "
| 4 | \n", "CHEMBL3902498 | \n", "NaN | \n", "9.37 | \n", "CHEMBL2095194,CHEMBL3991 | \n", "17 | \n", "Homo sapiens, None | \n", "17 | \n", "CHEMBL2820, CHEMBL3991, CHEMBL1801, CHEMBL204,... | \n", "Coagulation factor X, Kallikrein 1, Coagulatio... | \n", "Cc1cc2ccc1[C@@H](C)COC(=O)Nc1ccc(S(=O)(=O)C3CC... | \n", "MacrolactonDB | \n", "
| \n", " | IDs | \n", "molecule_pref_name | \n", "max_pChEMBL | \n", "max_pChEMBL_target | \n", "# Target Organisms | \n", "Target Organisms | \n", "# Known Targets | \n", "Known Targets | \n", "target_pref_name | \n", "smiles | \n", "synthesis | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|
| 284 | \n", "CHEMBL3342324 | \n", "NaN | \n", "6.52 | \n", "CHEMBL3341578 | \n", "1 | \n", "Escherichia coli (strain K12) | \n", "1 | \n", "CHEMBL3341578 | \n", "ATP-dependent Clp protease proteolytic subunit | \n", "C/C=C/C=C/C=C/C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@... | \n", "MacrolactonDB | \n", "
| 338 | \n", "CHEMBL3342325 | \n", "NaN | \n", "6.16 | \n", "CHEMBL3341578 | \n", "1 | \n", "Escherichia coli (strain K12) | \n", "1 | \n", "CHEMBL3341578 | \n", "ATP-dependent Clp protease proteolytic subunit | \n", "C/C=C/C=C/C=C/C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@... | \n", "MacrolactonDB | \n", "
| 341 | \n", "CHEMBL3742171 | \n", "NaN | \n", "6.52 | \n", "CHEMBL1681616 | \n", "1 | \n", "Staphylococcus aureus | \n", "1 | \n", "CHEMBL1681616 | \n", "Accessory gene regulator protein A | \n", "CC(C)[C@@H]1NC(=O)C[C@H](/C=C/c2ccccc2)OC(=O)[... | \n", "MacrolactonDB | \n", "
| 350 | \n", "CHEMBL1784527 | \n", "TURNAGAINOLIDE A | \n", "6.10 | \n", "CHEMBL1681616 | \n", "1 | \n", "Staphylococcus aureus | \n", "1 | \n", "CHEMBL1681616 | \n", "Accessory gene regulator protein A | \n", "CC[C@H](C)[C@@H]1NC(=O)[C@@H](C)NC(=O)[C@H](C(... | \n", "MacrolactonDB | \n", "
| 392 | \n", "CHEMBL1317175 | \n", "NaN | \n", "5.55 | \n", "CHEMBL1293248 | \n", "1 | \n", "Bacillus subtilis | \n", "1 | \n", "CHEMBL1293248 | \n", "4'-phosphopantetheinyl transferase ffp | \n", "CO/C1=C\\C(C)=C\\[C@@H](C)[C@@H](O)[C@@H](C)C/C(... | \n", "MacrolactonDB | \n", "
| \n", " | original_smiles | \n", "fixed_smiles | \n", "status | \n", "message | \n", "synthesis | \n", "
|---|---|---|---|---|---|
| 0 | \n", "O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... | \n", "CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... | \n", "corrected | \n", "修复原子 28 | \n", "SIME | \n", "
| 1 | \n", "O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... | \n", "CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... | \n", "corrected | \n", "修复原子 28 | \n", "SIME | \n", "
| 2 | \n", "O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... | \n", "CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... | \n", "corrected | \n", "修复原子 28 | \n", "SIME | \n", "
| 3 | \n", "O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... | \n", "CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... | \n", "corrected | \n", "修复原子 28 | \n", "SIME | \n", "
| 4 | \n", "O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... | \n", "CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... | \n", "corrected | \n", "修复原子 28 | \n", "SIME | \n", "
| \n", " | original_smiles | \n", "smiles | \n", "status | \n", "message | \n", "synthesis | \n", "
|---|---|---|---|---|---|
| 0 | \n", "O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... | \n", "CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... | \n", "corrected | \n", "修复原子 28 | \n", "SIME | \n", "
| 1 | \n", "O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... | \n", "CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... | \n", "corrected | \n", "修复原子 28 | \n", "SIME | \n", "
| 2 | \n", "O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... | \n", "CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... | \n", "corrected | \n", "修复原子 28 | \n", "SIME | \n", "
| 3 | \n", "O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... | \n", "CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... | \n", "corrected | \n", "修复原子 28 | \n", "SIME | \n", "
| 4 | \n", "O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... | \n", "CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... | \n", "corrected | \n", "修复原子 28 | \n", "SIME | \n", "
| \n", " | IDs | \n", "molecule_pref_name | \n", "max_pChEMBL | \n", "max_pChEMBL_target | \n", "# Target Organisms | \n", "Target Organisms | \n", "# Known Targets | \n", "Known Targets | \n", "target_pref_name | \n", "smiles | \n", "
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "CHEMBL94657 | \n", "PATUPILONE | \n", "10.67 | \n", "CHEMBL1075590 | \n", "695 | \n", "Sus scrofa, Mus musculus, None, Plasmodium fal... | \n", "695 | \n", "CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... | \n", "AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... | \n", "C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[... | \n", "
| 1 | \n", "CHEMBL94657 | \n", "PATUPILONE | \n", "10.67 | \n", "CHEMBL1075590 | \n", "695 | \n", "Sus scrofa, Mus musculus, None, Plasmodium fal... | \n", "695 | \n", "CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... | \n", "AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... | \n", "C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[... | \n", "
| 2 | \n", "CHEMBL449158 | \n", "BRYOSTATIN | \n", "9.36 | \n", "CHEMBL2996 | \n", "664 | \n", "Homo sapiens, None, Rattus norvegicus | \n", "664 | \n", "CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... | \n", "AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... | \n", "CCC/C=C/C=C/C(=O)O[C@H]1/C(=C/C(=O)OC)C[C@H]2C... | \n", "
| 3 | \n", "CHEMBL1554 | \n", "DACTINOMYCIN | \n", "10.10 | \n", "CHEMBL614533 | \n", "177 | \n", "Giardia intestinalis, Trypanosoma cruzi, Equus... | \n", "177 | \n", "CHEMBL388, CHEMBL614151, CHEMBL3577, CHEMBL551... | \n", "HT-29, CCRF-CEM, WIL2-NS, Unchecked, Caspase-7... | \n", "Cc1c2oc3c(C)ccc(C(=O)N[C@@H]4C(=O)N[C@H](C(C)C... | \n", "
| 4 | \n", "CHEMBL529 | \n", "AZITHROMYCIN | \n", "8.59 | \n", "CHEMBL3301413 | \n", "70 | \n", "None, Plasmodium falciparum, Escherichia coli,... | \n", "70 | \n", "CHEMBL347, CHEMBL612313, CHEMBL354, CHEMBL3301... | \n", "Streptococcus, Unchecked, Cytochrome P450 3A4,... | \n", "CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(... | \n", "