{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [
 "from pathlib import Path\n",
 "import pandas as pd\n",
 "\n",
 "# Locations of the two input tables, relative to the notebook's working directory.\n",
 "data_file = Path('../../data/Macro16_SIME_Synthesis/fixed_macrolides_2025.csv')\n",
 "macrolactondb_file = Path('../../data/MacrolactoneDB/ring16/temp.csv')\n",
 "\n",
 "# Read each table and add a constant 'synthesis' column recording which source a row came from.\n",
 "synthesis_df = pd.read_csv(data_file).assign(synthesis='SIME')\n",
 "macrolactondb_df = pd.read_csv(macrolactondb_file).assign(synthesis='MacrolactonDB')"
 ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDsmolecule_pref_namemax_pChEMBLmax_pChEMBL_target# Target OrganismsTarget Organisms# Known TargetsKnown Targetstarget_pref_namesmilessynthesis
0CHEMBL94657PATUPILONE10.67CHEMBL1075590695Sus scrofa, Mus musculus, None, Plasmodium fal...695CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE...AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6...C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[...MacrolactonDB
1CHEMBL94657PATUPILONE10.67CHEMBL1075590695Sus scrofa, Mus musculus, None, Plasmodium fal...695CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE...AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6...C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[...MacrolactonDB
2CHEMBL1554DACTINOMYCIN10.10CHEMBL614533177Giardia intestinalis, Trypanosoma cruzi, Equus...177CHEMBL388, CHEMBL614151, CHEMBL3577, CHEMBL551...HT-29, CCRF-CEM, WIL2-NS, Unchecked, Caspase-7...Cc1c2oc3c(C)ccc(C(=O)N[C@@H]4C(=O)N[C@H](C(C)C...MacrolactonDB
3CHEMBL1173445LARGAZOLE8.80CHEMBL61254545Homo sapiens, None45CHEMBL392, CHEMBL3192, CHEMBL3524, CHEMBL5103,...Histone deacetylase 9, Ubiquitin-like modifier...CCCCCCCC(=O)SCC/C=C/[C@@H]1CC(=O)NCc2nc(cs2)C2...MacrolactonDB
4CHEMBL3902498NaN9.37CHEMBL2095194,CHEMBL399117Homo sapiens, None17CHEMBL2820, CHEMBL3991, CHEMBL1801, CHEMBL204,...Coagulation factor X, Kallikrein 1, Coagulatio...Cc1cc2ccc1[C@@H](C)COC(=O)Nc1ccc(S(=O)(=O)C3CC...MacrolactonDB
\n", "
" ], "text/plain": [ " IDs molecule_pref_name max_pChEMBL max_pChEMBL_target \\\n", "0 CHEMBL94657 PATUPILONE 10.67 CHEMBL1075590 \n", "1 CHEMBL94657 PATUPILONE 10.67 CHEMBL1075590 \n", "2 CHEMBL1554 DACTINOMYCIN 10.10 CHEMBL614533 \n", "3 CHEMBL1173445 LARGAZOLE 8.80 CHEMBL612545 \n", "4 CHEMBL3902498 NaN 9.37 CHEMBL2095194,CHEMBL3991 \n", "\n", " # Target Organisms Target Organisms \\\n", "0 695 Sus scrofa, Mus musculus, None, Plasmodium fal... \n", "1 695 Sus scrofa, Mus musculus, None, Plasmodium fal... \n", "2 177 Giardia intestinalis, Trypanosoma cruzi, Equus... \n", "3 45 Homo sapiens, None \n", "4 17 Homo sapiens, None \n", "\n", " # Known Targets Known Targets \\\n", "0 695 CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... \n", "1 695 CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... \n", "2 177 CHEMBL388, CHEMBL614151, CHEMBL3577, CHEMBL551... \n", "3 45 CHEMBL392, CHEMBL3192, CHEMBL3524, CHEMBL5103,... \n", "4 17 CHEMBL2820, CHEMBL3991, CHEMBL1801, CHEMBL204,... \n", "\n", " target_pref_name \\\n", "0 AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... \n", "1 AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... \n", "2 HT-29, CCRF-CEM, WIL2-NS, Unchecked, Caspase-7... \n", "3 Histone deacetylase 9, Ubiquitin-like modifier... \n", "4 Coagulation factor X, Kallikrein 1, Coagulatio... \n", "\n", " smiles synthesis \n", "0 C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[... MacrolactonDB \n", "1 C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[... MacrolactonDB \n", "2 Cc1c2oc3c(C)ccc(C(=O)N[C@@H]4C(=O)N[C@H](C(C)C... MacrolactonDB \n", "3 CCCCCCCC(=O)SCC/C=C/[C@@H]1CC(=O)NCc2nc(cs2)C2... MacrolactonDB \n", "4 Cc1cc2ccc1[C@@H](C)COC(=O)Nc1ccc(S(=O)(=O)C3CC... 
MacrolactonDB " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "macrolactondb_df.head()" ] },
 { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "# Target Organisms\n", "0 1587\n", "1 195\n", "2 99\n", "3 36\n", "4 31\n", "5 27\n", "6 18\n", "7 9\n", "8 7\n", "9 5\n", "15 2\n", "695 2\n", "10 1\n", "177 1\n", "45 1\n", "17 1\n", "Name: count, dtype: int64\n" ] } ], "source": [
 "# Frequency of each distinct value in the '# Target Organisms' column.\n",
 "counts = macrolactondb_df['# Target Organisms'].value_counts(sort=True, ascending=False)\n",
 "\n",
 "# Most frequent value first (the value_counts default ordering, spelled out above).\n",
 "print(counts)"
 ] },
 { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "是否所有值都相等: True\n" ] } ], "source": [
 "# Row-wise comparison of the two count columns; the result is a boolean Series.\n",
 "equal_mask = macrolactondb_df[\"# Target Organisms\"].eq(macrolactondb_df[\"# Known Targets\"])\n",
 "\n",
 "# True only when every row agrees.\n",
 "all_equal = equal_mask.all()\n",
 "print(\"是否所有值都相等:\", all_equal)\n",
 "\n",
 "# When some rows disagree, show exactly those rows.\n",
 "if not all_equal:\n",
 "    unequal_rows = macrolactondb_df.loc[~equal_mask]\n",
 "    print(\"不相等的行:\")\n",
 "    print(unequal_rows)"
 ] },
 { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Target Organisms\n", "Homo sapiens 259\n", "Homo sapiens, None 52\n", "Mus musculus 19\n", "Human immunodeficiency virus 1 7\n", "Plasmodium falciparum 4\n", "Mus musculus, Homo sapiens, None 3\n", "Mus musculus, Homo sapiens 3\n", "Staphylococcus aureus 2\n", "Sus scrofa, Mus musculus, None, Plasmodium falciparum, Homo sapiens, Rattus norvegicus 2\n", "None, Rattus norvegicus 2\n", "Escherichia coli (strain K12) 2\n", "Giardia intestinalis, Trypanosoma cruzi, Equus caballus, Bos taurus, Mus musculus, None, Plasmodium falciparum, Chlorocebus aethiops, Homo sapiens 1\n", "Homo sapiens, None, Trypanosoma brucei brucei 1\n", "Plasmodium falciparum NF54, Trypanosoma cruzi, Trypanosoma brucei rhodesiense 1\n", "Homo sapiens, Equus caballus 1\n", "Mus musculus, Homo sapiens, Rattus norvegicus 1\n", "Plasmodium falciparum NF54, Trypanosoma cruzi, Trypanosoma brucei rhodesiense, Rattus norvegicus 1\n", "Schistosoma mansoni, Influenza A virus 1\n", "Homo sapiens, Gallus gallus 1\n", "Homo sapiens, Sus scrofa 1\n", "Bacillus subtilis 1\n", "Name: count, dtype: int64\n" ] } ], "source": [
 "# Frequency of each distinct 'Target Organisms' string (the organism-name column,\n",
 "# not the numeric '# Target Organisms' column counted earlier).\n",
 "counts = macrolactondb_df['Target Organisms'].value_counts(sort=True, ascending=False)\n",
 "\n",
 "# Most frequent value first.\n",
 "print(counts)"
 ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDsmolecule_pref_namemax_pChEMBLmax_pChEMBL_target# Target OrganismsTarget Organisms# Known TargetsKnown Targetstarget_pref_namesmilessynthesis
284CHEMBL3342324NaN6.52CHEMBL33415781Escherichia coli (strain K12)1CHEMBL3341578ATP-dependent Clp protease proteolytic subunitC/C=C/C=C/C=C/C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@...MacrolactonDB
338CHEMBL3342325NaN6.16CHEMBL33415781Escherichia coli (strain K12)1CHEMBL3341578ATP-dependent Clp protease proteolytic subunitC/C=C/C=C/C=C/C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@...MacrolactonDB
341CHEMBL3742171NaN6.52CHEMBL16816161Staphylococcus aureus1CHEMBL1681616Accessory gene regulator protein ACC(C)[C@@H]1NC(=O)C[C@H](/C=C/c2ccccc2)OC(=O)[...MacrolactonDB
350CHEMBL1784527TURNAGAINOLIDE A6.10CHEMBL16816161Staphylococcus aureus1CHEMBL1681616Accessory gene regulator protein ACC[C@H](C)[C@@H]1NC(=O)[C@@H](C)NC(=O)[C@H](C(...MacrolactonDB
392CHEMBL1317175NaN5.55CHEMBL12932481Bacillus subtilis1CHEMBL12932484'-phosphopantetheinyl transferase ffpCO/C1=C\\C(C)=C\\[C@@H](C)[C@@H](O)[C@@H](C)C/C(...MacrolactonDB
\n", "
" ], "text/plain": [ " IDs molecule_pref_name max_pChEMBL max_pChEMBL_target \\\n", "284 CHEMBL3342324 NaN 6.52 CHEMBL3341578 \n", "338 CHEMBL3342325 NaN 6.16 CHEMBL3341578 \n", "341 CHEMBL3742171 NaN 6.52 CHEMBL1681616 \n", "350 CHEMBL1784527 TURNAGAINOLIDE A 6.10 CHEMBL1681616 \n", "392 CHEMBL1317175 NaN 5.55 CHEMBL1293248 \n", "\n", " # Target Organisms Target Organisms # Known Targets \\\n", "284 1 Escherichia coli (strain K12) 1 \n", "338 1 Escherichia coli (strain K12) 1 \n", "341 1 Staphylococcus aureus 1 \n", "350 1 Staphylococcus aureus 1 \n", "392 1 Bacillus subtilis 1 \n", "\n", " Known Targets target_pref_name \\\n", "284 CHEMBL3341578 ATP-dependent Clp protease proteolytic subunit \n", "338 CHEMBL3341578 ATP-dependent Clp protease proteolytic subunit \n", "341 CHEMBL1681616 Accessory gene regulator protein A \n", "350 CHEMBL1681616 Accessory gene regulator protein A \n", "392 CHEMBL1293248 4'-phosphopantetheinyl transferase ffp \n", "\n", " smiles synthesis \n", "284 C/C=C/C=C/C=C/C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@... MacrolactonDB \n", "338 C/C=C/C=C/C=C/C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@... MacrolactonDB \n", "341 CC(C)[C@@H]1NC(=O)C[C@H](/C=C/c2ccccc2)OC(=O)[... MacrolactonDB \n", "350 CC[C@H](C)[C@@H]1NC(=O)[C@@H](C)NC(=O)[C@H](C(... MacrolactonDB \n", "392 CO/C1=C\\C(C)=C\\[C@@H](C)[C@@H](O)[C@@H](C)C/C(... 
MacrolactonDB " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Keep only the compounds whose target organisms include one of the listed bacteria.\n",
 "import re\n",
 "\n",
 "# Bacterial organism names to look for.\n",
 "bacteria = ['Staphylococcus aureus', 'Escherichia coli', 'Bacillus subtilis']\n",
 "\n",
 "# Build a single alternation pattern. re.escape keeps every name a literal match, so a\n",
 "# name containing regex metacharacters, e.g. 'Escherichia coli (strain K12)', can be\n",
 "# added to the list without changing what the other alternatives match.\n",
 "pattern = '|'.join(re.escape(name) for name in bacteria)\n",
 "\n",
 "# Case-insensitive substring match on 'Target Organisms'; rows with a missing value are excluded.\n",
 "is_antibacterial = macrolactondb_df['Target Organisms'].str.contains(pattern, case=False, na=False, regex=True)\n",
 "macrolide_16_antibacterial_df = macrolactondb_df[is_antibacterial]\n",
 "\n",
 "# Display the filtered rows.\n",
 "macrolide_16_antibacterial_df\n"
 ] },
 { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 695\n", "1 695\n", "2 177\n", "3 45\n", "4 17\n", " ... \n", "2017 0\n", "2018 0\n", "2019 0\n", "2020 0\n", "2021 0\n", "Name: # Target Organisms, Length: 2022, dtype: int64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "macrolactondb_df['# Target Organisms']" ] },
 { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 45, 15, 177, 17, 695}\n" ] } ], "source": [ "print(set(macrolactondb_df['# Target Organisms'].to_list()))" ] },
 { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "16" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(set(macrolactondb_df['# Target Organisms'].to_list()))" ] },
 { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# macrolactondb_df[\"synthesis\"] = macrolactondb_df[\"# Target Organisms\"]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
original_smilesfixed_smilesstatusmessagesynthesis
0O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...corrected修复原子 28SIME
1O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...corrected修复原子 28SIME
2O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...corrected修复原子 28SIME
3O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...corrected修复原子 28SIME
4O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...corrected修复原子 28SIME
\n", "
" ], "text/plain": [ " original_smiles \\\n", "0 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n", "1 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n", "2 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n", "3 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n", "4 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n", "\n", " fixed_smiles status message \\\n", "0 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n", "1 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n", "2 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n", "3 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n", "4 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n", "\n", " synthesis \n", "0 SIME \n", "1 SIME \n", "2 SIME \n", "3 SIME \n", "4 SIME " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Preview the first five rows of the synthesized-molecule table.\n",
 "synthesis_df.head(5)"
 ] },
 { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [
 "# Rename 'fixed_smiles' to 'smiles' so the column name matches macrolactondb_df,\n",
 "# which already carries its structures in a 'smiles' column.\n",
 "synthesis_df = synthesis_df.rename({'fixed_smiles': 'smiles'}, axis='columns')"
 ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
original_smilessmilesstatusmessagesynthesis
0O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...corrected修复原子 28SIME
1O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...corrected修复原子 28SIME
2O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...corrected修复原子 28SIME
3O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...corrected修复原子 28SIME
4O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...corrected修复原子 28SIME
\n", "
" ], "text/plain": [ " original_smiles \\\n", "0 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n", "1 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n", "2 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n", "3 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n", "4 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n", "\n", " smiles status message \\\n", "0 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n", "1 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n", "2 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n", "3 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n", "4 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n", "\n", " synthesis \n", "0 SIME \n", "1 SIME \n", "2 SIME \n", "3 SIME \n", "4 SIME " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "synthesis_df.head()" ] },
 { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "merge_df = pd.concat([macrolactondb_df, synthesis_df], ignore_index=True, axis=0)" ] },
 { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "252023" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(merge_df)\n" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## 12-20 环大环内酯分析\n", "\n", "每一种环的抗菌药物数量,看下是哪一种环的抗菌药物最多。\n", "\n", "整体12-20环大环内酯的,抗菌药物有多少。" ] },
 { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "总计数据:11036\n" ] } ], "source": [
 "import pandas as pd\n",
 "\n",
 "# Load the ring 12-20 table and count how often each 'Target Organisms' string occurs.\n",
 "macrolactondb_df_ring12_20 = pd.read_csv('../../data/MacrolactoneDB/ring12_20/temp.csv')\n",
 "counts = macrolactondb_df_ring12_20['Target Organisms'].value_counts()\n",
 "\n",
 "# Write the counts as plain text. The encoding is pinned to UTF-8 so the file does not\n",
 "# depend on the platform's default locale encoding.\n",
 "with open('../../data/MacrolactoneDB/ring12_20/counts.txt', 'w', encoding='utf-8') as file:\n",
 "    file.write(counts.to_string())\n",
 "\n",
 "\n",
 "print(f\"总计数据:{len(macrolactondb_df_ring12_20)}\")\n"
 ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDsmolecule_pref_namemax_pChEMBLmax_pChEMBL_target# Target OrganismsTarget Organisms# Known TargetsKnown Targetstarget_pref_namesmiles
0CHEMBL94657PATUPILONE10.67CHEMBL1075590695Sus scrofa, Mus musculus, None, Plasmodium fal...695CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE...AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6...C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[...
1CHEMBL94657PATUPILONE10.67CHEMBL1075590695Sus scrofa, Mus musculus, None, Plasmodium fal...695CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE...AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6...C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[...
2CHEMBL449158BRYOSTATIN9.36CHEMBL2996664Homo sapiens, None, Rattus norvegicus664CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE...AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6...CCC/C=C/C=C/C(=O)O[C@H]1/C(=C/C(=O)OC)C[C@H]2C...
3CHEMBL1554DACTINOMYCIN10.10CHEMBL614533177Giardia intestinalis, Trypanosoma cruzi, Equus...177CHEMBL388, CHEMBL614151, CHEMBL3577, CHEMBL551...HT-29, CCRF-CEM, WIL2-NS, Unchecked, Caspase-7...Cc1c2oc3c(C)ccc(C(=O)N[C@@H]4C(=O)N[C@H](C(C)C...
4CHEMBL529AZITHROMYCIN8.59CHEMBL330141370None, Plasmodium falciparum, Escherichia coli,...70CHEMBL347, CHEMBL612313, CHEMBL354, CHEMBL3301...Streptococcus, Unchecked, Cytochrome P450 3A4,...CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(...
\n", "
" ], "text/plain": [ " IDs molecule_pref_name max_pChEMBL max_pChEMBL_target \\\n", "0 CHEMBL94657 PATUPILONE 10.67 CHEMBL1075590 \n", "1 CHEMBL94657 PATUPILONE 10.67 CHEMBL1075590 \n", "2 CHEMBL449158 BRYOSTATIN 9.36 CHEMBL2996 \n", "3 CHEMBL1554 DACTINOMYCIN 10.10 CHEMBL614533 \n", "4 CHEMBL529 AZITHROMYCIN 8.59 CHEMBL3301413 \n", "\n", " # Target Organisms Target Organisms \\\n", "0 695 Sus scrofa, Mus musculus, None, Plasmodium fal... \n", "1 695 Sus scrofa, Mus musculus, None, Plasmodium fal... \n", "2 664 Homo sapiens, None, Rattus norvegicus \n", "3 177 Giardia intestinalis, Trypanosoma cruzi, Equus... \n", "4 70 None, Plasmodium falciparum, Escherichia coli,... \n", "\n", " # Known Targets Known Targets \\\n", "0 695 CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... \n", "1 695 CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... \n", "2 664 CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... \n", "3 177 CHEMBL388, CHEMBL614151, CHEMBL3577, CHEMBL551... \n", "4 70 CHEMBL347, CHEMBL612313, CHEMBL354, CHEMBL3301... \n", "\n", " target_pref_name \\\n", "0 AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... \n", "1 AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... \n", "2 AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... \n", "3 HT-29, CCRF-CEM, WIL2-NS, Unchecked, Caspase-7... \n", "4 Streptococcus, Unchecked, Cytochrome P450 3A4,... \n", "\n", " smiles \n", "0 C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[... \n", "1 C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[... \n", "2 CCC/C=C/C=C/C(=O)O[C@H]1/C(=C/C(=O)OC)C[C@H]2C... \n", "3 Cc1c2oc3c(C)ccc(C(=O)N[C@@H]4C(=O)N[C@H](C(C)C... \n", "4 CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(... 
" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "macrolactondb_df_ring12_20.head()" ] },
 { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Target Organisms\n", "Homo sapiens 815\n", "Homo sapiens, None 180\n", "Plasmodium falciparum 161\n", "Hepatitis C virus, None 112\n", "Homo sapiens, Plasmodium falciparum 63\n", " ... \n", "Giardia intestinalis, Schistosoma mansoni, Mus musculus, None, Homo sapiens, Saccharomyces cerevisiae 1\n", "Trypanosoma cruzi 1\n", "Influenza A virus 1\n", "Escherichia coli K-12 1\n", "Human herpesvirus 4 (strain B95-8) 1\n", "Name: count, Length: 88, dtype: int64" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "macrolactondb_df_ring12_20['Target Organisms'].value_counts()" ] },
 { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "synthesis_df.to_parquet(\"../../data/Macro16_SIME_Synthesis/synthesis_with_sa.parquet\")  # faster than CSV" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## ChemPlot 绘制\n", "\n", "参考 utils/chem_viz.py" ] },
 { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "merge_df[[\"synthesis\", \"smiles\"]].to_csv('../../data/tSNE/merge.csv', index=False)" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "“RDKit设计的分子和天然分子在结构上有何不同?” → 用 tailored。\n", "\n", "“RDKit设计的分子是否覆盖了天然分子的化学空间?” → 用 structural。" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## 合成分子减少为1w进行可视化\n", "\n" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "import pandas as pd\n",
 "from chemplot import Plotter\n",
 "import numpy as np\n",
 "\n",
 "N_SAMPLES = 10000  # number of SIME molecules kept for the plot\n",
 "SEED = 42  # fixed seed so the random subsample is reproducible\n",
 "\n",
 "# 1. Stack both tables into one frame.\n",
 "merge_df = pd.concat([macrolactondb_df, synthesis_df], ignore_index=True, axis=0)\n",
 "\n",
 "# 2. Select the synthesized molecules. The 'synthesis' column holds the string label\n",
 "#    'SIME' (not the integer 1); comparing against 1 always gave an empty frame, which\n",
 "#    made the size check below raise on every run.\n",
 "synthesis_1_df = synthesis_df[synthesis_df['synthesis'] == 'SIME']\n",
 "\n",
 "# 3. Draw a random subsample of N_SAMPLES rows; fail loudly if there are not enough rows.\n",
 "if len(synthesis_1_df) < N_SAMPLES:\n",
 "    raise ValueError(f\"synthesis_df 中 synthesis='SIME' 的样本少于{N_SAMPLES:,}个。\")\n",
 "\n",
 "sampled_synthesis_1_df = synthesis_1_df.sample(n=N_SAMPLES, random_state=SEED).reset_index(drop=True)\n",
 "\n",
 "# 4. Combine the subsample with macrolactondb_df (ignore_index already renumbers the rows).\n",
 "final_df = pd.concat([macrolactondb_df, sampled_synthesis_1_df], ignore_index=True, axis=0)\n",
 "\n",
 "# 5. Build the ChemPlot plotter, using the 'synthesis' label as a categorical target.\n",
 "cp = Plotter.from_smiles(\n",
 "    final_df[\"smiles\"],\n",
 "    target=final_df[\"synthesis\"],\n",
 "    target_type=\"C\",\n",
 "    sim_type=\"structural\"\n",
 ")\n",
 "\n",
 "# Reduce dimensionality with t-SNE.\n",
 "cp.tsne()\n",
 "\n",
 "# Show the scatter plot.\n",
 "cp.visualize_plot(kind=\"scatter\", size=20, remove_outliers=False, is_colored=True, colorbar=False)\n",
 "\n",
 "# To save the figure to a file, uncomment the next line.\n",
 "# cp.visualize_plot(kind=\"scatter\", size=20, remove_outliers=False, is_colored=True, colorbar=False, filename=\"scatter_plot.png\")"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "qsarml", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 2 }