Files
SIME/test/macro16_SIME_synthesis_tSNE.ipynb
mm644706215 ea218a3a39 update
2025-10-16 17:26:35 +08:00

1145 lines
44 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"import pandas as pd\n",
"\n",
"data_file = Path('../../data/Macro16_SIME_Synthesis/fixed_macrolides_2025.csv')\n",
"macrolactondb_file = Path('../../data/MacrolactoneDB/ring16/temp.csv')\n",
"synthesis_df = pd.read_csv(data_file)\n",
"synthesis_df['synthesis'] = 'SIME'\n",
"macrolactondb_df = pd.read_csv(macrolactondb_file)\n",
"macrolactondb_df['synthesis'] = 'MacrolactonDB'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>IDs</th>\n",
" <th>molecule_pref_name</th>\n",
" <th>max_pChEMBL</th>\n",
" <th>max_pChEMBL_target</th>\n",
" <th># Target Organisms</th>\n",
" <th>Target Organisms</th>\n",
" <th># Known Targets</th>\n",
" <th>Known Targets</th>\n",
" <th>target_pref_name</th>\n",
" <th>smiles</th>\n",
" <th>synthesis</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>CHEMBL94657</td>\n",
" <td>PATUPILONE</td>\n",
" <td>10.67</td>\n",
" <td>CHEMBL1075590</td>\n",
" <td>695</td>\n",
" <td>Sus scrofa, Mus musculus, None, Plasmodium fal...</td>\n",
" <td>695</td>\n",
" <td>CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE...</td>\n",
" <td>AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6...</td>\n",
" <td>C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[...</td>\n",
" <td>MacrolactonDB</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>CHEMBL94657</td>\n",
" <td>PATUPILONE</td>\n",
" <td>10.67</td>\n",
" <td>CHEMBL1075590</td>\n",
" <td>695</td>\n",
" <td>Sus scrofa, Mus musculus, None, Plasmodium fal...</td>\n",
" <td>695</td>\n",
" <td>CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE...</td>\n",
" <td>AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6...</td>\n",
" <td>C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[...</td>\n",
" <td>MacrolactonDB</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>CHEMBL1554</td>\n",
" <td>DACTINOMYCIN</td>\n",
" <td>10.10</td>\n",
" <td>CHEMBL614533</td>\n",
" <td>177</td>\n",
" <td>Giardia intestinalis, Trypanosoma cruzi, Equus...</td>\n",
" <td>177</td>\n",
" <td>CHEMBL388, CHEMBL614151, CHEMBL3577, CHEMBL551...</td>\n",
" <td>HT-29, CCRF-CEM, WIL2-NS, Unchecked, Caspase-7...</td>\n",
" <td>Cc1c2oc3c(C)ccc(C(=O)N[C@@H]4C(=O)N[C@H](C(C)C...</td>\n",
" <td>MacrolactonDB</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>CHEMBL1173445</td>\n",
" <td>LARGAZOLE</td>\n",
" <td>8.80</td>\n",
" <td>CHEMBL612545</td>\n",
" <td>45</td>\n",
" <td>Homo sapiens, None</td>\n",
" <td>45</td>\n",
" <td>CHEMBL392, CHEMBL3192, CHEMBL3524, CHEMBL5103,...</td>\n",
" <td>Histone deacetylase 9, Ubiquitin-like modifier...</td>\n",
" <td>CCCCCCCC(=O)SCC/C=C/[C@@H]1CC(=O)NCc2nc(cs2)C2...</td>\n",
" <td>MacrolactonDB</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>CHEMBL3902498</td>\n",
" <td>NaN</td>\n",
" <td>9.37</td>\n",
" <td>CHEMBL2095194,CHEMBL3991</td>\n",
" <td>17</td>\n",
" <td>Homo sapiens, None</td>\n",
" <td>17</td>\n",
" <td>CHEMBL2820, CHEMBL3991, CHEMBL1801, CHEMBL204,...</td>\n",
" <td>Coagulation factor X, Kallikrein 1, Coagulatio...</td>\n",
" <td>Cc1cc2ccc1[C@@H](C)COC(=O)Nc1ccc(S(=O)(=O)C3CC...</td>\n",
" <td>MacrolactonDB</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" IDs molecule_pref_name max_pChEMBL max_pChEMBL_target \\\n",
"0 CHEMBL94657 PATUPILONE 10.67 CHEMBL1075590 \n",
"1 CHEMBL94657 PATUPILONE 10.67 CHEMBL1075590 \n",
"2 CHEMBL1554 DACTINOMYCIN 10.10 CHEMBL614533 \n",
"3 CHEMBL1173445 LARGAZOLE 8.80 CHEMBL612545 \n",
"4 CHEMBL3902498 NaN 9.37 CHEMBL2095194,CHEMBL3991 \n",
"\n",
" # Target Organisms Target Organisms \\\n",
"0 695 Sus scrofa, Mus musculus, None, Plasmodium fal... \n",
"1 695 Sus scrofa, Mus musculus, None, Plasmodium fal... \n",
"2 177 Giardia intestinalis, Trypanosoma cruzi, Equus... \n",
"3 45 Homo sapiens, None \n",
"4 17 Homo sapiens, None \n",
"\n",
" # Known Targets Known Targets \\\n",
"0 695 CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... \n",
"1 695 CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... \n",
"2 177 CHEMBL388, CHEMBL614151, CHEMBL3577, CHEMBL551... \n",
"3 45 CHEMBL392, CHEMBL3192, CHEMBL3524, CHEMBL5103,... \n",
"4 17 CHEMBL2820, CHEMBL3991, CHEMBL1801, CHEMBL204,... \n",
"\n",
" target_pref_name \\\n",
"0 AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... \n",
"1 AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... \n",
"2 HT-29, CCRF-CEM, WIL2-NS, Unchecked, Caspase-7... \n",
"3 Histone deacetylase 9, Ubiquitin-like modifier... \n",
"4 Coagulation factor X, Kallikrein 1, Coagulatio... \n",
"\n",
" smiles synthesis \n",
"0 C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[... MacrolactonDB \n",
"1 C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[... MacrolactonDB \n",
"2 Cc1c2oc3c(C)ccc(C(=O)N[C@@H]4C(=O)N[C@H](C(C)C... MacrolactonDB \n",
"3 CCCCCCCC(=O)SCC/C=C/[C@@H]1CC(=O)NCc2nc(cs2)C2... MacrolactonDB \n",
"4 Cc1cc2ccc1[C@@H](C)COC(=O)Nc1ccc(S(=O)(=O)C3CC... MacrolactonDB "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"macrolactondb_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"# Target Organisms\n",
"0 1587\n",
"1 195\n",
"2 99\n",
"3 36\n",
"4 31\n",
"5 27\n",
"6 18\n",
"7 9\n",
"8 7\n",
"9 5\n",
"15 2\n",
"695 2\n",
"10 1\n",
"177 1\n",
"45 1\n",
"17 1\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"# 统计 '# Target Organisms' 列的唯一值计数\n",
"counts = macrolactondb_df['# Target Organisms'].value_counts()\n",
"\n",
"# 打印结果(默认按计数降序排列)\n",
"print(counts)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"是否所有值都相等: True\n"
]
}
],
"source": [
"# 判断两个列是否相等返回一个布尔Series\n",
"equal_mask = macrolactondb_df[\"# Target Organisms\"] == macrolactondb_df[\"# Known Targets\"]\n",
"\n",
"# 查看是否所有行都相等\n",
"all_equal = equal_mask.all()\n",
"\n",
"# 输出是否所有值都相等\n",
"print(\"是否所有值都相等:\", all_equal)\n",
"\n",
"# 如果不全相等,打印出不相等的行\n",
"if not all_equal:\n",
" unequal_rows = macrolactondb_df[~equal_mask]\n",
" print(\"不相等的行:\")\n",
" print(unequal_rows)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Target Organisms\n",
"Homo sapiens 259\n",
"Homo sapiens, None 52\n",
"Mus musculus 19\n",
"Human immunodeficiency virus 1 7\n",
"Plasmodium falciparum 4\n",
"Mus musculus, Homo sapiens, None 3\n",
"Mus musculus, Homo sapiens 3\n",
"Staphylococcus aureus 2\n",
"Sus scrofa, Mus musculus, None, Plasmodium falciparum, Homo sapiens, Rattus norvegicus 2\n",
"None, Rattus norvegicus 2\n",
"Escherichia coli (strain K12) 2\n",
"Giardia intestinalis, Trypanosoma cruzi, Equus caballus, Bos taurus, Mus musculus, None, Plasmodium falciparum, Chlorocebus aethiops, Homo sapiens 1\n",
"Homo sapiens, None, Trypanosoma brucei brucei 1\n",
"Plasmodium falciparum NF54, Trypanosoma cruzi, Trypanosoma brucei rhodesiense 1\n",
"Homo sapiens, Equus caballus 1\n",
"Mus musculus, Homo sapiens, Rattus norvegicus 1\n",
"Plasmodium falciparum NF54, Trypanosoma cruzi, Trypanosoma brucei rhodesiense, Rattus norvegicus 1\n",
"Schistosoma mansoni, Influenza A virus 1\n",
"Homo sapiens, Gallus gallus 1\n",
"Homo sapiens, Sus scrofa 1\n",
"Bacillus subtilis 1\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"# 统计 '# Target Organisms' 列的唯一值计数\n",
"counts = macrolactondb_df['Target Organisms'].value_counts()\n",
"\n",
"# 打印结果(默认按计数降序排列)\n",
"print(counts)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>IDs</th>\n",
" <th>molecule_pref_name</th>\n",
" <th>max_pChEMBL</th>\n",
" <th>max_pChEMBL_target</th>\n",
" <th># Target Organisms</th>\n",
" <th>Target Organisms</th>\n",
" <th># Known Targets</th>\n",
" <th>Known Targets</th>\n",
" <th>target_pref_name</th>\n",
" <th>smiles</th>\n",
" <th>synthesis</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>284</th>\n",
" <td>CHEMBL3342324</td>\n",
" <td>NaN</td>\n",
" <td>6.52</td>\n",
" <td>CHEMBL3341578</td>\n",
" <td>1</td>\n",
" <td>Escherichia coli (strain K12)</td>\n",
" <td>1</td>\n",
" <td>CHEMBL3341578</td>\n",
" <td>ATP-dependent Clp protease proteolytic subunit</td>\n",
" <td>C/C=C/C=C/C=C/C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@...</td>\n",
" <td>MacrolactonDB</td>\n",
" </tr>\n",
" <tr>\n",
" <th>338</th>\n",
" <td>CHEMBL3342325</td>\n",
" <td>NaN</td>\n",
" <td>6.16</td>\n",
" <td>CHEMBL3341578</td>\n",
" <td>1</td>\n",
" <td>Escherichia coli (strain K12)</td>\n",
" <td>1</td>\n",
" <td>CHEMBL3341578</td>\n",
" <td>ATP-dependent Clp protease proteolytic subunit</td>\n",
" <td>C/C=C/C=C/C=C/C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@...</td>\n",
" <td>MacrolactonDB</td>\n",
" </tr>\n",
" <tr>\n",
" <th>341</th>\n",
" <td>CHEMBL3742171</td>\n",
" <td>NaN</td>\n",
" <td>6.52</td>\n",
" <td>CHEMBL1681616</td>\n",
" <td>1</td>\n",
" <td>Staphylococcus aureus</td>\n",
" <td>1</td>\n",
" <td>CHEMBL1681616</td>\n",
" <td>Accessory gene regulator protein A</td>\n",
" <td>CC(C)[C@@H]1NC(=O)C[C@H](/C=C/c2ccccc2)OC(=O)[...</td>\n",
" <td>MacrolactonDB</td>\n",
" </tr>\n",
" <tr>\n",
" <th>350</th>\n",
" <td>CHEMBL1784527</td>\n",
" <td>TURNAGAINOLIDE A</td>\n",
" <td>6.10</td>\n",
" <td>CHEMBL1681616</td>\n",
" <td>1</td>\n",
" <td>Staphylococcus aureus</td>\n",
" <td>1</td>\n",
" <td>CHEMBL1681616</td>\n",
" <td>Accessory gene regulator protein A</td>\n",
" <td>CC[C@H](C)[C@@H]1NC(=O)[C@@H](C)NC(=O)[C@H](C(...</td>\n",
" <td>MacrolactonDB</td>\n",
" </tr>\n",
" <tr>\n",
" <th>392</th>\n",
" <td>CHEMBL1317175</td>\n",
" <td>NaN</td>\n",
" <td>5.55</td>\n",
" <td>CHEMBL1293248</td>\n",
" <td>1</td>\n",
" <td>Bacillus subtilis</td>\n",
" <td>1</td>\n",
" <td>CHEMBL1293248</td>\n",
" <td>4'-phosphopantetheinyl transferase ffp</td>\n",
" <td>CO/C1=C\\C(C)=C\\[C@@H](C)[C@@H](O)[C@@H](C)C/C(...</td>\n",
" <td>MacrolactonDB</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" IDs molecule_pref_name max_pChEMBL max_pChEMBL_target \\\n",
"284 CHEMBL3342324 NaN 6.52 CHEMBL3341578 \n",
"338 CHEMBL3342325 NaN 6.16 CHEMBL3341578 \n",
"341 CHEMBL3742171 NaN 6.52 CHEMBL1681616 \n",
"350 CHEMBL1784527 TURNAGAINOLIDE A 6.10 CHEMBL1681616 \n",
"392 CHEMBL1317175 NaN 5.55 CHEMBL1293248 \n",
"\n",
" # Target Organisms Target Organisms # Known Targets \\\n",
"284 1 Escherichia coli (strain K12) 1 \n",
"338 1 Escherichia coli (strain K12) 1 \n",
"341 1 Staphylococcus aureus 1 \n",
"350 1 Staphylococcus aureus 1 \n",
"392 1 Bacillus subtilis 1 \n",
"\n",
" Known Targets target_pref_name \\\n",
"284 CHEMBL3341578 ATP-dependent Clp protease proteolytic subunit \n",
"338 CHEMBL3341578 ATP-dependent Clp protease proteolytic subunit \n",
"341 CHEMBL1681616 Accessory gene regulator protein A \n",
"350 CHEMBL1681616 Accessory gene regulator protein A \n",
"392 CHEMBL1293248 4'-phosphopantetheinyl transferase ffp \n",
"\n",
" smiles synthesis \n",
"284 C/C=C/C=C/C=C/C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@... MacrolactonDB \n",
"338 C/C=C/C=C/C=C/C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@... MacrolactonDB \n",
"341 CC(C)[C@@H]1NC(=O)C[C@H](/C=C/c2ccccc2)OC(=O)[... MacrolactonDB \n",
"350 CC[C@H](C)[C@@H]1NC(=O)[C@@H](C)NC(=O)[C@H](C(... MacrolactonDB \n",
"392 CO/C1=C\\C(C)=C\\[C@@H](C)[C@@H](O)[C@@H](C)C/C(... MacrolactonDB "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 保留抗菌的数据\n",
"\n",
"# 定义包含细菌靶点名称的列表\n",
"bacteria = ['Staphylococcus aureus', 'Escherichia coli', 'Bacillus subtilis']\n",
"\n",
"# 构造正则表达式,用 | 分隔多个目标\n",
"pattern = '|'.join(bacteria)\n",
"\n",
"# 筛选出 'Target Organisms' 列中包含任意细菌名称的行\n",
"macrolide_16_antibacterial_df = macrolactondb_df[macrolactondb_df['Target Organisms'].str.contains(pattern, case=False, na=False)]\n",
"\n",
"# 打印筛选后的数据\n",
"macrolide_16_antibacterial_df\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 695\n",
"1 695\n",
"2 177\n",
"3 45\n",
"4 17\n",
" ... \n",
"2017 0\n",
"2018 0\n",
"2019 0\n",
"2020 0\n",
"2021 0\n",
"Name: # Target Organisms, Length: 2022, dtype: int64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"macrolactondb_df['# Target Organisms']"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 45, 15, 177, 17, 695}\n"
]
}
],
"source": [
"print(set(macrolactondb_df['# Target Organisms'].to_list()))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"16"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(set(macrolactondb_df['# Target Organisms'].to_list()))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# macrolactondb_df[\"synthesis\"] = macrolactondb_df[\"# Target Organisms\"]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>original_smiles</th>\n",
" <th>fixed_smiles</th>\n",
" <th>status</th>\n",
" <th>message</th>\n",
" <th>synthesis</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...</td>\n",
" <td>CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...</td>\n",
" <td>corrected</td>\n",
" <td>修复原子 28</td>\n",
" <td>SIME</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...</td>\n",
" <td>CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...</td>\n",
" <td>corrected</td>\n",
" <td>修复原子 28</td>\n",
" <td>SIME</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...</td>\n",
" <td>CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...</td>\n",
" <td>corrected</td>\n",
" <td>修复原子 28</td>\n",
" <td>SIME</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...</td>\n",
" <td>CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...</td>\n",
" <td>corrected</td>\n",
" <td>修复原子 28</td>\n",
" <td>SIME</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...</td>\n",
" <td>CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...</td>\n",
" <td>corrected</td>\n",
" <td>修复原子 28</td>\n",
" <td>SIME</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" original_smiles \\\n",
"0 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n",
"1 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n",
"2 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n",
"3 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n",
"4 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n",
"\n",
" fixed_smiles status message \\\n",
"0 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n",
"1 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n",
"2 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n",
"3 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n",
"4 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n",
"\n",
" synthesis \n",
"0 SIME \n",
"1 SIME \n",
"2 SIME \n",
"3 SIME \n",
"4 SIME "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"synthesis_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"synthesis_df = synthesis_df.rename(columns={'fixed_smiles': 'smiles'})"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>original_smiles</th>\n",
" <th>smiles</th>\n",
" <th>status</th>\n",
" <th>message</th>\n",
" <th>synthesis</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...</td>\n",
" <td>CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...</td>\n",
" <td>corrected</td>\n",
" <td>修复原子 28</td>\n",
" <td>SIME</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...</td>\n",
" <td>CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...</td>\n",
" <td>corrected</td>\n",
" <td>修复原子 28</td>\n",
" <td>SIME</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...</td>\n",
" <td>CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...</td>\n",
" <td>corrected</td>\n",
" <td>修复原子 28</td>\n",
" <td>SIME</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...</td>\n",
" <td>CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...</td>\n",
" <td>corrected</td>\n",
" <td>修复原子 28</td>\n",
" <td>SIME</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...</td>\n",
" <td>CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...</td>\n",
" <td>corrected</td>\n",
" <td>修复原子 28</td>\n",
" <td>SIME</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" original_smiles \\\n",
"0 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n",
"1 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n",
"2 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n",
"3 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n",
"4 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n",
"\n",
" smiles status message \\\n",
"0 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n",
"1 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n",
"2 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n",
"3 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n",
"4 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n",
"\n",
" synthesis \n",
"0 SIME \n",
"1 SIME \n",
"2 SIME \n",
"3 SIME \n",
"4 SIME "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"synthesis_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"merge_df = pd.concat([macrolactondb_df, synthesis_df], ignore_index=True, axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"252023"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(merge_df)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 12-20 环大环内酯分析\n",
"\n",
"每一种环的抗菌药物数量,看下是哪一种环的抗菌药物最多。\n",
"\n",
"整体12-20环大环内酯的抗菌药物有多少。"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"总计数据11036\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"# 读取数据和统计\n",
"macrolactondb_df_ring12_20 = pd.read_csv('../../data/MacrolactoneDB/ring12_20/temp.csv')\n",
"counts = macrolactondb_df_ring12_20['Target Organisms'].value_counts()\n",
"\n",
"# 将 counts Series 转换成字符串并写入 txt 文件\n",
"with open('../../data/MacrolactoneDB/ring12_20/counts.txt', 'w') as file:\n",
" file.write(counts.to_string())\n",
"\n",
"\n",
"print(f\"总计数据:{len(macrolactondb_df_ring12_20)}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>IDs</th>\n",
" <th>molecule_pref_name</th>\n",
" <th>max_pChEMBL</th>\n",
" <th>max_pChEMBL_target</th>\n",
" <th># Target Organisms</th>\n",
" <th>Target Organisms</th>\n",
" <th># Known Targets</th>\n",
" <th>Known Targets</th>\n",
" <th>target_pref_name</th>\n",
" <th>smiles</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>CHEMBL94657</td>\n",
" <td>PATUPILONE</td>\n",
" <td>10.67</td>\n",
" <td>CHEMBL1075590</td>\n",
" <td>695</td>\n",
" <td>Sus scrofa, Mus musculus, None, Plasmodium fal...</td>\n",
" <td>695</td>\n",
" <td>CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE...</td>\n",
" <td>AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6...</td>\n",
" <td>C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>CHEMBL94657</td>\n",
" <td>PATUPILONE</td>\n",
" <td>10.67</td>\n",
" <td>CHEMBL1075590</td>\n",
" <td>695</td>\n",
" <td>Sus scrofa, Mus musculus, None, Plasmodium fal...</td>\n",
" <td>695</td>\n",
" <td>CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE...</td>\n",
" <td>AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6...</td>\n",
" <td>C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>CHEMBL449158</td>\n",
" <td>BRYOSTATIN</td>\n",
" <td>9.36</td>\n",
" <td>CHEMBL2996</td>\n",
" <td>664</td>\n",
" <td>Homo sapiens, None, Rattus norvegicus</td>\n",
" <td>664</td>\n",
" <td>CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE...</td>\n",
" <td>AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6...</td>\n",
" <td>CCC/C=C/C=C/C(=O)O[C@H]1/C(=C/C(=O)OC)C[C@H]2C...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>CHEMBL1554</td>\n",
" <td>DACTINOMYCIN</td>\n",
" <td>10.10</td>\n",
" <td>CHEMBL614533</td>\n",
" <td>177</td>\n",
" <td>Giardia intestinalis, Trypanosoma cruzi, Equus...</td>\n",
" <td>177</td>\n",
" <td>CHEMBL388, CHEMBL614151, CHEMBL3577, CHEMBL551...</td>\n",
" <td>HT-29, CCRF-CEM, WIL2-NS, Unchecked, Caspase-7...</td>\n",
" <td>Cc1c2oc3c(C)ccc(C(=O)N[C@@H]4C(=O)N[C@H](C(C)C...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>CHEMBL529</td>\n",
" <td>AZITHROMYCIN</td>\n",
" <td>8.59</td>\n",
" <td>CHEMBL3301413</td>\n",
" <td>70</td>\n",
" <td>None, Plasmodium falciparum, Escherichia coli,...</td>\n",
" <td>70</td>\n",
" <td>CHEMBL347, CHEMBL612313, CHEMBL354, CHEMBL3301...</td>\n",
" <td>Streptococcus, Unchecked, Cytochrome P450 3A4,...</td>\n",
" <td>CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" IDs molecule_pref_name max_pChEMBL max_pChEMBL_target \\\n",
"0 CHEMBL94657 PATUPILONE 10.67 CHEMBL1075590 \n",
"1 CHEMBL94657 PATUPILONE 10.67 CHEMBL1075590 \n",
"2 CHEMBL449158 BRYOSTATIN 9.36 CHEMBL2996 \n",
"3 CHEMBL1554 DACTINOMYCIN 10.10 CHEMBL614533 \n",
"4 CHEMBL529 AZITHROMYCIN 8.59 CHEMBL3301413 \n",
"\n",
" # Target Organisms Target Organisms \\\n",
"0 695 Sus scrofa, Mus musculus, None, Plasmodium fal... \n",
"1 695 Sus scrofa, Mus musculus, None, Plasmodium fal... \n",
"2 664 Homo sapiens, None, Rattus norvegicus \n",
"3 177 Giardia intestinalis, Trypanosoma cruzi, Equus... \n",
"4 70 None, Plasmodium falciparum, Escherichia coli,... \n",
"\n",
" # Known Targets Known Targets \\\n",
"0 695 CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... \n",
"1 695 CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... \n",
"2 664 CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... \n",
"3 177 CHEMBL388, CHEMBL614151, CHEMBL3577, CHEMBL551... \n",
"4 70 CHEMBL347, CHEMBL612313, CHEMBL354, CHEMBL3301... \n",
"\n",
" target_pref_name \\\n",
"0 AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... \n",
"1 AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... \n",
"2 AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... \n",
"3 HT-29, CCRF-CEM, WIL2-NS, Unchecked, Caspase-7... \n",
"4 Streptococcus, Unchecked, Cytochrome P450 3A4,... \n",
"\n",
" smiles \n",
"0 C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[... \n",
"1 C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[... \n",
"2 CCC/C=C/C=C/C(=O)O[C@H]1/C(=C/C(=O)OC)C[C@H]2C... \n",
"3 Cc1c2oc3c(C)ccc(C(=O)N[C@@H]4C(=O)N[C@H](C(C)C... \n",
"4 CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(... "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"macrolactondb_df_ring12_20.head()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Target Organisms\n",
"Homo sapiens 815\n",
"Homo sapiens, None 180\n",
"Plasmodium falciparum 161\n",
"Hepatitis C virus, None 112\n",
"Homo sapiens, Plasmodium falciparum 63\n",
" ... \n",
"Giardia intestinalis, Schistosoma mansoni, Mus musculus, None, Homo sapiens, Saccharomyces cerevisiae 1\n",
"Trypanosoma cruzi 1\n",
"Influenza A virus 1\n",
"Escherichia coli K-12 1\n",
"Human herpesvirus 4 (strain B95-8) 1\n",
"Name: count, Length: 88, dtype: int64"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"macrolactondb_df_ring12_20['Target Organisms'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"synthesis_df.to_parquet(\"../../data/Macro16_SIME_Synthesis/synthesis_with_sa.parquet\") # 比CSV更快"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## chempot 绘制\n",
"\n",
"参考 utils/chem_viz.py"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"merge_df[[\"synthesis\", \"smiles\"]].to_csv('../../data/tSNE/merge.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"“RDKit设计的分子和天然分子在结构上有何不同” → 用 tailored。\n",
"\n",
"“RDKit设计的分子是否覆盖了天然分子的化学空间” → 用 structural。"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 合成分子减少为1w进行可视化\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "synthesis_df 中 synthesis=1 的样本少于10,000个。",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[16], line 13\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;66;03m# 3. 随机抽取1万个样本\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(synthesis_1_df) \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m10000\u001b[39m:\n\u001b[0;32m---> 13\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msynthesis_df 中 synthesis=1 的样本少于10,000个。\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 15\u001b[0m sampled_synthesis_1_df \u001b[38;5;241m=\u001b[39m synthesis_1_df\u001b[38;5;241m.\u001b[39msample(n\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m10000\u001b[39m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m42\u001b[39m)\u001b[38;5;241m.\u001b[39mreset_index(drop\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 17\u001b[0m \u001b[38;5;66;03m# 4. 合并抽取的样本与 macrolactondb_df\u001b[39;00m\n",
"\u001b[0;31mValueError\u001b[0m: synthesis_df 中 synthesis=1 的样本少于10,000个。"
]
}
],
"source": [
"import pandas as pd\n",
"from chemplot import Plotter\n",
"import numpy as np\n",
"\n",
"# 1. 合并数据框\n",
"merge_df = pd.concat([macrolactondb_df, synthesis_df], ignore_index=True, axis=0)\n",
"\n",
"# 2. 过滤出 synthesis=1 的样本\n",
"synthesis_1_df = synthesis_df[synthesis_df['synthesis'] == 1]\n",
"\n",
"# 3. 随机抽取1万个样本\n",
"if len(synthesis_1_df) < 10000:\n",
" raise ValueError(\"synthesis_df 中 synthesis=1 的样本少于10,000个。\")\n",
"\n",
"sampled_synthesis_1_df = synthesis_1_df.sample(n=10000, random_state=42).reset_index(drop=True)\n",
"\n",
"# 4. 合并抽取的样本与 macrolactondb_df\n",
"final_df = pd.concat([macrolactondb_df, sampled_synthesis_1_df], ignore_index=True, axis=0).reset_index(drop=True)\n",
"\n",
"# 5. 使用 ChemPlot 进行可视化\n",
"cp = Plotter.from_smiles(\n",
" final_df[\"smiles\"],\n",
" target=final_df[\"synthesis\"],\n",
" target_type=\"C\",\n",
" sim_type=\"structural\"\n",
")\n",
"\n",
"# 进行 t-SNE 降维\n",
"cp.tsne()\n",
"\n",
"# 可视化结果\n",
"cp.visualize_plot(kind=\"scatter\", size=20, remove_outliers=False, is_colored=True, colorbar=False)\n",
"\n",
"# 如果需要保存图像,可以取消下一行的注释\n",
"# cp.visualize_plot(kind=\"scatter\", size=20, remove_outliers=False, is_colored=True, colorbar=False, filename=\"scatter_plot.png\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "qsarml",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 2
}