1145 lines
44 KiB
Plaintext
1145 lines
44 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from pathlib import Path\n",
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"data_file = Path('../../data/Macro16_SIME_Synthesis/fixed_macrolides_2025.csv')\n",
|
||
"macrolactondb_file = Path('../../data/MacrolactoneDB/ring16/temp.csv')\n",
|
||
"synthesis_df = pd.read_csv(data_file)\n",
|
||
"synthesis_df['synthesis'] = 'SIME'\n",
|
||
"macrolactondb_df = pd.read_csv(macrolactondb_file)\n",
|
||
"macrolactondb_df['synthesis'] = 'MacrolactonDB'"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>IDs</th>\n",
|
||
" <th>molecule_pref_name</th>\n",
|
||
" <th>max_pChEMBL</th>\n",
|
||
" <th>max_pChEMBL_target</th>\n",
|
||
" <th># Target Organisms</th>\n",
|
||
" <th>Target Organisms</th>\n",
|
||
" <th># Known Targets</th>\n",
|
||
" <th>Known Targets</th>\n",
|
||
" <th>target_pref_name</th>\n",
|
||
" <th>smiles</th>\n",
|
||
" <th>synthesis</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>CHEMBL94657</td>\n",
|
||
" <td>PATUPILONE</td>\n",
|
||
" <td>10.67</td>\n",
|
||
" <td>CHEMBL1075590</td>\n",
|
||
" <td>695</td>\n",
|
||
" <td>Sus scrofa, Mus musculus, None, Plasmodium fal...</td>\n",
|
||
" <td>695</td>\n",
|
||
" <td>CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE...</td>\n",
|
||
" <td>AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6...</td>\n",
|
||
" <td>C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[...</td>\n",
|
||
" <td>MacrolactonDB</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>CHEMBL94657</td>\n",
|
||
" <td>PATUPILONE</td>\n",
|
||
" <td>10.67</td>\n",
|
||
" <td>CHEMBL1075590</td>\n",
|
||
" <td>695</td>\n",
|
||
" <td>Sus scrofa, Mus musculus, None, Plasmodium fal...</td>\n",
|
||
" <td>695</td>\n",
|
||
" <td>CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE...</td>\n",
|
||
" <td>AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6...</td>\n",
|
||
" <td>C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[...</td>\n",
|
||
" <td>MacrolactonDB</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>CHEMBL1554</td>\n",
|
||
" <td>DACTINOMYCIN</td>\n",
|
||
" <td>10.10</td>\n",
|
||
" <td>CHEMBL614533</td>\n",
|
||
" <td>177</td>\n",
|
||
" <td>Giardia intestinalis, Trypanosoma cruzi, Equus...</td>\n",
|
||
" <td>177</td>\n",
|
||
" <td>CHEMBL388, CHEMBL614151, CHEMBL3577, CHEMBL551...</td>\n",
|
||
" <td>HT-29, CCRF-CEM, WIL2-NS, Unchecked, Caspase-7...</td>\n",
|
||
" <td>Cc1c2oc3c(C)ccc(C(=O)N[C@@H]4C(=O)N[C@H](C(C)C...</td>\n",
|
||
" <td>MacrolactonDB</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>CHEMBL1173445</td>\n",
|
||
" <td>LARGAZOLE</td>\n",
|
||
" <td>8.80</td>\n",
|
||
" <td>CHEMBL612545</td>\n",
|
||
" <td>45</td>\n",
|
||
" <td>Homo sapiens, None</td>\n",
|
||
" <td>45</td>\n",
|
||
" <td>CHEMBL392, CHEMBL3192, CHEMBL3524, CHEMBL5103,...</td>\n",
|
||
" <td>Histone deacetylase 9, Ubiquitin-like modifier...</td>\n",
|
||
" <td>CCCCCCCC(=O)SCC/C=C/[C@@H]1CC(=O)NCc2nc(cs2)C2...</td>\n",
|
||
" <td>MacrolactonDB</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>CHEMBL3902498</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>9.37</td>\n",
|
||
" <td>CHEMBL2095194,CHEMBL3991</td>\n",
|
||
" <td>17</td>\n",
|
||
" <td>Homo sapiens, None</td>\n",
|
||
" <td>17</td>\n",
|
||
" <td>CHEMBL2820, CHEMBL3991, CHEMBL1801, CHEMBL204,...</td>\n",
|
||
" <td>Coagulation factor X, Kallikrein 1, Coagulatio...</td>\n",
|
||
" <td>Cc1cc2ccc1[C@@H](C)COC(=O)Nc1ccc(S(=O)(=O)C3CC...</td>\n",
|
||
" <td>MacrolactonDB</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" IDs molecule_pref_name max_pChEMBL max_pChEMBL_target \\\n",
|
||
"0 CHEMBL94657 PATUPILONE 10.67 CHEMBL1075590 \n",
|
||
"1 CHEMBL94657 PATUPILONE 10.67 CHEMBL1075590 \n",
|
||
"2 CHEMBL1554 DACTINOMYCIN 10.10 CHEMBL614533 \n",
|
||
"3 CHEMBL1173445 LARGAZOLE 8.80 CHEMBL612545 \n",
|
||
"4 CHEMBL3902498 NaN 9.37 CHEMBL2095194,CHEMBL3991 \n",
|
||
"\n",
|
||
" # Target Organisms Target Organisms \\\n",
|
||
"0 695 Sus scrofa, Mus musculus, None, Plasmodium fal... \n",
|
||
"1 695 Sus scrofa, Mus musculus, None, Plasmodium fal... \n",
|
||
"2 177 Giardia intestinalis, Trypanosoma cruzi, Equus... \n",
|
||
"3 45 Homo sapiens, None \n",
|
||
"4 17 Homo sapiens, None \n",
|
||
"\n",
|
||
" # Known Targets Known Targets \\\n",
|
||
"0 695 CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... \n",
|
||
"1 695 CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... \n",
|
||
"2 177 CHEMBL388, CHEMBL614151, CHEMBL3577, CHEMBL551... \n",
|
||
"3 45 CHEMBL392, CHEMBL3192, CHEMBL3524, CHEMBL5103,... \n",
|
||
"4 17 CHEMBL2820, CHEMBL3991, CHEMBL1801, CHEMBL204,... \n",
|
||
"\n",
|
||
" target_pref_name \\\n",
|
||
"0 AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... \n",
|
||
"1 AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... \n",
|
||
"2 HT-29, CCRF-CEM, WIL2-NS, Unchecked, Caspase-7... \n",
|
||
"3 Histone deacetylase 9, Ubiquitin-like modifier... \n",
|
||
"4 Coagulation factor X, Kallikrein 1, Coagulatio... \n",
|
||
"\n",
|
||
" smiles synthesis \n",
|
||
"0 C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[... MacrolactonDB \n",
|
||
"1 C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[... MacrolactonDB \n",
|
||
"2 Cc1c2oc3c(C)ccc(C(=O)N[C@@H]4C(=O)N[C@H](C(C)C... MacrolactonDB \n",
|
||
"3 CCCCCCCC(=O)SCC/C=C/[C@@H]1CC(=O)NCc2nc(cs2)C2... MacrolactonDB \n",
|
||
"4 Cc1cc2ccc1[C@@H](C)COC(=O)Nc1ccc(S(=O)(=O)C3CC... MacrolactonDB "
|
||
]
|
||
},
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"macrolactondb_df.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"# Target Organisms\n",
|
||
"0 1587\n",
|
||
"1 195\n",
|
||
"2 99\n",
|
||
"3 36\n",
|
||
"4 31\n",
|
||
"5 27\n",
|
||
"6 18\n",
|
||
"7 9\n",
|
||
"8 7\n",
|
||
"9 5\n",
|
||
"15 2\n",
|
||
"695 2\n",
|
||
"10 1\n",
|
||
"177 1\n",
|
||
"45 1\n",
|
||
"17 1\n",
|
||
"Name: count, dtype: int64\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# 统计 '# Target Organisms' 列的唯一值计数\n",
|
||
"counts = macrolactondb_df['# Target Organisms'].value_counts()\n",
|
||
"\n",
|
||
"# 打印结果(默认按计数降序排列)\n",
|
||
"print(counts)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"是否所有值都相等: True\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# 判断两个列是否相等,返回一个布尔Series\n",
|
||
"equal_mask = macrolactondb_df[\"# Target Organisms\"] == macrolactondb_df[\"# Known Targets\"]\n",
|
||
"\n",
|
||
"# 查看是否所有行都相等\n",
|
||
"all_equal = equal_mask.all()\n",
|
||
"\n",
|
||
"# 输出是否所有值都相等\n",
|
||
"print(\"是否所有值都相等:\", all_equal)\n",
|
||
"\n",
|
||
"# 如果不全相等,打印出不相等的行\n",
|
||
"if not all_equal:\n",
|
||
" unequal_rows = macrolactondb_df[~equal_mask]\n",
|
||
" print(\"不相等的行:\")\n",
|
||
" print(unequal_rows)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Target Organisms\n",
|
||
"Homo sapiens 259\n",
|
||
"Homo sapiens, None 52\n",
|
||
"Mus musculus 19\n",
|
||
"Human immunodeficiency virus 1 7\n",
|
||
"Plasmodium falciparum 4\n",
|
||
"Mus musculus, Homo sapiens, None 3\n",
|
||
"Mus musculus, Homo sapiens 3\n",
|
||
"Staphylococcus aureus 2\n",
|
||
"Sus scrofa, Mus musculus, None, Plasmodium falciparum, Homo sapiens, Rattus norvegicus 2\n",
|
||
"None, Rattus norvegicus 2\n",
|
||
"Escherichia coli (strain K12) 2\n",
|
||
"Giardia intestinalis, Trypanosoma cruzi, Equus caballus, Bos taurus, Mus musculus, None, Plasmodium falciparum, Chlorocebus aethiops, Homo sapiens 1\n",
|
||
"Homo sapiens, None, Trypanosoma brucei brucei 1\n",
|
||
"Plasmodium falciparum NF54, Trypanosoma cruzi, Trypanosoma brucei rhodesiense 1\n",
|
||
"Homo sapiens, Equus caballus 1\n",
|
||
"Mus musculus, Homo sapiens, Rattus norvegicus 1\n",
|
||
"Plasmodium falciparum NF54, Trypanosoma cruzi, Trypanosoma brucei rhodesiense, Rattus norvegicus 1\n",
|
||
"Schistosoma mansoni, Influenza A virus 1\n",
|
||
"Homo sapiens, Gallus gallus 1\n",
|
||
"Homo sapiens, Sus scrofa 1\n",
|
||
"Bacillus subtilis 1\n",
|
||
"Name: count, dtype: int64\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# 统计 '# Target Organisms' 列的唯一值计数\n",
|
||
"counts = macrolactondb_df['Target Organisms'].value_counts()\n",
|
||
"\n",
|
||
"# 打印结果(默认按计数降序排列)\n",
|
||
"print(counts)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>IDs</th>\n",
|
||
" <th>molecule_pref_name</th>\n",
|
||
" <th>max_pChEMBL</th>\n",
|
||
" <th>max_pChEMBL_target</th>\n",
|
||
" <th># Target Organisms</th>\n",
|
||
" <th>Target Organisms</th>\n",
|
||
" <th># Known Targets</th>\n",
|
||
" <th>Known Targets</th>\n",
|
||
" <th>target_pref_name</th>\n",
|
||
" <th>smiles</th>\n",
|
||
" <th>synthesis</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>284</th>\n",
|
||
" <td>CHEMBL3342324</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>6.52</td>\n",
|
||
" <td>CHEMBL3341578</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Escherichia coli (strain K12)</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>CHEMBL3341578</td>\n",
|
||
" <td>ATP-dependent Clp protease proteolytic subunit</td>\n",
|
||
" <td>C/C=C/C=C/C=C/C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@...</td>\n",
|
||
" <td>MacrolactonDB</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>338</th>\n",
|
||
" <td>CHEMBL3342325</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>6.16</td>\n",
|
||
" <td>CHEMBL3341578</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Escherichia coli (strain K12)</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>CHEMBL3341578</td>\n",
|
||
" <td>ATP-dependent Clp protease proteolytic subunit</td>\n",
|
||
" <td>C/C=C/C=C/C=C/C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@...</td>\n",
|
||
" <td>MacrolactonDB</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>341</th>\n",
|
||
" <td>CHEMBL3742171</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>6.52</td>\n",
|
||
" <td>CHEMBL1681616</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Staphylococcus aureus</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>CHEMBL1681616</td>\n",
|
||
" <td>Accessory gene regulator protein A</td>\n",
|
||
" <td>CC(C)[C@@H]1NC(=O)C[C@H](/C=C/c2ccccc2)OC(=O)[...</td>\n",
|
||
" <td>MacrolactonDB</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>350</th>\n",
|
||
" <td>CHEMBL1784527</td>\n",
|
||
" <td>TURNAGAINOLIDE A</td>\n",
|
||
" <td>6.10</td>\n",
|
||
" <td>CHEMBL1681616</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Staphylococcus aureus</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>CHEMBL1681616</td>\n",
|
||
" <td>Accessory gene regulator protein A</td>\n",
|
||
" <td>CC[C@H](C)[C@@H]1NC(=O)[C@@H](C)NC(=O)[C@H](C(...</td>\n",
|
||
" <td>MacrolactonDB</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>392</th>\n",
|
||
" <td>CHEMBL1317175</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>5.55</td>\n",
|
||
" <td>CHEMBL1293248</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Bacillus subtilis</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>CHEMBL1293248</td>\n",
|
||
" <td>4'-phosphopantetheinyl transferase ffp</td>\n",
|
||
" <td>CO/C1=C\\C(C)=C\\[C@@H](C)[C@@H](O)[C@@H](C)C/C(...</td>\n",
|
||
" <td>MacrolactonDB</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" IDs molecule_pref_name max_pChEMBL max_pChEMBL_target \\\n",
|
||
"284 CHEMBL3342324 NaN 6.52 CHEMBL3341578 \n",
|
||
"338 CHEMBL3342325 NaN 6.16 CHEMBL3341578 \n",
|
||
"341 CHEMBL3742171 NaN 6.52 CHEMBL1681616 \n",
|
||
"350 CHEMBL1784527 TURNAGAINOLIDE A 6.10 CHEMBL1681616 \n",
|
||
"392 CHEMBL1317175 NaN 5.55 CHEMBL1293248 \n",
|
||
"\n",
|
||
" # Target Organisms Target Organisms # Known Targets \\\n",
|
||
"284 1 Escherichia coli (strain K12) 1 \n",
|
||
"338 1 Escherichia coli (strain K12) 1 \n",
|
||
"341 1 Staphylococcus aureus 1 \n",
|
||
"350 1 Staphylococcus aureus 1 \n",
|
||
"392 1 Bacillus subtilis 1 \n",
|
||
"\n",
|
||
" Known Targets target_pref_name \\\n",
|
||
"284 CHEMBL3341578 ATP-dependent Clp protease proteolytic subunit \n",
|
||
"338 CHEMBL3341578 ATP-dependent Clp protease proteolytic subunit \n",
|
||
"341 CHEMBL1681616 Accessory gene regulator protein A \n",
|
||
"350 CHEMBL1681616 Accessory gene regulator protein A \n",
|
||
"392 CHEMBL1293248 4'-phosphopantetheinyl transferase ffp \n",
|
||
"\n",
|
||
" smiles synthesis \n",
|
||
"284 C/C=C/C=C/C=C/C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@... MacrolactonDB \n",
|
||
"338 C/C=C/C=C/C=C/C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@... MacrolactonDB \n",
|
||
"341 CC(C)[C@@H]1NC(=O)C[C@H](/C=C/c2ccccc2)OC(=O)[... MacrolactonDB \n",
|
||
"350 CC[C@H](C)[C@@H]1NC(=O)[C@@H](C)NC(=O)[C@H](C(... MacrolactonDB \n",
|
||
"392 CO/C1=C\\C(C)=C\\[C@@H](C)[C@@H](O)[C@@H](C)C/C(... MacrolactonDB "
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# 保留抗菌的数据\n",
|
||
"\n",
|
||
"# 定义包含细菌靶点名称的列表\n",
|
||
"bacteria = ['Staphylococcus aureus', 'Escherichia coli', 'Bacillus subtilis']\n",
|
||
"\n",
|
||
"# 构造正则表达式,用 | 分隔多个目标\n",
|
||
"pattern = '|'.join(bacteria)\n",
|
||
"\n",
|
||
"# 筛选出 'Target Organisms' 列中包含任意细菌名称的行\n",
|
||
"macrolide_16_antibacterial_df = macrolactondb_df[macrolactondb_df['Target Organisms'].str.contains(pattern, case=False, na=False)]\n",
|
||
"\n",
|
||
"# 打印筛选后的数据\n",
|
||
"macrolide_16_antibacterial_df\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"0 695\n",
|
||
"1 695\n",
|
||
"2 177\n",
|
||
"3 45\n",
|
||
"4 17\n",
|
||
" ... \n",
|
||
"2017 0\n",
|
||
"2018 0\n",
|
||
"2019 0\n",
|
||
"2020 0\n",
|
||
"2021 0\n",
|
||
"Name: # Target Organisms, Length: 2022, dtype: int64"
|
||
]
|
||
},
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"macrolactondb_df['# Target Organisms']"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 45, 15, 177, 17, 695}\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"print(set(macrolactondb_df['# Target Organisms'].to_list()))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"16"
|
||
]
|
||
},
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"len(set(macrolactondb_df['# Target Organisms'].to_list()))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# macrolactondb_df[\"synthesis\"] = macrolactondb_df[\"# Target Organisms\"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>original_smiles</th>\n",
|
||
" <th>fixed_smiles</th>\n",
|
||
" <th>status</th>\n",
|
||
" <th>message</th>\n",
|
||
" <th>synthesis</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...</td>\n",
|
||
" <td>CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...</td>\n",
|
||
" <td>corrected</td>\n",
|
||
" <td>修复原子 28</td>\n",
|
||
" <td>SIME</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...</td>\n",
|
||
" <td>CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...</td>\n",
|
||
" <td>corrected</td>\n",
|
||
" <td>修复原子 28</td>\n",
|
||
" <td>SIME</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...</td>\n",
|
||
" <td>CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...</td>\n",
|
||
" <td>corrected</td>\n",
|
||
" <td>修复原子 28</td>\n",
|
||
" <td>SIME</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...</td>\n",
|
||
" <td>CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...</td>\n",
|
||
" <td>corrected</td>\n",
|
||
" <td>修复原子 28</td>\n",
|
||
" <td>SIME</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...</td>\n",
|
||
" <td>CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...</td>\n",
|
||
" <td>corrected</td>\n",
|
||
" <td>修复原子 28</td>\n",
|
||
" <td>SIME</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" original_smiles \\\n",
|
||
"0 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n",
|
||
"1 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n",
|
||
"2 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n",
|
||
"3 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n",
|
||
"4 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n",
|
||
"\n",
|
||
" fixed_smiles status message \\\n",
|
||
"0 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n",
|
||
"1 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n",
|
||
"2 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n",
|
||
"3 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n",
|
||
"4 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n",
|
||
"\n",
|
||
" synthesis \n",
|
||
"0 SIME \n",
|
||
"1 SIME \n",
|
||
"2 SIME \n",
|
||
"3 SIME \n",
|
||
"4 SIME "
|
||
]
|
||
},
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"synthesis_df.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"synthesis_df = synthesis_df.rename(columns={'fixed_smiles': 'smiles'})"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>original_smiles</th>\n",
|
||
" <th>smiles</th>\n",
|
||
" <th>status</th>\n",
|
||
" <th>message</th>\n",
|
||
" <th>synthesis</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...</td>\n",
|
||
" <td>CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...</td>\n",
|
||
" <td>corrected</td>\n",
|
||
" <td>修复原子 28</td>\n",
|
||
" <td>SIME</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...</td>\n",
|
||
" <td>CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...</td>\n",
|
||
" <td>corrected</td>\n",
|
||
" <td>修复原子 28</td>\n",
|
||
" <td>SIME</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...</td>\n",
|
||
" <td>CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...</td>\n",
|
||
" <td>corrected</td>\n",
|
||
" <td>修复原子 28</td>\n",
|
||
" <td>SIME</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...</td>\n",
|
||
" <td>CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...</td>\n",
|
||
" <td>corrected</td>\n",
|
||
" <td>修复原子 28</td>\n",
|
||
" <td>SIME</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H...</td>\n",
|
||
" <td>CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H...</td>\n",
|
||
" <td>corrected</td>\n",
|
||
" <td>修复原子 28</td>\n",
|
||
" <td>SIME</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" original_smiles \\\n",
|
||
"0 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n",
|
||
"1 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n",
|
||
"2 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n",
|
||
"3 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n",
|
||
"4 O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H... \n",
|
||
"\n",
|
||
" smiles status message \\\n",
|
||
"0 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n",
|
||
"1 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n",
|
||
"2 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n",
|
||
"3 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n",
|
||
"4 CCC1=C\\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H... corrected 修复原子 28 \n",
|
||
"\n",
|
||
" synthesis \n",
|
||
"0 SIME \n",
|
||
"1 SIME \n",
|
||
"2 SIME \n",
|
||
"3 SIME \n",
|
||
"4 SIME "
|
||
]
|
||
},
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"synthesis_df.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"merge_df = pd.concat([macrolactondb_df, synthesis_df], ignore_index=True, axis=0)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"252023"
|
||
]
|
||
},
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"len(merge_df)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 12-20 环大环内酯分析\n",
|
||
"\n",
|
||
"每一种环的抗菌药物数量,看下是哪一种环的抗菌药物最多。\n",
|
||
"\n",
|
||
"整体12-20环大环内酯的,抗菌药物有多少。"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"总计数据:11036\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"# 读取数据和统计\n",
|
||
"macrolactondb_df_ring12_20 = pd.read_csv('../../data/MacrolactoneDB/ring12_20/temp.csv')\n",
|
||
"counts = macrolactondb_df_ring12_20['Target Organisms'].value_counts()\n",
|
||
"\n",
|
||
"# 将 counts Series 转换成字符串并写入 txt 文件\n",
|
||
"with open('../../data/MacrolactoneDB/ring12_20/counts.txt', 'w') as file:\n",
|
||
" file.write(counts.to_string())\n",
|
||
"\n",
|
||
"\n",
|
||
"print(f\"总计数据:{len(macrolactondb_df_ring12_20)}\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>IDs</th>\n",
|
||
" <th>molecule_pref_name</th>\n",
|
||
" <th>max_pChEMBL</th>\n",
|
||
" <th>max_pChEMBL_target</th>\n",
|
||
" <th># Target Organisms</th>\n",
|
||
" <th>Target Organisms</th>\n",
|
||
" <th># Known Targets</th>\n",
|
||
" <th>Known Targets</th>\n",
|
||
" <th>target_pref_name</th>\n",
|
||
" <th>smiles</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>CHEMBL94657</td>\n",
|
||
" <td>PATUPILONE</td>\n",
|
||
" <td>10.67</td>\n",
|
||
" <td>CHEMBL1075590</td>\n",
|
||
" <td>695</td>\n",
|
||
" <td>Sus scrofa, Mus musculus, None, Plasmodium fal...</td>\n",
|
||
" <td>695</td>\n",
|
||
" <td>CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE...</td>\n",
|
||
" <td>AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6...</td>\n",
|
||
" <td>C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>CHEMBL94657</td>\n",
|
||
" <td>PATUPILONE</td>\n",
|
||
" <td>10.67</td>\n",
|
||
" <td>CHEMBL1075590</td>\n",
|
||
" <td>695</td>\n",
|
||
" <td>Sus scrofa, Mus musculus, None, Plasmodium fal...</td>\n",
|
||
" <td>695</td>\n",
|
||
" <td>CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE...</td>\n",
|
||
" <td>AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6...</td>\n",
|
||
" <td>C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>CHEMBL449158</td>\n",
|
||
" <td>BRYOSTATIN</td>\n",
|
||
" <td>9.36</td>\n",
|
||
" <td>CHEMBL2996</td>\n",
|
||
" <td>664</td>\n",
|
||
" <td>Homo sapiens, None, Rattus norvegicus</td>\n",
|
||
" <td>664</td>\n",
|
||
" <td>CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE...</td>\n",
|
||
" <td>AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6...</td>\n",
|
||
" <td>CCC/C=C/C=C/C(=O)O[C@H]1/C(=C/C(=O)OC)C[C@H]2C...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>CHEMBL1554</td>\n",
|
||
" <td>DACTINOMYCIN</td>\n",
|
||
" <td>10.10</td>\n",
|
||
" <td>CHEMBL614533</td>\n",
|
||
" <td>177</td>\n",
|
||
" <td>Giardia intestinalis, Trypanosoma cruzi, Equus...</td>\n",
|
||
" <td>177</td>\n",
|
||
" <td>CHEMBL388, CHEMBL614151, CHEMBL3577, CHEMBL551...</td>\n",
|
||
" <td>HT-29, CCRF-CEM, WIL2-NS, Unchecked, Caspase-7...</td>\n",
|
||
" <td>Cc1c2oc3c(C)ccc(C(=O)N[C@@H]4C(=O)N[C@H](C(C)C...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>CHEMBL529</td>\n",
|
||
" <td>AZITHROMYCIN</td>\n",
|
||
" <td>8.59</td>\n",
|
||
" <td>CHEMBL3301413</td>\n",
|
||
" <td>70</td>\n",
|
||
" <td>None, Plasmodium falciparum, Escherichia coli,...</td>\n",
|
||
" <td>70</td>\n",
|
||
" <td>CHEMBL347, CHEMBL612313, CHEMBL354, CHEMBL3301...</td>\n",
|
||
" <td>Streptococcus, Unchecked, Cytochrome P450 3A4,...</td>\n",
|
||
" <td>CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(...</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" IDs molecule_pref_name max_pChEMBL max_pChEMBL_target \\\n",
|
||
"0 CHEMBL94657 PATUPILONE 10.67 CHEMBL1075590 \n",
|
||
"1 CHEMBL94657 PATUPILONE 10.67 CHEMBL1075590 \n",
|
||
"2 CHEMBL449158 BRYOSTATIN 9.36 CHEMBL2996 \n",
|
||
"3 CHEMBL1554 DACTINOMYCIN 10.10 CHEMBL614533 \n",
|
||
"4 CHEMBL529 AZITHROMYCIN 8.59 CHEMBL3301413 \n",
|
||
"\n",
|
||
" # Target Organisms Target Organisms \\\n",
|
||
"0 695 Sus scrofa, Mus musculus, None, Plasmodium fal... \n",
|
||
"1 695 Sus scrofa, Mus musculus, None, Plasmodium fal... \n",
|
||
"2 664 Homo sapiens, None, Rattus norvegicus \n",
|
||
"3 177 Giardia intestinalis, Trypanosoma cruzi, Equus... \n",
|
||
"4 70 None, Plasmodium falciparum, Escherichia coli,... \n",
|
||
"\n",
|
||
" # Known Targets Known Targets \\\n",
|
||
"0 695 CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... \n",
|
||
"1 695 CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... \n",
|
||
"2 664 CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... \n",
|
||
"3 177 CHEMBL388, CHEMBL614151, CHEMBL3577, CHEMBL551... \n",
|
||
"4 70 CHEMBL347, CHEMBL612313, CHEMBL354, CHEMBL3301... \n",
|
||
"\n",
|
||
" target_pref_name \\\n",
|
||
"0 AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... \n",
|
||
"1 AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... \n",
|
||
"2 AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... \n",
|
||
"3 HT-29, CCRF-CEM, WIL2-NS, Unchecked, Caspase-7... \n",
|
||
"4 Streptococcus, Unchecked, Cytochrome P450 3A4,... \n",
|
||
"\n",
|
||
" smiles \n",
|
||
"0 C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[... \n",
|
||
"1 C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[... \n",
|
||
"2 CCC/C=C/C=C/C(=O)O[C@H]1/C(=C/C(=O)OC)C[C@H]2C... \n",
|
||
"3 Cc1c2oc3c(C)ccc(C(=O)N[C@@H]4C(=O)N[C@H](C(C)C... \n",
|
||
"4 CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(... "
|
||
]
|
||
},
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"macrolactondb_df_ring12_20.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Target Organisms\n",
|
||
"Homo sapiens 815\n",
|
||
"Homo sapiens, None 180\n",
|
||
"Plasmodium falciparum 161\n",
|
||
"Hepatitis C virus, None 112\n",
|
||
"Homo sapiens, Plasmodium falciparum 63\n",
|
||
" ... \n",
|
||
"Giardia intestinalis, Schistosoma mansoni, Mus musculus, None, Homo sapiens, Saccharomyces cerevisiae 1\n",
|
||
"Trypanosoma cruzi 1\n",
|
||
"Influenza A virus 1\n",
|
||
"Escherichia coli K-12 1\n",
|
||
"Human herpesvirus 4 (strain B95-8) 1\n",
|
||
"Name: count, Length: 88, dtype: int64"
|
||
]
|
||
},
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"macrolactondb_df_ring12_20['Target Organisms'].value_counts()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"synthesis_df.to_parquet(\"../../data/Macro16_SIME_Synthesis/synthesis_with_sa.parquet\") # 比CSV更快"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## chempot 绘制\n",
|
||
"\n",
|
||
"参考 utils/chem_viz.py"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 44,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"merge_df[[\"synthesis\", \"smiles\"]].to_csv('../../data/tSNE/merge.csv', index=False)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"“RDKit设计的分子和天然分子在结构上有何不同?” → 用 tailored。\n",
|
||
"\n",
|
||
"“RDKit设计的分子是否覆盖了天然分子的化学空间?” → 用 structural。"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 合成分子减少为 1 万(10,000)个进行可视化\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"ename": "ValueError",
|
||
"evalue": "synthesis_df 中 synthesis=1 的样本少于10,000个。",
|
||
"output_type": "error",
|
||
"traceback": [
|
||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
|
||
"Cell \u001b[0;32mIn[16], line 13\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;66;03m# 3. 随机抽取1万个样本\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(synthesis_1_df) \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m10000\u001b[39m:\n\u001b[0;32m---> 13\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msynthesis_df 中 synthesis=1 的样本少于10,000个。\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 15\u001b[0m sampled_synthesis_1_df \u001b[38;5;241m=\u001b[39m synthesis_1_df\u001b[38;5;241m.\u001b[39msample(n\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m10000\u001b[39m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m42\u001b[39m)\u001b[38;5;241m.\u001b[39mreset_index(drop\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 17\u001b[0m \u001b[38;5;66;03m# 4. 合并抽取的样本与 macrolactondb_df\u001b[39;00m\n",
|
||
"\u001b[0;31mValueError\u001b[0m: synthesis_df 中 synthesis=1 的样本少于10,000个。"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"from chemplot import Plotter\n",
|
||
"import numpy as np\n",
|
||
"\n",
|
||
"# 1. 合并数据框\n",
|
||
"merge_df = pd.concat([macrolactondb_df, synthesis_df], ignore_index=True, axis=0)\n",
|
||
"\n",
|
||
"# 2. 过滤出 synthesis=1 的样本\n",
|
||
"synthesis_1_df = synthesis_df[synthesis_df['synthesis'] == 1]\n",
|
||
"\n",
|
||
"# 3. 随机抽取1万个样本\n",
|
||
"if len(synthesis_1_df) < 10000:\n",
|
||
" raise ValueError(\"synthesis_df 中 synthesis=1 的样本少于10,000个。\")\n",
|
||
"\n",
|
||
"sampled_synthesis_1_df = synthesis_1_df.sample(n=10000, random_state=42).reset_index(drop=True)\n",
|
||
"\n",
|
||
"# 4. 合并抽取的样本与 macrolactondb_df\n",
|
||
"final_df = pd.concat([macrolactondb_df, sampled_synthesis_1_df], ignore_index=True, axis=0).reset_index(drop=True)\n",
|
||
"\n",
|
||
"# 5. 使用 ChemPlot 进行可视化\n",
|
||
"cp = Plotter.from_smiles(\n",
|
||
" final_df[\"smiles\"],\n",
|
||
" target=final_df[\"synthesis\"],\n",
|
||
" target_type=\"C\",\n",
|
||
" sim_type=\"structural\"\n",
|
||
")\n",
|
||
"\n",
|
||
"# 进行 t-SNE 降维\n",
|
||
"cp.tsne()\n",
|
||
"\n",
|
||
"# 可视化结果\n",
|
||
"cp.visualize_plot(kind=\"scatter\", size=20, remove_outliers=False, is_colored=True, colorbar=False)\n",
|
||
"\n",
|
||
"# 如果需要保存图像,可以取消下一行的注释\n",
|
||
"# cp.visualize_plot(kind=\"scatter\", size=20, remove_outliers=False, is_colored=True, colorbar=False, filename=\"scatter_plot.png\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "qsarml",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.10.16"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|