Files
macrolactone-toolkit/notebooks/smarts_match_visualization.ipynb
2025-11-14 20:34:58 +08:00

286 lines
12 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# SMARTS匹配检测与可视化\n",
"\n",
"本notebook用于:\n",
"1. 读取ring16/temp.csv中的smiles列\n",
"2. 对SMARTS模式进行匹配检测:`O=C1C[C@@H](O)[*:15][*:17][*:18]C[*:23]C(=O)/C=C/[*:28]=C/[*:7][*:8]O1`\n",
"3. 处理dummy原子([*:X]),尝试两种方式:\n",
"   - 不替换dummy原子\n",
"   - 将dummy原子替换为C\n",
"4. 可视化匹配的原子高亮显示\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. 导入必要的库\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"所有模块导入成功!\n"
]
}
],
"source": [
"import sys\n",
"from pathlib import Path\n",
"import re\n",
"\n",
"# Make project-local modules importable by putting the project root on sys.path.\n",
"# Path() is the current working directory, so this presumes the kernel runs from\n",
"# the notebook's own folder (one level below the project root) - confirm if moved.\n",
"notebook_dir = Path().resolve()\n",
"project_root = notebook_dir.parent\n",
"# Guard the insert so re-running this cell does not stack duplicate entries.\n",
"if str(project_root) not in sys.path:\n",
"    sys.path.insert(0, str(project_root))\n",
"\n",
"from rdkit import Chem\n",
"from rdkit.Chem import Draw\n",
"from rdkit.Chem.Draw import rdMolDraw2D\n",
"from IPython.display import SVG, display, HTML\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from collections import Counter\n",
"\n",
"print(\"所有模块导入成功!\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. 读取数据\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"数据集大小: 2022 个分子\n",
"列名: ['IDs', 'molecule_pref_name', 'max_pChEMBL', 'max_pChEMBL_target', '# Target Organisms', 'Target Organisms', '# Known Targets', 'Known Targets', 'target_pref_name', 'smiles']\n",
"\n",
"SMILES列存在共 2022 个有效SMILES\n",
"\n",
"前5个SMILES示例:\n",
"['C/C(=C\\\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[C@H](C)[C@H](O)[C@@H](C)C(=O)C(C)(C)[C@@H](O)CC(=O)O1', 'C/C(=C\\\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[C@H](C)[C@H](O)[C@@H](C)C(=O)C(C)(C)[C@@H](O)CC(=O)O1', 'Cc1c2oc3c(C)ccc(C(=O)N[C@@H]4C(=O)N[C@H](C(C)C)C(=O)N5CCC[C@H]5C(=O)N(C)CC(=O)N(C)[C@@H](C(C)C)C(=O)O[C@@H]4C)c3nc-2c(C(=O)N[C@@H]2C(=O)N[C@H](C(C)C)C(=O)N3CCC[C@H]3C(=O)N(C)CC(=O)N(C)[C@@H](C(C)C)C(=O)O[C@@H]2C)c(N)c1=O', 'CCCCCCCC(=O)SCC/C=C/[C@@H]1CC(=O)NCc2nc(cs2)C2=N[C@@](C)(CS2)C(=O)N[C@@H](C(C)C)C(=O)O1', 'Cc1cc2ccc1[C@@H](C)COC(=O)Nc1ccc(S(=O)(=O)C3CC3)c(c1)CN(C)C(=O)[C@@H]2Nc1ccc2c(N)ncc(F)c2c1']\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>IDs</th>\n",
" <th>molecule_pref_name</th>\n",
" <th>max_pChEMBL</th>\n",
" <th>max_pChEMBL_target</th>\n",
" <th># Target Organisms</th>\n",
" <th>Target Organisms</th>\n",
" <th># Known Targets</th>\n",
" <th>Known Targets</th>\n",
" <th>target_pref_name</th>\n",
" <th>smiles</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>CHEMBL94657</td>\n",
" <td>PATUPILONE</td>\n",
" <td>10.67</td>\n",
" <td>CHEMBL1075590</td>\n",
" <td>695</td>\n",
" <td>Sus scrofa, Mus musculus, None, Plasmodium fal...</td>\n",
" <td>695</td>\n",
" <td>CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE...</td>\n",
" <td>AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6...</td>\n",
" <td>C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>CHEMBL94657</td>\n",
" <td>PATUPILONE</td>\n",
" <td>10.67</td>\n",
" <td>CHEMBL1075590</td>\n",
" <td>695</td>\n",
" <td>Sus scrofa, Mus musculus, None, Plasmodium fal...</td>\n",
" <td>695</td>\n",
" <td>CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE...</td>\n",
" <td>AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6...</td>\n",
" <td>C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>CHEMBL1554</td>\n",
" <td>DACTINOMYCIN</td>\n",
" <td>10.10</td>\n",
" <td>CHEMBL614533</td>\n",
" <td>177</td>\n",
" <td>Giardia intestinalis, Trypanosoma cruzi, Equus...</td>\n",
" <td>177</td>\n",
" <td>CHEMBL388, CHEMBL614151, CHEMBL3577, CHEMBL551...</td>\n",
" <td>HT-29, CCRF-CEM, WIL2-NS, Unchecked, Caspase-7...</td>\n",
" <td>Cc1c2oc3c(C)ccc(C(=O)N[C@@H]4C(=O)N[C@H](C(C)C...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>CHEMBL1173445</td>\n",
" <td>LARGAZOLE</td>\n",
" <td>8.80</td>\n",
" <td>CHEMBL612545</td>\n",
" <td>45</td>\n",
" <td>Homo sapiens, None</td>\n",
" <td>45</td>\n",
" <td>CHEMBL392, CHEMBL3192, CHEMBL3524, CHEMBL5103,...</td>\n",
" <td>Histone deacetylase 9, Ubiquitin-like modifier...</td>\n",
" <td>CCCCCCCC(=O)SCC/C=C/[C@@H]1CC(=O)NCc2nc(cs2)C2...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>CHEMBL3902498</td>\n",
" <td>NaN</td>\n",
" <td>9.37</td>\n",
" <td>CHEMBL2095194,CHEMBL3991</td>\n",
" <td>17</td>\n",
" <td>Homo sapiens, None</td>\n",
" <td>17</td>\n",
" <td>CHEMBL2820, CHEMBL3991, CHEMBL1801, CHEMBL204,...</td>\n",
" <td>Coagulation factor X, Kallikrein 1, Coagulatio...</td>\n",
" <td>Cc1cc2ccc1[C@@H](C)COC(=O)Nc1ccc(S(=O)(=O)C3CC...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" IDs molecule_pref_name max_pChEMBL max_pChEMBL_target \\\n",
"0 CHEMBL94657 PATUPILONE 10.67 CHEMBL1075590 \n",
"1 CHEMBL94657 PATUPILONE 10.67 CHEMBL1075590 \n",
"2 CHEMBL1554 DACTINOMYCIN 10.10 CHEMBL614533 \n",
"3 CHEMBL1173445 LARGAZOLE 8.80 CHEMBL612545 \n",
"4 CHEMBL3902498 NaN 9.37 CHEMBL2095194,CHEMBL3991 \n",
"\n",
" # Target Organisms Target Organisms \\\n",
"0 695 Sus scrofa, Mus musculus, None, Plasmodium fal... \n",
"1 695 Sus scrofa, Mus musculus, None, Plasmodium fal... \n",
"2 177 Giardia intestinalis, Trypanosoma cruzi, Equus... \n",
"3 45 Homo sapiens, None \n",
"4 17 Homo sapiens, None \n",
"\n",
" # Known Targets Known Targets \\\n",
"0 695 CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... \n",
"1 695 CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... \n",
"2 177 CHEMBL388, CHEMBL614151, CHEMBL3577, CHEMBL551... \n",
"3 45 CHEMBL392, CHEMBL3192, CHEMBL3524, CHEMBL5103,... \n",
"4 17 CHEMBL2820, CHEMBL3991, CHEMBL1801, CHEMBL204,... \n",
"\n",
" target_pref_name \\\n",
"0 AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... \n",
"1 AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... \n",
"2 HT-29, CCRF-CEM, WIL2-NS, Unchecked, Caspase-7... \n",
"3 Histone deacetylase 9, Ubiquitin-like modifier... \n",
"4 Coagulation factor X, Kallikrein 1, Coagulatio... \n",
"\n",
" smiles \n",
"0 C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[... \n",
"1 C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[... \n",
"2 Cc1c2oc3c(C)ccc(C(=O)N[C@@H]4C(=O)N[C@H](C(C)C... \n",
"3 CCCCCCCC(=O)SCC/C=C/[C@@H]1CC(=O)NCc2nc(cs2)C2... \n",
"4 Cc1cc2ccc1[C@@H](C)COC(=O)Nc1ccc(S(=O)(=O)C3CC... "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 读取CSV文件\n",
"input_file = project_root / 'ring16' / 'temp.csv'\n",
"df = pd.read_csv(input_file)\n",
"\n",
"print(f\"数据集大小: {len(df)} 个分子\")\n",
"print(f\"列名: {df.columns.tolist()}\")\n",
"\n",
"# 检查smiles列\n",
"if 'smiles' in df.columns:\n",
" print(f\"\\nSMILES列存在共 {df['smiles'].notna().sum()} 个有效SMILES\")\n",
" print(f\"\\n前5个SMILES示例:\")\n",
" print(df['smiles'].head().tolist())\n",
"else:\n",
" print(\"错误: 未找到smiles列\")\n",
" \n",
"df.head()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. 定义SMARTS模式和处理函数\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}