first add
This commit is contained in:
285
notebooks/smarts_match_visualization.ipynb
Normal file
285
notebooks/smarts_match_visualization.ipynb
Normal file
@@ -0,0 +1,285 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# SMARTS匹配检测与可视化\n",
|
||||
"\n",
|
||||
"本notebook用于:\n",
|
||||
"1. 读取ring16/temp.csv中的smiles列\n",
|
||||
"2. 对SMARTS模式进行匹配检测:`O=C1C[C@@H](O)[*:15][*:17][*:18]C[*:23]C(=O)/C=C/[*:28]=C/[*:7][*:8]O1`\n",
|
||||
"3. 处理dummy原子(`[*:X]`),尝试两种方式:\n",
|
||||
"   - 不替换dummy原子\n",
|
||||
"   - 将dummy原子替换为C\n",
|
||||
"4. 可视化匹配的原子高亮显示\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 1. 导入必要的库\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"所有模块导入成功!\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"from pathlib import Path\n",
|
||||
"import re\n",
|
||||
"\n",
|
||||
"# 添加项目根目录到 Python 路径\n",
|
||||
"notebook_dir = Path().resolve()\n",
|
||||
"project_root = notebook_dir.parent\n",
|
||||
"sys.path.insert(0, str(project_root))\n",
|
||||
"\n",
|
||||
"from rdkit import Chem\n",
|
||||
"from rdkit.Chem import Draw\n",
|
||||
"from rdkit.Chem.Draw import rdMolDraw2D\n",
|
||||
"from IPython.display import SVG, display, HTML\n",
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"from collections import Counter\n",
|
||||
"\n",
|
||||
"print(\"所有模块导入成功!\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 2. 读取数据\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"数据集大小: 2022 个分子\n",
|
||||
"列名: ['IDs', 'molecule_pref_name', 'max_pChEMBL', 'max_pChEMBL_target', '# Target Organisms', 'Target Organisms', '# Known Targets', 'Known Targets', 'target_pref_name', 'smiles']\n",
|
||||
"\n",
|
||||
"SMILES列存在,共 2022 个有效SMILES\n",
|
||||
"\n",
|
||||
"前5个SMILES示例:\n",
|
||||
"['C/C(=C\\\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[C@H](C)[C@H](O)[C@@H](C)C(=O)C(C)(C)[C@@H](O)CC(=O)O1', 'C/C(=C\\\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[C@H](C)[C@H](O)[C@@H](C)C(=O)C(C)(C)[C@@H](O)CC(=O)O1', 'Cc1c2oc3c(C)ccc(C(=O)N[C@@H]4C(=O)N[C@H](C(C)C)C(=O)N5CCC[C@H]5C(=O)N(C)CC(=O)N(C)[C@@H](C(C)C)C(=O)O[C@@H]4C)c3nc-2c(C(=O)N[C@@H]2C(=O)N[C@H](C(C)C)C(=O)N3CCC[C@H]3C(=O)N(C)CC(=O)N(C)[C@@H](C(C)C)C(=O)O[C@@H]2C)c(N)c1=O', 'CCCCCCCC(=O)SCC/C=C/[C@@H]1CC(=O)NCc2nc(cs2)C2=N[C@@](C)(CS2)C(=O)N[C@@H](C(C)C)C(=O)O1', 'Cc1cc2ccc1[C@@H](C)COC(=O)Nc1ccc(S(=O)(=O)C3CC3)c(c1)CN(C)C(=O)[C@@H]2Nc1ccc2c(N)ncc(F)c2c1']\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>IDs</th>\n",
|
||||
" <th>molecule_pref_name</th>\n",
|
||||
" <th>max_pChEMBL</th>\n",
|
||||
" <th>max_pChEMBL_target</th>\n",
|
||||
" <th># Target Organisms</th>\n",
|
||||
" <th>Target Organisms</th>\n",
|
||||
" <th># Known Targets</th>\n",
|
||||
" <th>Known Targets</th>\n",
|
||||
" <th>target_pref_name</th>\n",
|
||||
" <th>smiles</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>CHEMBL94657</td>\n",
|
||||
" <td>PATUPILONE</td>\n",
|
||||
" <td>10.67</td>\n",
|
||||
" <td>CHEMBL1075590</td>\n",
|
||||
" <td>695</td>\n",
|
||||
" <td>Sus scrofa, Mus musculus, None, Plasmodium fal...</td>\n",
|
||||
" <td>695</td>\n",
|
||||
" <td>CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE...</td>\n",
|
||||
" <td>AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6...</td>\n",
|
||||
" <td>C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>CHEMBL94657</td>\n",
|
||||
" <td>PATUPILONE</td>\n",
|
||||
" <td>10.67</td>\n",
|
||||
" <td>CHEMBL1075590</td>\n",
|
||||
" <td>695</td>\n",
|
||||
" <td>Sus scrofa, Mus musculus, None, Plasmodium fal...</td>\n",
|
||||
" <td>695</td>\n",
|
||||
" <td>CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE...</td>\n",
|
||||
" <td>AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6...</td>\n",
|
||||
" <td>C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>CHEMBL1554</td>\n",
|
||||
" <td>DACTINOMYCIN</td>\n",
|
||||
" <td>10.10</td>\n",
|
||||
" <td>CHEMBL614533</td>\n",
|
||||
" <td>177</td>\n",
|
||||
" <td>Giardia intestinalis, Trypanosoma cruzi, Equus...</td>\n",
|
||||
" <td>177</td>\n",
|
||||
" <td>CHEMBL388, CHEMBL614151, CHEMBL3577, CHEMBL551...</td>\n",
|
||||
" <td>HT-29, CCRF-CEM, WIL2-NS, Unchecked, Caspase-7...</td>\n",
|
||||
" <td>Cc1c2oc3c(C)ccc(C(=O)N[C@@H]4C(=O)N[C@H](C(C)C...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>CHEMBL1173445</td>\n",
|
||||
" <td>LARGAZOLE</td>\n",
|
||||
" <td>8.80</td>\n",
|
||||
" <td>CHEMBL612545</td>\n",
|
||||
" <td>45</td>\n",
|
||||
" <td>Homo sapiens, None</td>\n",
|
||||
" <td>45</td>\n",
|
||||
" <td>CHEMBL392, CHEMBL3192, CHEMBL3524, CHEMBL5103,...</td>\n",
|
||||
" <td>Histone deacetylase 9, Ubiquitin-like modifier...</td>\n",
|
||||
" <td>CCCCCCCC(=O)SCC/C=C/[C@@H]1CC(=O)NCc2nc(cs2)C2...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>CHEMBL3902498</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>9.37</td>\n",
|
||||
" <td>CHEMBL2095194,CHEMBL3991</td>\n",
|
||||
" <td>17</td>\n",
|
||||
" <td>Homo sapiens, None</td>\n",
|
||||
" <td>17</td>\n",
|
||||
" <td>CHEMBL2820, CHEMBL3991, CHEMBL1801, CHEMBL204,...</td>\n",
|
||||
" <td>Coagulation factor X, Kallikrein 1, Coagulatio...</td>\n",
|
||||
" <td>Cc1cc2ccc1[C@@H](C)COC(=O)Nc1ccc(S(=O)(=O)C3CC...</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" IDs molecule_pref_name max_pChEMBL max_pChEMBL_target \\\n",
|
||||
"0 CHEMBL94657 PATUPILONE 10.67 CHEMBL1075590 \n",
|
||||
"1 CHEMBL94657 PATUPILONE 10.67 CHEMBL1075590 \n",
|
||||
"2 CHEMBL1554 DACTINOMYCIN 10.10 CHEMBL614533 \n",
|
||||
"3 CHEMBL1173445 LARGAZOLE 8.80 CHEMBL612545 \n",
|
||||
"4 CHEMBL3902498 NaN 9.37 CHEMBL2095194,CHEMBL3991 \n",
|
||||
"\n",
|
||||
" # Target Organisms Target Organisms \\\n",
|
||||
"0 695 Sus scrofa, Mus musculus, None, Plasmodium fal... \n",
|
||||
"1 695 Sus scrofa, Mus musculus, None, Plasmodium fal... \n",
|
||||
"2 177 Giardia intestinalis, Trypanosoma cruzi, Equus... \n",
|
||||
"3 45 Homo sapiens, None \n",
|
||||
"4 17 Homo sapiens, None \n",
|
||||
"\n",
|
||||
" # Known Targets Known Targets \\\n",
|
||||
"0 695 CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... \n",
|
||||
"1 695 CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... \n",
|
||||
"2 177 CHEMBL388, CHEMBL614151, CHEMBL3577, CHEMBL551... \n",
|
||||
"3 45 CHEMBL392, CHEMBL3192, CHEMBL3524, CHEMBL5103,... \n",
|
||||
"4 17 CHEMBL2820, CHEMBL3991, CHEMBL1801, CHEMBL204,... \n",
|
||||
"\n",
|
||||
" target_pref_name \\\n",
|
||||
"0 AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... \n",
|
||||
"1 AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... \n",
|
||||
"2 HT-29, CCRF-CEM, WIL2-NS, Unchecked, Caspase-7... \n",
|
||||
"3 Histone deacetylase 9, Ubiquitin-like modifier... \n",
|
||||
"4 Coagulation factor X, Kallikrein 1, Coagulatio... \n",
|
||||
"\n",
|
||||
" smiles \n",
|
||||
"0 C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[... \n",
|
||||
"1 C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[... \n",
|
||||
"2 Cc1c2oc3c(C)ccc(C(=O)N[C@@H]4C(=O)N[C@H](C(C)C... \n",
|
||||
"3 CCCCCCCC(=O)SCC/C=C/[C@@H]1CC(=O)NCc2nc(cs2)C2... \n",
|
||||
"4 Cc1cc2ccc1[C@@H](C)COC(=O)Nc1ccc(S(=O)(=O)C3CC... "
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# 读取CSV文件\n",
|
||||
"input_file = project_root / 'ring16' / 'temp.csv'\n",
|
||||
"df = pd.read_csv(input_file)\n",
|
||||
"\n",
|
||||
"print(f\"数据集大小: {len(df)} 个分子\")\n",
|
||||
"print(f\"列名: {df.columns.tolist()}\")\n",
|
||||
"\n",
|
||||
"# 检查smiles列\n",
|
||||
"if 'smiles' in df.columns:\n",
|
||||
" print(f\"\\nSMILES列存在,共 {df['smiles'].notna().sum()} 个有效SMILES\")\n",
|
||||
" print(f\"\\n前5个SMILES示例:\")\n",
|
||||
" print(df['smiles'].head().tolist())\n",
|
||||
"else:\n",
|
||||
" print(\"错误: 未找到smiles列\")\n",
|
||||
" \n",
|
||||
"df.head()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 3. 定义SMARTS模式和处理函数\n",
"\n",
"> **TODO**:本节的代码单元仍为空;SMARTS模式定义、dummy原子处理以及匹配原子的高亮可视化(开头概述中的第2–4步)尚未实现。\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.14.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
Reference in New Issue
Block a user