{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# SMARTS匹配检测与可视化\n", "\n", "本notebook用于:\n", "1. 读取ring16/temp.csv中的smiles列\n", "2. 对SMARTS模式进行匹配检测:`O=C1C[C@@H](O)[*:15][*:17][*:18]C[*:23]C(=O)/C=C/[*:28]=C/[*:7][*:8]O1`\n", "3. 处理dummy原子([*:X]),尝试两种方式:\n", " - 不替换dummy原子\n", " - 将dummy原子替换为C\n", "4. 可视化匹配的原子高亮显示\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. 导入必要的库\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "所有模块导入成功!\n" ] } ], "source": [ "import sys\n", "from pathlib import Path\n", "import re\n", "\n", "# Add the project root directory to the Python path.\n", "# Path().resolve() is the kernel's current working directory; the project root\n", "# is taken to be its parent (assumes the kernel runs from the notebook folder).\n", "notebook_dir = Path().resolve()\n", "project_root = notebook_dir.parent\n", "# Guard the insert so re-running this cell does not stack duplicate entries.\n", "if str(project_root) not in sys.path:\n", "    sys.path.insert(0, str(project_root))\n", "\n", "from rdkit import Chem\n", "from rdkit.Chem import Draw\n", "from rdkit.Chem.Draw import rdMolDraw2D\n", "from IPython.display import SVG, display, HTML\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from collections import Counter\n", "\n", "print(\"所有模块导入成功!\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. 
读取数据\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "数据集大小: 2022 个分子\n", "列名: ['IDs', 'molecule_pref_name', 'max_pChEMBL', 'max_pChEMBL_target', '# Target Organisms', 'Target Organisms', '# Known Targets', 'Known Targets', 'target_pref_name', 'smiles']\n", "\n", "SMILES列存在,共 2022 个有效SMILES\n", "\n", "前5个SMILES示例:\n", "['C/C(=C\\\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[C@H](C)[C@H](O)[C@@H](C)C(=O)C(C)(C)[C@@H](O)CC(=O)O1', 'C/C(=C\\\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[C@H](C)[C@H](O)[C@@H](C)C(=O)C(C)(C)[C@@H](O)CC(=O)O1', 'Cc1c2oc3c(C)ccc(C(=O)N[C@@H]4C(=O)N[C@H](C(C)C)C(=O)N5CCC[C@H]5C(=O)N(C)CC(=O)N(C)[C@@H](C(C)C)C(=O)O[C@@H]4C)c3nc-2c(C(=O)N[C@@H]2C(=O)N[C@H](C(C)C)C(=O)N3CCC[C@H]3C(=O)N(C)CC(=O)N(C)[C@@H](C(C)C)C(=O)O[C@@H]2C)c(N)c1=O', 'CCCCCCCC(=O)SCC/C=C/[C@@H]1CC(=O)NCc2nc(cs2)C2=N[C@@](C)(CS2)C(=O)N[C@@H](C(C)C)C(=O)O1', 'Cc1cc2ccc1[C@@H](C)COC(=O)Nc1ccc(S(=O)(=O)C3CC3)c(c1)CN(C)C(=O)[C@@H]2Nc1ccc2c(N)ncc(F)c2c1']\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDsmolecule_pref_namemax_pChEMBLmax_pChEMBL_target# Target OrganismsTarget Organisms# Known TargetsKnown Targetstarget_pref_namesmiles
0CHEMBL94657PATUPILONE10.67CHEMBL1075590695Sus scrofa, Mus musculus, None, Plasmodium fal...695CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE...AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6...C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[...
1CHEMBL94657PATUPILONE10.67CHEMBL1075590695Sus scrofa, Mus musculus, None, Plasmodium fal...695CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE...AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6...C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[...
2CHEMBL1554DACTINOMYCIN10.10CHEMBL614533177Giardia intestinalis, Trypanosoma cruzi, Equus...177CHEMBL388, CHEMBL614151, CHEMBL3577, CHEMBL551...HT-29, CCRF-CEM, WIL2-NS, Unchecked, Caspase-7...Cc1c2oc3c(C)ccc(C(=O)N[C@@H]4C(=O)N[C@H](C(C)C...
3CHEMBL1173445LARGAZOLE8.80CHEMBL61254545Homo sapiens, None45CHEMBL392, CHEMBL3192, CHEMBL3524, CHEMBL5103,...Histone deacetylase 9, Ubiquitin-like modifier...CCCCCCCC(=O)SCC/C=C/[C@@H]1CC(=O)NCc2nc(cs2)C2...
4CHEMBL3902498NaN9.37CHEMBL2095194,CHEMBL399117Homo sapiens, None17CHEMBL2820, CHEMBL3991, CHEMBL1801, CHEMBL204,...Coagulation factor X, Kallikrein 1, Coagulatio...Cc1cc2ccc1[C@@H](C)COC(=O)Nc1ccc(S(=O)(=O)C3CC...
\n", "
" ], "text/plain": [ " IDs molecule_pref_name max_pChEMBL max_pChEMBL_target \\\n", "0 CHEMBL94657 PATUPILONE 10.67 CHEMBL1075590 \n", "1 CHEMBL94657 PATUPILONE 10.67 CHEMBL1075590 \n", "2 CHEMBL1554 DACTINOMYCIN 10.10 CHEMBL614533 \n", "3 CHEMBL1173445 LARGAZOLE 8.80 CHEMBL612545 \n", "4 CHEMBL3902498 NaN 9.37 CHEMBL2095194,CHEMBL3991 \n", "\n", " # Target Organisms Target Organisms \\\n", "0 695 Sus scrofa, Mus musculus, None, Plasmodium fal... \n", "1 695 Sus scrofa, Mus musculus, None, Plasmodium fal... \n", "2 177 Giardia intestinalis, Trypanosoma cruzi, Equus... \n", "3 45 Homo sapiens, None \n", "4 17 Homo sapiens, None \n", "\n", " # Known Targets Known Targets \\\n", "0 695 CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... \n", "1 695 CHEMBL612519, CHEMBL614129, CHEMBL1075484, CHE... \n", "2 177 CHEMBL388, CHEMBL614151, CHEMBL3577, CHEMBL551... \n", "3 45 CHEMBL392, CHEMBL3192, CHEMBL3524, CHEMBL5103,... \n", "4 17 CHEMBL2820, CHEMBL3991, CHEMBL1801, CHEMBL204,... \n", "\n", " target_pref_name \\\n", "0 AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... \n", "1 AGS, NCI-H1703, MKN-7, HT-1080, NCI-H226, Lu-6... \n", "2 HT-29, CCRF-CEM, WIL2-NS, Unchecked, Caspase-7... \n", "3 Histone deacetylase 9, Ubiquitin-like modifier... \n", "4 Coagulation factor X, Kallikrein 1, Coagulatio... \n", "\n", " smiles \n", "0 C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[... \n", "1 C/C(=C\\c1csc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[... \n", "2 Cc1c2oc3c(C)ccc(C(=O)N[C@@H]4C(=O)N[C@H](C(C)C... \n", "3 CCCCCCCC(=O)SCC/C=C/[C@@H]1CC(=O)NCc2nc(cs2)C2... \n", "4 Cc1cc2ccc1[C@@H](C)COC(=O)Nc1ccc(S(=O)(=O)C3CC... 
" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Read the CSV file located at <project_root>/ring16/temp.csv.\n", "input_file = project_root / 'ring16' / 'temp.csv'\n", "df = pd.read_csv(input_file)\n", "\n", "print(f\"数据集大小: {len(df)} 个分子\")\n", "print(f\"列名: {df.columns.tolist()}\")\n", "\n", "# The notebook's stated purpose is matching against the 'smiles' column, so\n", "# fail fast when it is absent instead of printing a message and carrying on.\n", "if 'smiles' not in df.columns:\n", "    raise KeyError(\"错误: 未找到smiles列\")\n", "\n", "# notna() counts non-null entries only; it does not check chemical validity.\n", "print(f\"\\nSMILES列存在,共 {df['smiles'].notna().sum()} 个有效SMILES\")\n", "print(\"\\n前5个SMILES示例:\")\n", "print(df['smiles'].head().tolist())\n", "\n", "df.head()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. 定义SMARTS模式和处理函数\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.14.0" } }, "nbformat": 4, "nbformat_minor": 2 }