update
This commit is contained in:
4
.gitignore
vendored
Normal file → Executable file
4
.gitignore
vendored
Normal file → Executable file
@@ -1,3 +1,5 @@
|
||||
uploads/
|
||||
LIBRARIES/
|
||||
*.pyc
|
||||
*.pyc
|
||||
Data/fragment/Frags-Enamine-18M.csv
|
||||
Data/fragment/GDB11-27M.csv
|
||||
File diff suppressed because one or more lines are too long
2752
Data/MacrolactoneDB/Macrolactone_Filtered_16Ring_Active_435.csv
Normal file
2752
Data/MacrolactoneDB/Macrolactone_Filtered_16Ring_Active_435.csv
Normal file
File diff suppressed because one or more lines are too long
3
Data/MacrolactoneDB/README.md
Normal file
3
Data/MacrolactoneDB/README.md
Normal file
@@ -0,0 +1,3 @@
|
||||
website: https://macrolact.collaborationspharma.com/
|
||||
|
||||
通过MacrolactoneDB页面筛选的结果并不准确,我限定16环,但是在下载的csv里面有14环的结果,所以还是要自己进行在筛选一遍。
|
||||
3
Data/MacrolactoneDB/ring12_20/README.md
Normal file
3
Data/MacrolactoneDB/ring12_20/README.md
Normal file
@@ -0,0 +1,3 @@
|
||||
Your Filtered Macrolactone Database
|
||||
|
||||
11036 compounds have been filtered from MacrolactoneDB based on your specified inputs.
|
||||
89
Data/MacrolactoneDB/ring12_20/counts.txt
Normal file
89
Data/MacrolactoneDB/ring12_20/counts.txt
Normal file
@@ -0,0 +1,89 @@
|
||||
Target Organisms
|
||||
Homo sapiens 815
|
||||
Homo sapiens, None 180
|
||||
Plasmodium falciparum 161
|
||||
Hepatitis C virus, None 112
|
||||
Homo sapiens, Plasmodium falciparum 63
|
||||
Oryctolagus cuniculus 62
|
||||
Mus musculus 60
|
||||
Toxoplasma gondii 39
|
||||
Homo sapiens, Rattus norvegicus 27
|
||||
Mus musculus, Homo sapiens 24
|
||||
None, Rattus norvegicus 23
|
||||
Human immunodeficiency virus 1 20
|
||||
Hepatitis C virus 18
|
||||
Rattus norvegicus 17
|
||||
Homo sapiens, Sus scrofa 11
|
||||
Homo sapiens, Chlorocebus aethiops 10
|
||||
Serratia marcescens 9
|
||||
Escherichia coli 8
|
||||
Oryctolagus cuniculus, Homo sapiens 7
|
||||
Streptococcus pneumoniae 6
|
||||
Oryctolagus cuniculus, Staphylococcus aureus, Raoultella planticola, Bacillus subtilis, Mus musculus, Micrococcus luteus, None, Escherichia coli, Plasmodium falciparum, Streptococcus pneumoniae, Homo sapiens, Escherichia coli K-12, Toxoplasma gondii 6
|
||||
Plasmodium falciparum K1 5
|
||||
Bacillus anthracis 5
|
||||
Mus musculus, Homo sapiens, None 5
|
||||
Bacillus anthracis, Homo sapiens 4
|
||||
Candida albicans, Cryptococcus neoformans, Aspergillus fumigatus 4
|
||||
Mus musculus, None 4
|
||||
Plasmodium falciparum, Homo sapiens, None 4
|
||||
None, Homo sapiens, Plasmodium falciparum 3
|
||||
Bacillus subtilis, Homo sapiens 3
|
||||
Oryctolagus cuniculus, Homo sapiens, None 3
|
||||
Sus scrofa, Mus musculus, None, Plasmodium falciparum, Homo sapiens, Rattus norvegicus 2
|
||||
Homo sapiens, None, Rattus norvegicus 2
|
||||
Cryptococcus neoformans 2
|
||||
Homo sapiens, None, Chlorocebus aethiops 2
|
||||
Staphylococcus aureus 2
|
||||
Candida albicans, Cryptococcus neoformans, Mycobacterium intracellulare, Aspergillus fumigatus 2
|
||||
Mus musculus, None, Human immunodeficiency virus 1 2
|
||||
Escherichia coli (strain K12) 2
|
||||
Plasmodium falciparum 3D7, Homo sapiens 2
|
||||
Aspergillus fumigatus 1
|
||||
Sus scrofa 1
|
||||
Saccharomyces cerevisiae S288c, Human immunodeficiency virus 1, Human herpesvirus 1, Plasmodium falciparum, None, Homo sapiens, Rattus norvegicus 1
|
||||
Hepatitis C virus, Homo sapiens, None 1
|
||||
Plasmodium falciparum 3D7 1
|
||||
Bacillus subtilis 1
|
||||
Mus musculus, Homo sapiens, None, Saccharomyces cerevisiae 1
|
||||
Chlorocebus aethiops 1
|
||||
Homo sapiens, Escherichia coli K-12, None 1
|
||||
Hepatitis C virus, Homo sapiens, None, Rattus norvegicus 1
|
||||
None, Homo sapiens, Human herpesvirus 1 1
|
||||
Homo sapiens, None, Trypanosoma brucei brucei 1
|
||||
Homo sapiens, None, Cryptococcus neoformans 1
|
||||
Homo sapiens, Rattus norvegicus, Human immunodeficiency virus 1 1
|
||||
None, Plasmodium falciparum, Escherichia coli, Streptococcus pneumoniae, Naegleria fowleri, Homo sapiens, Streptococcus, Toxoplasma gondii 1
|
||||
Giardia intestinalis, Trypanosoma cruzi, Equus caballus, Bos taurus, Mus musculus, None, Plasmodium falciparum, Chlorocebus aethiops, Homo sapiens 1
|
||||
Plasmodium falciparum NF54, Trypanosoma cruzi, Trypanosoma brucei rhodesiense, Rattus norvegicus 1
|
||||
None, Homo sapiens, Plasmodium falciparum K1, Plasmodium falciparum 1
|
||||
Saccharomyces cerevisiae S288c, Homo sapiens, None, Saccharomyces cerevisiae, Phytophthora sojae 1
|
||||
Bacillus subtilis, Homo sapiens, Schistosoma mansoni, Saccharomyces cerevisiae, Giardia intestinalis 1
|
||||
Streptococcus, Homo sapiens, None 1
|
||||
Mus musculus, Homo sapiens, Rattus norvegicus 1
|
||||
Homo sapiens, Spinacia oleracea 1
|
||||
Human immunodeficiency virus 1, Mus musculus, None, Hepatitis C virus, Homo sapiens, Rattus norvegicus 1
|
||||
None, Plasmodium falciparum, Trypanosoma brucei rhodesiense 1
|
||||
Hepatitis C virus, None, Rattus norvegicus 1
|
||||
Homo sapiens, Equus caballus 1
|
||||
Plasmodium falciparum NF54, Trypanosoma cruzi, Trypanosoma brucei rhodesiense 1
|
||||
Schistosoma mansoni, Influenza A virus 1
|
||||
Leishmania chagasi, Trypanosoma cruzi 1
|
||||
Candida albicans, Cryptococcus neoformans 1
|
||||
None, Plasmodium falciparum 1
|
||||
Caenorhabditis elegans 1
|
||||
Bos taurus, Sus scrofa 1
|
||||
Plasmodium falciparum, Enterococcus faecium 1
|
||||
Homo sapiens, Gallus gallus 1
|
||||
Homo sapiens, Escherichia coli 1
|
||||
Plasmodium falciparum, Homo sapiens, None, Rattus norvegicus, Schistosoma mansoni 1
|
||||
Homo sapiens, None, Influenza A virus 1
|
||||
Mycobacterium tuberculosis, None 1
|
||||
Escherichia coli, Homo sapiens, Toxoplasma gondii, None, Streptococcus pneumoniae 1
|
||||
Bacillus subtilis, Oryctolagus cuniculus, Homo sapiens, Schistosoma mansoni, Giardia intestinalis 1
|
||||
Homo sapiens, None, Rattus norvegicus, Escherichia coli O157:H7 1
|
||||
Giardia intestinalis, Schistosoma mansoni, Mus musculus, None, Homo sapiens, Saccharomyces cerevisiae 1
|
||||
Trypanosoma cruzi 1
|
||||
Influenza A virus 1
|
||||
Escherichia coli K-12 1
|
||||
Human herpesvirus 4 (strain B95-8) 1
|
||||
51
Data/MacrolactoneDB/ring12_20/processed_embedding_small.csv
Normal file
51
Data/MacrolactoneDB/ring12_20/processed_embedding_small.csv
Normal file
File diff suppressed because one or more lines are too long
11037
Data/MacrolactoneDB/ring12_20/temp.csv
Normal file
11037
Data/MacrolactoneDB/ring12_20/temp.csv
Normal file
File diff suppressed because one or more lines are too long
BIN
Data/MacrolactoneDB/ring12_20/umap_visualization_small.png
Normal file
BIN
Data/MacrolactoneDB/ring12_20/umap_visualization_small.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 199 KiB |
2023
Data/MacrolactoneDB/ring16/temp.csv
Executable file
2023
Data/MacrolactoneDB/ring16/temp.csv
Executable file
File diff suppressed because one or more lines are too long
289579
Data/MacrolactoneDB/ring16/temp.sdf
Executable file
289579
Data/MacrolactoneDB/ring16/temp.sdf
Executable file
File diff suppressed because one or more lines are too long
0
Data/ery_core.txt
Normal file → Executable file
0
Data/ery_core.txt
Normal file → Executable file
36
Data/fragment/README.md
Normal file
36
Data/fragment/README.md
Normal file
@@ -0,0 +1,36 @@
|
||||
## [Cell](https://www.cell.com/cell/abstract/S0092-8674(25)00855-4) 论文筛选数据
|
||||
|
||||
|
||||
|
||||
## 数据输入:原始片段库
|
||||
|
||||
Frags-Enamine-18M.csv:Enamine REAL数据库的18M片段(需提取SMILES)。
|
||||
GDB11-27M.csv:GDB-11数据库的27M片段(需提取SMILES)。
|
||||
|
||||
下载地址:[Zenodo link](https://zenodo.org/records/15191826)
|
||||
|
||||
## 原文筛选逻辑(淋病奈瑟菌靶向)
|
||||
|
||||
(1)数据输入:原始片段库
|
||||
文件来源:
|
||||
Frags-Enamine-18M.csv:Enamine REAL数据库的18M片段(需提取SMILES)。
|
||||
GDB11-27M.csv:GDB-11数据库的27M片段(需提取SMILES)。
|
||||
(2)模型预测:Chemprop预训练模型
|
||||
模型用途:
|
||||
使用预训练的Chemprop模型(针对淋病奈瑟菌或金黄色葡萄球菌)预测片段的抗菌活性得分(范围0-1)。
|
||||
模型合理性:
|
||||
Chemprop模型基于图神经网络(GNN),已在大规模化合物库(如Broad Institute的38,765个化合物)上训练,对结构-活性关系有较高预测精度。
|
||||
论文验证了模型对已知抗生素片段的预测能力(见Figure S1A),证明其可靠性。
|
||||
(3)多维度过滤条件
|
||||
筛选逻辑包含以下条件(需代码实现):
|
||||
|
||||
1.活性阈值:
|
||||
GDB库片段预测得分>0.05;
|
||||
Enamine库片段预测得分>0.1(因合成性更佳)。
|
||||
2.毒性过滤:
|
||||
使用预训练的HepG2、HSkMC、IMR-90细胞毒性模型,剔除预测得分>0.5的片段。
|
||||
3.结构过滤:
|
||||
排除含PAINS/Brenk子结构的片段(易导致假阳性或代谢不稳定)。
|
||||
与已知559个抗生素的Tanimoto相似度<0.5(确保结构新颖性)。
|
||||
(4)结果输出
|
||||
最终获得1,156,945个片段(淋病奈瑟菌靶向),存储于补充数据或Zenodo仓库中。
|
||||
0
Data/image.png
Normal file → Executable file
0
Data/image.png
Normal file → Executable file
|
Before Width: | Height: | Size: 94 KiB After Width: | Height: | Size: 94 KiB |
9
Data/my_sugars.txt
Executable file
9
Data/my_sugars.txt
Executable file
@@ -0,0 +1,9 @@
|
||||
[*R*][C@@H](O[C@@H]1O[C@H](C)[C@@H](O[C@@H]2O[C@H](C)[C@@H](O)[C@@](O)(C)C2)[C@H](N(C)C)[C@H]1O)[*R*]
|
||||
[*R*][C@@H](CO[C@@H]1O[C@H](C)[C@@H](O)[C@@H](OC)[C@H]1OC)[*R*]
|
||||
[*R*][C@H](O[C@H]9C[C@@](C)(OC)[C@@H](O)[C@H](C)O9)[*R*]
|
||||
[*R*][C@H](O[C@@H]9O[C@H](C)C[C@@H]([C@H]9O)N(C)C)[*R*]
|
||||
[*R*][C@H](O[C@@H]9O[C@H](C)C[C@@H]([C@H]9OC(C)=O)N(C)C)[*R*]
|
||||
[*R*][C@H](O[C@H]9C[C@H](OC)O[C@@H](C)[C@@H]9OC(C)=O)[*R*]
|
||||
[*R*][C@H](O[C@H]9C[C@H](OC)[C@@H](O)[C@H](C)O9)[*R*]
|
||||
[*R*][C@H](O[C@H]9C[C@@H](O)[C@H](O)[C@@H](C)O9)[*R*]
|
||||
[*R*][C@H](O[C@@H]9O[C@H](C)C[C@H](NC)[C@H]9O)[*R*]
|
||||
0
Data/selected_extenders.txt
Normal file → Executable file
0
Data/selected_extenders.txt
Normal file → Executable file
2
Data/split_position.md
Executable file
2
Data/split_position.md
Executable file
@@ -0,0 +1,2 @@
|
||||
键编号 31: 17(C) -> 32(O), 键类型: SINGLE
|
||||
键编号 6: 6(C) -> 7(C), 键类型: SINGLE
|
||||
0
Data/sugars
Normal file → Executable file
0
Data/sugars
Normal file → Executable file
0
V1B_full_database.txt
Normal file → Executable file
0
V1B_full_database.txt
Normal file → Executable file
0
analyse/selected_extenders.txt
Normal file → Executable file
0
analyse/selected_extenders.txt
Normal file → Executable file
0
analyse/sugars
Normal file → Executable file
0
analyse/sugars
Normal file → Executable file
0
analyse/sync_cmd.md
Normal file → Executable file
0
analyse/sync_cmd.md
Normal file → Executable file
0
analyse/tylo_core.txt
Normal file → Executable file
0
analyse/tylo_core.txt
Normal file → Executable file
0
docker/Dockerfile
Normal file → Executable file
0
docker/Dockerfile
Normal file → Executable file
0
docker/SIME.def
Normal file → Executable file
0
docker/SIME.def
Normal file → Executable file
0
docker/docker-compose.yml
Normal file → Executable file
0
docker/docker-compose.yml
Normal file → Executable file
0
macro_example/tylo_core.txt
Normal file → Executable file
0
macro_example/tylo_core.txt
Normal file → Executable file
0
macro_example/tylo_core_draft.txt
Normal file → Executable file
0
macro_example/tylo_core_draft.txt
Normal file → Executable file
0
requirements.txt
Normal file → Executable file
0
requirements.txt
Normal file → Executable file
0
static/styles/main_page.css
Normal file → Executable file
0
static/styles/main_page.css
Normal file → Executable file
0
templates/index.html
Normal file → Executable file
0
templates/index.html
Normal file → Executable file
0
templates/success.html
Normal file → Executable file
0
templates/success.html
Normal file → Executable file
376866
test/SIME-MacroValidator.ipynb
Normal file
376866
test/SIME-MacroValidator.ipynb
Normal file
File diff suppressed because one or more lines are too long
20
test/SIME_chemplot_tSNE.ipynb
Normal file → Executable file
20
test/SIME_chemplot_tSNE.ipynb
Normal file → Executable file
@@ -1,5 +1,12 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"micromamba create -n qsar -c rdkit -c mordred-descriptor -c conda-forge rdkit numpy mordred scikit-learn pandas matplotlib padelpy fuzzywuzzy optuna hydra-core ipykernel loguru ipython joblib openbabel mopac rdkit jupyter ipykernel chemplot joblib -y"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
@@ -94,7 +101,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -6091,6 +6098,17 @@
|
||||
"[15:20:17] Explicit valence for atom # 29 C, 5, is greater than permitted\n",
|
||||
"[15:20:17] Explicit valence for atom # 29 C, 5, is greater than permitted\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "",
|
||||
"evalue": "",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31m在当前单元格或上一个单元格中执行代码时 Kernel 崩溃。\n",
|
||||
"\u001b[1;31m请查看单元格中的代码,以确定故障的可能原因。\n",
|
||||
"\u001b[1;31m单击<a href='https://aka.ms/vscodeJupyterKernelCrash'>此处</a>了解详细信息。\n",
|
||||
"\u001b[1;31m有关更多详细信息,请查看 Jupyter <a href='command:jupyter.viewOutput'>log</a>。"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
|
||||
6409
test/SIME_chemplot_tSNE100w.ipynb
Executable file
6409
test/SIME_chemplot_tSNE100w.ipynb
Executable file
File diff suppressed because one or more lines are too long
289
test/macro16_SIME_synthesis_SA.ipynb
Normal file
289
test/macro16_SIME_synthesis_SA.ipynb
Normal file
File diff suppressed because one or more lines are too long
1144
test/macro16_SIME_synthesis_tSNE.ipynb
Normal file
1144
test/macro16_SIME_synthesis_tSNE.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
88
test/macrocycles_core.ipynb
Executable file
88
test/macrocycles_core.ipynb
Executable file
@@ -0,0 +1,88 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'H': 1, 'F': 1, 'Cl': 1, 'Br': 1, 'I': 1, 'B': 3, 'B+1': 2, 'B-1': 4, 'O': 2, 'O+1': 3, 'O-1': 1, 'N': 3, 'N+1': 4, 'N-1': 2, 'C': 4, 'C+1': 5, 'C-1': 3, 'P': 5, 'P+1': 6, 'P-1': 4, 'S': 6, 'S+1': 7, 'S-1': 5, '?': 8}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import selfies as sf\n",
|
||||
"\n",
|
||||
"# 获取默认的语义约束字典\n",
|
||||
"constraints = sf.get_preset_constraints(\"default\")\n",
|
||||
"print(constraints)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import selfies as sf\n",
|
||||
"new_constraints = sf.get_preset_constraints(\"default\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sf.set_semantic_constraints(new_constraints)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"smiles_dataset = [\"COC\", \"FCF\", \"O=O\", \"O=Cc1ccccc1\"]\n",
|
||||
"selfies_dataset = list(map(sf.encoder, smiles_dataset))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"alphabet = sf.get_alphabet_from_selfies(selfies_dataset)\n",
|
||||
"alphabet.add(\"[nop]\")\n",
|
||||
"\n",
|
||||
"alphabet = list(sorted(alphabet))\n",
|
||||
"alphabet"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "frage",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
483
test/test.ipynb
Executable file
483
test/test.ipynb
Executable file
File diff suppressed because one or more lines are too long
668
test/tutorial.ipynb
Executable file
668
test/tutorial.ipynb
Executable file
@@ -0,0 +1,668 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Tutorial"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"colab_type": "text",
|
||||
"id": "tM3wFk1e_COd",
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"## The Basics\n",
|
||||
"We begin by importing `selfies`. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
"height": 89
|
||||
},
|
||||
"colab_type": "code",
|
||||
"id": "GH0DQxBN_Fei",
|
||||
"outputId": "56aa043e-df48-4081-f938-49711a166d33"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import selfies as sf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"First, let's try translating between SMILES and SELFIES - as an example, we will use benzaldehyde. To translate from SMILES to SELFIES, use the `selfies.encoder` function, and to translate from SMILES back to SELFIES, use the `selfies.decoder` function."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"original_smiles = \"O=Cc1ccccc1\" # benzaldehyde\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" encoded_selfies = sf.encoder(original_smiles) # SMILES -> SELFIES\n",
|
||||
" decoded_smiles = sf.decoder(encoded_selfies) # SELFIES -> SMILES\n",
|
||||
"except sf.EncoderError as err: \n",
|
||||
" pass # sf.encoder error...\n",
|
||||
"except sf.DecoderError as err: \n",
|
||||
" pass # sf.decoder error..."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'[O][=C][C][=C][C][=C][C][=C][Ring1][=Branch1]'"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"encoded_selfies"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'O=CC1=CC=CC=C1'"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"decoded_smiles"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"colab_type": "text",
|
||||
"id": "Ng8PmMiB_RvJ"
|
||||
},
|
||||
"source": [
|
||||
"Note that `original_smiles` and `decoded_smiles` are different strings, but they both represent benzaldehyde. Thus, when comparing the two SMILES strings, string equality should _not_ be used. Insead, use RDKit to check whether the SMILES strings represent the same molecule."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
"height": 34
|
||||
},
|
||||
"colab_type": "code",
|
||||
"id": "iAc5FVrP_XV6",
|
||||
"outputId": "b503f896-a2a0-46a6-fc5b-9c474c01ba62"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"True"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from rdkit import Chem\n",
|
||||
"\n",
|
||||
"Chem.CanonSmiles(original_smiles) == Chem.CanonSmiles(decoded_smiles)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"colab_type": "text",
|
||||
"id": "IKfNr5m6_h4f",
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"## Customizing SELFIES\n",
|
||||
"The SELFIES grammar is derived dynamically from a set of semantic constraints, which assign bonding capacities to various atoms. Let's customize the semantic constraints that `selfies` operates on. By default, the following constraints are used:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
"height": 200
|
||||
},
|
||||
"colab_type": "code",
|
||||
"id": "Xmce7wvV_t4Y",
|
||||
"outputId": "8b10af2f-486e-4910-8a71-055b59a09746"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'H': 1,\n",
|
||||
" 'F': 1,\n",
|
||||
" 'Cl': 1,\n",
|
||||
" 'Br': 1,\n",
|
||||
" 'I': 1,\n",
|
||||
" 'O': 2,\n",
|
||||
" 'O+1': 3,\n",
|
||||
" 'O-1': 1,\n",
|
||||
" 'N': 3,\n",
|
||||
" 'N+1': 4,\n",
|
||||
" 'N-1': 2,\n",
|
||||
" 'C': 4,\n",
|
||||
" 'C+1': 5,\n",
|
||||
" 'C-1': 3,\n",
|
||||
" 'P': 5,\n",
|
||||
" 'P+1': 6,\n",
|
||||
" 'P-1': 4,\n",
|
||||
" 'S': 6,\n",
|
||||
" 'S+1': 7,\n",
|
||||
" 'S-1': 5,\n",
|
||||
" '?': 8}"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"sf.get_preset_constraints(\"default\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"These constraints map atoms (they keys) to their bonding capacities (the values). The special `?` key maps to the bonding capacity for all atoms that are not explicitly listed in the constraints. For example, S and Li are constrained to a maximum of 6 and 8 bonds, respectively. Every SELFIES string can be decoded into a molecule that obeys the current constraints."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'[Li]=CCS=CC#S'"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"sf.decoder(\"[Li][=C][C][S][=C][C][#S]\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"colab_type": "text",
|
||||
"id": "KevVGyIEAVlu"
|
||||
},
|
||||
"source": [
|
||||
"But suppose that we instead wanted to constrain S and Li to a maximum of 2 and 1 bond(s), respectively. To do so, we create a new set of constraints, and tell `selfies` to operate on them using `selfies.set_semantic_constraints`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "y5EbmzkKATkD"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"new_constraints = sf.get_preset_constraints(\"default\")\n",
|
||||
"new_constraints['Li'] = 1\n",
|
||||
"new_constraints['S'] = 2\n",
|
||||
"\n",
|
||||
"sf.set_semantic_constraints(new_constraints)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"To check that the update was succesful, we can use `selfies.get_semantic_constraints`, which returns the semantic constraints that `selfies` is currently operating on."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'H': 1,\n",
|
||||
" 'F': 1,\n",
|
||||
" 'Cl': 1,\n",
|
||||
" 'Br': 1,\n",
|
||||
" 'I': 1,\n",
|
||||
" 'O': 2,\n",
|
||||
" 'O+1': 3,\n",
|
||||
" 'O-1': 1,\n",
|
||||
" 'N': 3,\n",
|
||||
" 'N+1': 4,\n",
|
||||
" 'N-1': 2,\n",
|
||||
" 'C': 4,\n",
|
||||
" 'C+1': 5,\n",
|
||||
" 'C-1': 3,\n",
|
||||
" 'P': 5,\n",
|
||||
" 'P+1': 6,\n",
|
||||
" 'P-1': 4,\n",
|
||||
" 'S': 2,\n",
|
||||
" 'S+1': 7,\n",
|
||||
" 'S-1': 5,\n",
|
||||
" '?': 8,\n",
|
||||
" 'Li': 1}"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"sf.get_semantic_constraints()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"colab_type": "text",
|
||||
"id": "e_djGmr_AvM7"
|
||||
},
|
||||
"source": [
|
||||
"Our previous SELFIES string is now decoded like so. Notice that the specified bonding capacities are met, with every S and Li making only 2 and 1 bonds, respectively."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "TCzbjZMAAxpo"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'[Li]CCSCC=S'"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"sf.decoder(\"[Li][=C][C][S][=C][C][#S]\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"colab_type": "text",
|
||||
"id": "Ng1Lr_e6A3cB"
|
||||
},
|
||||
"source": [
|
||||
"Finally, to revert back to the default constraints, simply call: "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "zwC00Rx5A6eQ"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sf.set_semantic_constraints()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
" Please refer to the API reference for more details and more preset constraints.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## SELFIES in Practice \n",
|
||||
"\n",
|
||||
"Let's use a simple example to show how `selfies` can be used in practice, as well as highlight some convenient utility functions from the library. We start with a toy dataset of SMILES strings. As before, we can use `selfies.encoder` to convert the dataset into SELFIES form."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['[C][O][C]',\n",
|
||||
" '[F][C][F]',\n",
|
||||
" '[O][=O]',\n",
|
||||
" '[O][=C][C][=C][C][=C][C][=C][Ring1][=Branch1]']"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"smiles_dataset = [\"COC\", \"FCF\", \"O=O\", \"O=Cc1ccccc1\"]\n",
|
||||
"selfies_dataset = list(map(sf.encoder, smiles_dataset))\n",
|
||||
"\n",
|
||||
"selfies_dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The function `selfies.len_selfies` computes the symbol length of a SELFIES string. We can use it to find the maximum symbol length of the SELFIES strings in the dataset. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"10"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"max_len = max(sf.len_selfies(s) for s in selfies_dataset)\n",
|
||||
"max_len"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"To extract the SELFIES symbols that form the dataset, use `selfies.get_alphabet_from_selfies`. Here, we add `[nop]` to the alphabet, which is a special padding character that `selfies` recognizes."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['[=Branch1]', '[=C]', '[=O]', '[C]', '[F]', '[O]', '[Ring1]', '[nop]']"
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"alphabet = sf.get_alphabet_from_selfies(selfies_dataset)\n",
|
||||
"alphabet.add(\"[nop]\")\n",
|
||||
"\n",
|
||||
"alphabet = list(sorted(alphabet))\n",
|
||||
"alphabet"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Then, create a mapping between the alphabet SELFIES symbols and indices."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'[=Branch1]': 0,\n",
|
||||
" '[=C]': 1,\n",
|
||||
" '[=O]': 2,\n",
|
||||
" '[C]': 3,\n",
|
||||
" '[F]': 4,\n",
|
||||
" '[O]': 5,\n",
|
||||
" '[Ring1]': 6,\n",
|
||||
" '[nop]': 7}"
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"vocab_stoi = {symbol: idx for idx, symbol in enumerate(alphabet)}\n",
|
||||
"vocab_itos = {idx: symbol for symbol, idx in vocab_stoi.items()}\n",
|
||||
"\n",
|
||||
"vocab_stoi"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"SELFIES provides some convenience methods to convert between SELFIES strings and label (integer) and one-hot encodings. Using the first entry of the dataset (dimethyl ether) as an example:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dimethyl_ether = selfies_dataset[0]\n",
|
||||
"label, one_hot = sf.selfies_to_encoding(dimethyl_ether, vocab_stoi, pad_to_len=max_len)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[3, 5, 3, 7, 7, 7, 7, 7, 7, 7]"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"label"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[[0, 0, 0, 1, 0, 0, 0, 0],\n",
|
||||
" [0, 0, 0, 0, 0, 1, 0, 0],\n",
|
||||
" [0, 0, 0, 1, 0, 0, 0, 0],\n",
|
||||
" [0, 0, 0, 0, 0, 0, 0, 1],\n",
|
||||
" [0, 0, 0, 0, 0, 0, 0, 1],\n",
|
||||
" [0, 0, 0, 0, 0, 0, 0, 1],\n",
|
||||
" [0, 0, 0, 0, 0, 0, 0, 1],\n",
|
||||
" [0, 0, 0, 0, 0, 0, 0, 1],\n",
|
||||
" [0, 0, 0, 0, 0, 0, 0, 1],\n",
|
||||
" [0, 0, 0, 0, 0, 0, 0, 1]]"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"one_hot"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'[C][O][C][nop][nop][nop][nop][nop][nop][nop]'"
|
||||
]
|
||||
},
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"dimethyl_ether = sf.encoding_to_selfies(one_hot, vocab_itos, enc_type=\"one_hot\")\n",
|
||||
"dimethyl_ether"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'COC'"
|
||||
]
|
||||
},
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"sf.decoder(dimethyl_ether) # sf.decoder ignores [nop]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If different encoding strategies are desired, `selfies.split_selfies` can be used to tokenize a SELFIES string into its individual symbols."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['[C]', '[O]', '[C]']"
|
||||
]
|
||||
},
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list(sf.split_selfies(\"[C][O][C]\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Please refer to the API reference for more details and utility functions."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"collapsed_sections": [],
|
||||
"name": "selfies_example.ipynb",
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
0
utils/atom_show.ipynb
Normal file → Executable file
0
utils/atom_show.ipynb
Normal file → Executable file
0
utils/bond_show.ipynb
Normal file → Executable file
0
utils/bond_show.ipynb
Normal file → Executable file
0
utils/generate.ipynb
Normal file → Executable file
0
utils/generate.ipynb
Normal file → Executable file
278
utils/simemacrocycle_repair.py
Executable file
278
utils/simemacrocycle_repair.py
Executable file
@@ -0,0 +1,278 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- encoding: utf-8 -*-
|
||||
'''
|
||||
@file :simemacrocycle_repair.py
|
||||
@Description: :SIME工具生成的16元环大环内酯化合物的自动化价态修复工具
|
||||
@Date :2025/03/29 18:29:52
|
||||
@Author :lyzeng
|
||||
@Email :pylyzeng@gmail.com
|
||||
@version :1.0
|
||||
# 安装依赖
|
||||
pip install rdkit pandas swifter joblib tqdm matplotlib
|
||||
|
||||
# 运行脚本
|
||||
python simemacrocycle_repair.py
|
||||
'''
|
||||
|
||||
##############################
|
||||
# 模块说明
|
||||
##############################
|
||||
"""
|
||||
主要功能:
|
||||
1. 批量修复SIME生成的含双键16元环大环内酯的价态错误
|
||||
2. 自动检测并处理以下问题:
|
||||
- 碳原子显式价态超限(如5价碳)
|
||||
- 不合理的显式氢配置
|
||||
- 双键立体化学冲突
|
||||
3. 提供修复结果统计和可视化分析
|
||||
|
||||
输入输出:
|
||||
- 输入:包含SMILES的文本文件(每行一个分子)
|
||||
- 输出:
|
||||
- 修复后的CSV文件(含原始/修正SMILES和状态)
|
||||
- 修复统计图表(PNG)
|
||||
- 摘要报告(TXT)
|
||||
|
||||
依赖环境:
|
||||
- Python >= 3.7
|
||||
- RDKit >= 2022.03
|
||||
- pandas >= 1.3
|
||||
- swifter >= 1.3
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
import swifter
|
||||
from joblib import Parallel, delayed
|
||||
from tqdm.auto import tqdm
|
||||
import matplotlib.pyplot as plt
|
||||
from collections import Counter
|
||||
import re
|
||||
from rdkit import Chem
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore', category=UserWarning)
|
||||
|
||||
##############################
|
||||
# 核心修复函数
|
||||
##############################
|
||||
|
||||
def safe_sanitize(mol):
    """
    Safely sanitize a molecule and report the first valence-error atom.

    Parameters:
        mol (rdkit.Chem.Mol): molecule to check (sanitized in place)

    Returns:
        int/None: index of the atom with a valence error, or None when the
        molecule sanitizes cleanly (or the error carries no atom index).
    """
    try:
        Chem.SanitizeMol(mol)
        return None
    except Chem.AtomValenceException as e:
        # RDKit encodes the offending atom index in the message text,
        # e.g. "Explicit valence for atom # 12 C, 5, is greater than permitted".
        match = re.search(r'atom # (\d+)', str(e))
        return int(match.group(1)) if match else None
    except Exception:
        # Any other sanitization failure (kekulization, aromaticity, ...) is
        # treated as "no single repairable atom". Narrowed from a bare except
        # so KeyboardInterrupt/SystemExit are no longer swallowed.
        return None
|
||||
|
||||
|
||||
def fix_valence_error(smi):
    """
    Repair valence errors in a single SMILES string.

    Parameters:
        smi (str): input SMILES string

    Returns:
        tuple: (corrected SMILES, status, description message)
        Possible status values:
            - 'valid': original already sanitizes
            - 'invalid': SMILES could not be parsed at all
            - 'corrected': repair succeeded
            - 'kekule': returned in Kekule form (sanitization still failing)
            - 'failed': repair failed with an exception
    """
    try:
        # Parse without sanitization so molecules with bad valences still load.
        mol = Chem.MolFromSmiles(smi, sanitize=False)
        if not mol:
            return smi, "invalid", "无法解析SMILES"

        # Probe a copy first: SanitizeMol mutates its argument, and we want
        # the original mol untouched if it turns out to be valid.
        error_atom_idx = safe_sanitize(Chem.Mol(mol))
        if error_atom_idx is None:
            return Chem.MolToSmiles(mol), "valid", "原始有效"

        rw_mol = Chem.RWMol(mol)
        atom = rw_mol.GetAtomWithIdx(error_atom_idx)

        # Repair strategies, applied CUMULATIVELY in escalating order:
        # 1) drop explicit hydrogens, 2) clear formal charge and radicals,
        # 3) remove the atom's last bond (only when it has more than one
        #    neighbor, so the molecule is not split at a terminal atom).
        repair_actions = [
            lambda: atom.SetNumExplicitHs(0),
            lambda: (atom.SetFormalCharge(0), atom.SetNumRadicalElectrons(0)),
            lambda: rw_mol.RemoveBond(
                list(atom.GetBonds())[-1].GetBeginAtomIdx(),
                list(atom.GetBonds())[-1].GetEndAtomIdx()
            ) if atom.GetDegree() > 1 else None
        ]

        for action in repair_actions:
            action()
            # Re-check after each action; stop at the first state that sanitizes.
            if safe_sanitize(rw_mol.GetMol()) is None:
                return Chem.MolToSmiles(rw_mol.GetMol()), "corrected", f"修复原子 {error_atom_idx}"

        # All strategies exhausted without sanitizing; fall back to a Kekule
        # SMILES. Kekulize may itself raise, in which case the outer handler
        # reports the molecule as 'failed'.
        Chem.Kekulize(rw_mol)
        return Chem.MolToSmiles(rw_mol.GetMol(), kekuleSmiles=True), "kekule", "返回Kekule形式"

    except Exception as e:
        return smi, "failed", str(e)
|
||||
|
||||
##############################
|
||||
# 并行处理模块
|
||||
##############################
|
||||
|
||||
def batch_process(smi_chunk):
    """
    Repair every SMILES in a chunk (parallelization-friendly unit of work).

    Parameters:
        smi_chunk (list): SMILES strings to repair

    Returns:
        list: (corrected SMILES, status, message) tuples, one per input
    """
    return list(map(fix_valence_error, smi_chunk))
|
||||
|
||||
|
||||
def process_in_chunks(smi_list, chunk_size=50000, n_jobs=4):
    """
    Repair a large SMILES collection chunk by chunk, in parallel.

    Parameters:
        smi_list (list): raw SMILES strings
        chunk_size (int): molecules handled per outer (progress-bar) chunk
        n_jobs (int): number of parallel worker processes

    Returns:
        tuple: three parallel sequences (corrected SMILES, statuses, messages)
    """
    collected = []
    progress = tqdm(range(0, len(smi_list), chunk_size),
                    desc=f"Processing {len(smi_list):,} molecules")
    for start in progress:
        chunk = smi_list[start:start + chunk_size]
        # Each worker receives a 1000-molecule sub-slice of the current chunk.
        worker_outputs = Parallel(n_jobs=n_jobs)(
            delayed(batch_process)(chunk[offset:offset + 1000])
            for offset in range(0, len(chunk), 1000)
        )
        for sublist in worker_outputs:
            collected.extend(sublist)

    # Transpose the list of tuples into three column sequences.
    return list(zip(*collected)) if collected else ([], [], [])
|
||||
|
||||
##############################
|
||||
# 统计分析模块
|
||||
##############################
|
||||
|
||||
def analyze_results(df):
    """
    Build summary statistics for a repair run and save a chart.

    Writes 'repair_statistics.png' (status pie chart + top error reasons)
    to the current working directory as a side effect.

    Parameters:
        df (pd.DataFrame): repair results with 'status' and 'message' columns

    Returns:
        dict: per-status counts, success rate, and the most common errors
    """
    total = len(df)
    # Basic counts per status. Guard the rate so an empty DataFrame does not
    # raise ZeroDivisionError.
    stats = {
        'total_molecules': total,
        'valid_count': len(df[df['status'] == 'valid']),
        'corrected_count': len(df[df['status'] == 'corrected']),
        'kekule_count': len(df[df['status'] == 'kekule']),
        'failed_count': len(df[df['status'] == 'failed']),
        'success_rate': (len(df[df['status'].isin(['valid', 'corrected'])]) / total) if total else 0.0
    }

    # Error analysis (only when failures exist).
    if stats['failed_count'] > 0:
        stats['common_errors'] = dict(df[df['status'] == 'failed']['message'].value_counts().head(5))
    else:
        stats['common_errors'] = {}

    # Visualization: status distribution pie + top-error bar chart.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    status_counts = df['status'].value_counts()
    # Plotting an empty Series raises; skip the pie in that case but keep
    # the title so the output image is still well-formed.
    if not status_counts.empty:
        status_counts.plot.pie(ax=ax1, autopct='%1.1f%%', startangle=90)
    ax1.set_title('修复状态分布')

    if stats['common_errors']:
        pd.Series(stats['common_errors']).plot.barh(ax=ax2)
        ax2.set_title('Top 5错误原因')
    else:
        ax2.axis('off')
        ax2.text(0.5, 0.5, '无修复失败记录',
                 ha='center', va='center', fontsize=12)

    plt.tight_layout()
    plt.savefig('repair_statistics.png', dpi=150)
    # Close the specific figure rather than whatever is "current".
    plt.close(fig)

    return stats
|
||||
|
||||
##############################
|
||||
# 主执行流程
|
||||
##############################
|
||||
|
||||
def main(input_path, output_path="fixed_molecules.csv", n_jobs=4):
    """
    Main processing pipeline: load SMILES, repair valences in parallel,
    analyze the results, and write the output files.

    Parameters:
        input_path (str): path to the input SMILES file (one molecule per line)
        output_path (str): path of the output CSV file
        n_jobs (int): number of parallel worker processes
    """
    print(f"\n{' SIME大环内酯修复工具 ':=^50}\n")

    # Data loading: one SMILES per line, blank lines skipped.
    smi_list = [s.strip() for s in Path(input_path).read_text().splitlines() if s.strip()]
    print(f"✅ 已加载 {len(smi_list):,} 个分子")

    # Molecule repair, chunked and parallelized.
    fixed_smiles, statuses, messages = process_in_chunks(smi_list, n_jobs=n_jobs)

    # Assemble results for analysis and export.
    df = pd.DataFrame({
        'original_smiles': smi_list,
        'fixed_smiles': fixed_smiles,
        'status': statuses,
        'message': messages
    })

    # Also writes repair_statistics.png as a side effect.
    stats = analyze_results(df)

    # Save results and print a summary.
    df.to_csv(output_path, index=False)
    print(f"\n{' 修复结果统计 ':=^50}")
    print(f"总处理数: {stats['total_molecules']:,}")
    print(f"成功率: {stats['success_rate']:.2%}")
    print(f"\n输出文件已保存至: {output_path} 和 repair_statistics.png")
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point; see the usage examples in the string below.
    import argparse
    parser = argparse.ArgumentParser(description="SIME大环内酯修复工具")
    parser.add_argument('input', help="输入SMILES文件路径")
    parser.add_argument('-o', '--output', default="fixed_molecules.csv", help="输出CSV路径")
    parser.add_argument('-j', '--jobs', type=int, default=4, help="并行进程数")
    args = parser.parse_args()

    main(input_path=args.input, output_path=args.output, n_jobs=args.jobs)


'''
# 查看帮助
python simemacrocycle_repair.py -h

# 运行示例
python simemacrocycle_repair.py input.smi -o results.csv -j 8

python simemacrocycle_repair.py ../data/Macro16_SIME_Synthesis/2025-02-26-05-38-39_mcrl_1.smiles -o ../data/Macro16_SIME_Synthesis/fixed_macrolides_2025.csv -j 8
'''
|
||||
159
utils/smiles_svg_show.py
Normal file
159
utils/smiles_svg_show.py
Normal file
@@ -0,0 +1,159 @@
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from rdkit import Chem
|
||||
from rdkit.Chem.Draw import rdMolDraw2D
|
||||
import boto3
|
||||
|
||||
# Object-storage (S3-compatible) configuration — placeholders, edit as needed.
BUCKET_NAME = "{Your_Bucket_Name}"
ACCESS_KEY = "{Your_Access_Key}"
SECRET_KEY = "{Your_Secret_Key}"
ENDPOINT_URL = "{Your_Endpoint_Url}"
# Key prefix under which generated SVGs are stored in the bucket.
S3_SVG_PREFIX = "svg_outputs/"
|
||||
|
||||
# Render a molecule to SVG text, optionally highlighting atoms in red.
def mol_to_svg(mol, highlight_atoms=None, size=(400, 400)):
    """Return an SVG string for *mol*, with *highlight_atoms* drawn in red."""
    width, height = size
    drawer = rdMolDraw2D.MolDraw2DSVG(width, height)
    drawer.SetFontSize(6)
    # Atom indices are always drawn so error reports can be cross-referenced.
    drawer.drawOptions().addAtomIndices = True

    highlighted = highlight_atoms or []
    atom_colors = {idx: (1, 0, 0) for idx in highlighted}

    drawer.DrawMolecule(
        mol,
        highlightAtoms=highlighted,
        highlightAtomColors=atom_colors
    )
    drawer.FinishDrawing()
    return drawer.GetDrawingText()
|
||||
|
||||
# Upload to S3-compatible object storage.
# Replaces the original upload_svg_to_s3 return value with a public URL.
def upload_svg_to_s3(svg_content, object_name):
    """
    Upload *svg_content* to the configured bucket under *object_name*
    and return a public URL for it.

    NOTE(review): the r2.dev host below is hard-coded and account-specific;
    it is independent of ENDPOINT_URL — confirm it matches the target bucket.
    """
    session = boto3.session.Session(
        aws_access_key_id=ACCESS_KEY,
        aws_secret_access_key=SECRET_KEY,
    )
    s3 = session.resource('s3', endpoint_url=ENDPOINT_URL)
    obj = s3.Object(BUCKET_NAME, object_name)
    obj.put(Body=svg_content, ContentType='image/svg+xml')

    # Return the R2.dev public URL.
    return f"https://pub-389f446a01134875b8c7ced0572758de.r2.dev/{object_name}"
|
||||
|
||||
# Detect the first atom RDKit reports as having a valence error.
def find_valence_error_atom(mol):
    """Return the index of the valence-error atom, or None if *mol* sanitizes."""
    try:
        Chem.SanitizeMol(mol)
    except Chem.AtomValenceException as exc:
        # The offending atom index is embedded in the exception message.
        found = re.search(r'atom # (\d+)', str(exc))
        return int(found.group(1)) if found else None
    return None
|
||||
|
||||
# Helpers for writing/reading JSON files (UTF-8, human-readable).
def save_json(data, filename):
    """Write *data* to *filename* as pretty-printed, non-ASCII-preserving JSON."""
    payload = json.dumps(data, ensure_ascii=False, indent=2)
    Path(filename).write_text(payload, encoding='utf-8')
|
||||
|
||||
def load_json(filename):
    """Parse and return the JSON content of *filename* (read as UTF-8)."""
    raw = Path(filename).read_text(encoding='utf-8')
    return json.loads(raw)
|
||||
|
||||
# Collect detailed status information for a single atom.
def get_atom_status(mol, atom_idx):
    """
    Return a dict describing the valence-relevant state of atom *atom_idx*.

    Parameters:
        mol: RDKit molecule (may be unsanitized)
        atom_idx (int): index of the atom to inspect

    Returns:
        dict: degree, formal charge, radical electrons, implicit/explicit
        hydrogen counts, and per-bond connection details.
    """
    atom = mol.GetAtomWithIdx(atom_idx)
    # strict=False computes valence/implicit-H info without raising on
    # valence errors — this tool is often run on broken molecules.
    mol.UpdatePropertyCache(strict=False)
    connections = []
    for bond in atom.GetBonds():
        neighbor_idx = bond.GetOtherAtomIdx(atom_idx)
        connections.append({
            "connected_to": f"#{neighbor_idx} ({mol.GetAtomWithIdx(neighbor_idx).GetSymbol()})",
            "bond_type": str(bond.GetBondType())
        })

    return {
        "explicit_connections": atom.GetDegree(),
        "formal_charge": atom.GetFormalCharge(),
        "radical_electrons": atom.GetNumRadicalElectrons(),
        "implicit_hydrogens": atom.GetNumImplicitHs(),
        "explicit_hydrogens": atom.GetNumExplicitHs(),
        "connections_detail": connections
    }
|
||||
|
||||
# Entry point for the CLI.
def main():
    """
    CLI entry point: render a SMILES to SVG (with optional atom-index or
    SMARTS highlighting), report valence-error status, and save a JSON
    report next to an uploaded or locally saved SVG.
    """
    parser = argparse.ArgumentParser(description="Process SMILES and optionally highlight atoms using atom index or SMARTS pattern.")
    parser.add_argument('--smiles', type=str, required=True, help='SMILES string of molecule')
    parser.add_argument('--atom_idx', type=int, help='Atom index to highlight')
    parser.add_argument('--smarts', type=str, help='SMARTS pattern to highlight matched atoms')
    parser.add_argument('--output', type=str, default="output.json", help='Output JSON filename')
    parser.add_argument('--no_s3', action='store_true', help='Save SVG locally instead of S3')

    args = parser.parse_args()

    # Parse without sanitization so molecules with valence errors still render.
    mol = Chem.MolFromSmiles(args.smiles, sanitize=False)
    if mol is None:
        # Previously this fell through and crashed later with AttributeError.
        raise SystemExit(f"Error: could not parse SMILES: {args.smiles!r}")

    error_atom_idx = find_valence_error_atom(mol)
    atom_state_info = "OK" if error_atom_idx is None else f"Valence error at atom #{error_atom_idx}"

    highlight_atoms = set()

    if args.atom_idx is not None:
        highlight_atoms.add(args.atom_idx)

    if args.smarts:
        patt = Chem.MolFromSmarts(args.smarts)
        if patt is None:
            # Guard: GetSubstructMatches(None) would raise an opaque error.
            raise SystemExit(f"Error: invalid SMARTS pattern: {args.smarts!r}")
        matches = mol.GetSubstructMatches(patt)
        for match in matches:
            highlight_atoms.update(match)

    svg_str = mol_to_svg(mol, highlight_atoms=list(highlight_atoms))

    # Timestamped filename so repeated runs never overwrite each other.
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    svg_filename = f"molecule_{timestamp}.svg"

    output_path = Path(args.output)
    if not output_path.is_absolute():
        output_path = Path.cwd() / output_path

    if args.no_s3:
        # Local mode: write the SVG beside the JSON report.
        svg_path = output_path.parent / svg_filename
        svg_path.write_text(svg_str, encoding='utf-8')
        svg_location = str(svg_path)
    else:
        object_name = f"{S3_SVG_PREFIX}{svg_filename}"
        svg_location = upload_svg_to_s3(svg_str, object_name)

    output_data = {
        "atom_state": atom_state_info,
        "svg_url": svg_location,
        "svg_filename": svg_filename
    }

    # Per-atom detail is only meaningful when a specific atom was requested.
    if args.atom_idx is not None:
        output_data["atom_status_detail"] = get_atom_status(mol, args.atom_idx)

    save_json(output_data, output_path)

    print(f"Results saved to {output_path}")
|
||||
|
||||
# Run the CLI when executed as a script; usage examples follow below.
if __name__ == "__main__":
    main()


"""
# 自动修复键值错误
python smiles_svg_show.py --smiles "O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H](O)[C@H](C)O9)[C@@H](C)C[C@@H](C)C(=O)/C=C/[C@@H](CC)=C/[C@H](O[C@@H]9O[C@H](C)C[C@@H]([C@H]9O)N(C)C)[N@@](C)O1" --atom_idx 30

python smiles_svg_show.py --smiles "CCC1=C\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H](C)O2)[C@@H](CC=O)OC(=O)C[C@@H](O)C[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H](C)O2)[C@@H](C)C[C@@H](C)C(=O)\C=C\1" --atom_idx 30
# smarts 匹配,要求smiles正确
python smiles_svg_show.py --smiles "CCC1=C\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H](C)O2)[C@@H](CC=O)OC(=O)C[C@@H](O)C[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H](C)O2)[C@@H](C)C[C@@H](C)C(=O)\C=C\1" --smarts "[r16]([#8][#6](=[#8]))"
"""
|
||||
0
utils/split_multi.ipynb
Normal file → Executable file
0
utils/split_multi.ipynb
Normal file → Executable file
Reference in New Issue
Block a user