This commit is contained in:
mm644706215
2025-10-16 17:26:35 +08:00
parent b1d437a06d
commit ea218a3a39
49 changed files with 694742 additions and 2 deletions

4
.gitignore vendored Normal file → Executable file

@@ -1,3 +1,5 @@
uploads/
LIBRARIES/
*.pyc
Data/fragment/Frags-Enamine-18M.csv
Data/fragment/GDB11-27M.csv

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -0,0 +1,3 @@
website: https://macrolact.collaborationspharma.com/
The results filtered through the MacrolactoneDB page are not accurate: I restricted the search to 16-membered rings, but the downloaded CSV still contains 14-membered results, so the data must be re-filtered manually.
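The manual re-filtering can be sketched with RDKit; the CSV filename and the `SMILES` column name below are assumptions about the download format:

```python
from rdkit import Chem

def max_ring_size(smiles):
    """Largest ring size in the molecule, or 0 if the SMILES cannot be parsed."""
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return 0
    rings = mol.GetRingInfo().AtomRings()
    return max((len(r) for r in rings), default=0)

# import pandas as pd
# df = pd.read_csv("macrolactonedb_download.csv")      # hypothetical filename
# df16 = df[df["SMILES"].apply(max_ring_size) == 16]   # keep true 16-membered rings
# df16.to_csv("macrolactonedb_ring16.csv", index=False)
```

Checking the largest ring (rather than any ring) avoids discarding 16-membered macrolactones that also carry 5- or 6-membered sugar rings.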


@@ -0,0 +1,3 @@
Your Filtered Macrolactone Database
11036 compounds have been filtered from MacrolactoneDB based on your specified inputs.


@@ -0,0 +1,89 @@
Target Organisms
Homo sapiens 815
Homo sapiens, None 180
Plasmodium falciparum 161
Hepatitis C virus, None 112
Homo sapiens, Plasmodium falciparum 63
Oryctolagus cuniculus 62
Mus musculus 60
Toxoplasma gondii 39
Homo sapiens, Rattus norvegicus 27
Mus musculus, Homo sapiens 24
None, Rattus norvegicus 23
Human immunodeficiency virus 1 20
Hepatitis C virus 18
Rattus norvegicus 17
Homo sapiens, Sus scrofa 11
Homo sapiens, Chlorocebus aethiops 10
Serratia marcescens 9
Escherichia coli 8
Oryctolagus cuniculus, Homo sapiens 7
Streptococcus pneumoniae 6
Oryctolagus cuniculus, Staphylococcus aureus, Raoultella planticola, Bacillus subtilis, Mus musculus, Micrococcus luteus, None, Escherichia coli, Plasmodium falciparum, Streptococcus pneumoniae, Homo sapiens, Escherichia coli K-12, Toxoplasma gondii 6
Plasmodium falciparum K1 5
Bacillus anthracis 5
Mus musculus, Homo sapiens, None 5
Bacillus anthracis, Homo sapiens 4
Candida albicans, Cryptococcus neoformans, Aspergillus fumigatus 4
Mus musculus, None 4
Plasmodium falciparum, Homo sapiens, None 4
None, Homo sapiens, Plasmodium falciparum 3
Bacillus subtilis, Homo sapiens 3
Oryctolagus cuniculus, Homo sapiens, None 3
Sus scrofa, Mus musculus, None, Plasmodium falciparum, Homo sapiens, Rattus norvegicus 2
Homo sapiens, None, Rattus norvegicus 2
Cryptococcus neoformans 2
Homo sapiens, None, Chlorocebus aethiops 2
Staphylococcus aureus 2
Candida albicans, Cryptococcus neoformans, Mycobacterium intracellulare, Aspergillus fumigatus 2
Mus musculus, None, Human immunodeficiency virus 1 2
Escherichia coli (strain K12) 2
Plasmodium falciparum 3D7, Homo sapiens 2
Aspergillus fumigatus 1
Sus scrofa 1
Saccharomyces cerevisiae S288c, Human immunodeficiency virus 1, Human herpesvirus 1, Plasmodium falciparum, None, Homo sapiens, Rattus norvegicus 1
Hepatitis C virus, Homo sapiens, None 1
Plasmodium falciparum 3D7 1
Bacillus subtilis 1
Mus musculus, Homo sapiens, None, Saccharomyces cerevisiae 1
Chlorocebus aethiops 1
Homo sapiens, Escherichia coli K-12, None 1
Hepatitis C virus, Homo sapiens, None, Rattus norvegicus 1
None, Homo sapiens, Human herpesvirus 1 1
Homo sapiens, None, Trypanosoma brucei brucei 1
Homo sapiens, None, Cryptococcus neoformans 1
Homo sapiens, Rattus norvegicus, Human immunodeficiency virus 1 1
None, Plasmodium falciparum, Escherichia coli, Streptococcus pneumoniae, Naegleria fowleri, Homo sapiens, Streptococcus, Toxoplasma gondii 1
Giardia intestinalis, Trypanosoma cruzi, Equus caballus, Bos taurus, Mus musculus, None, Plasmodium falciparum, Chlorocebus aethiops, Homo sapiens 1
Plasmodium falciparum NF54, Trypanosoma cruzi, Trypanosoma brucei rhodesiense, Rattus norvegicus 1
None, Homo sapiens, Plasmodium falciparum K1, Plasmodium falciparum 1
Saccharomyces cerevisiae S288c, Homo sapiens, None, Saccharomyces cerevisiae, Phytophthora sojae 1
Bacillus subtilis, Homo sapiens, Schistosoma mansoni, Saccharomyces cerevisiae, Giardia intestinalis 1
Streptococcus, Homo sapiens, None 1
Mus musculus, Homo sapiens, Rattus norvegicus 1
Homo sapiens, Spinacia oleracea 1
Human immunodeficiency virus 1, Mus musculus, None, Hepatitis C virus, Homo sapiens, Rattus norvegicus 1
None, Plasmodium falciparum, Trypanosoma brucei rhodesiense 1
Hepatitis C virus, None, Rattus norvegicus 1
Homo sapiens, Equus caballus 1
Plasmodium falciparum NF54, Trypanosoma cruzi, Trypanosoma brucei rhodesiense 1
Schistosoma mansoni, Influenza A virus 1
Leishmania chagasi, Trypanosoma cruzi 1
Candida albicans, Cryptococcus neoformans 1
None, Plasmodium falciparum 1
Caenorhabditis elegans 1
Bos taurus, Sus scrofa 1
Plasmodium falciparum, Enterococcus faecium 1
Homo sapiens, Gallus gallus 1
Homo sapiens, Escherichia coli 1
Plasmodium falciparum, Homo sapiens, None, Rattus norvegicus, Schistosoma mansoni 1
Homo sapiens, None, Influenza A virus 1
Mycobacterium tuberculosis, None 1
Escherichia coli, Homo sapiens, Toxoplasma gondii, None, Streptococcus pneumoniae 1
Bacillus subtilis, Oryctolagus cuniculus, Homo sapiens, Schistosoma mansoni, Giardia intestinalis 1
Homo sapiens, None, Rattus norvegicus, Escherichia coli O157:H7 1
Giardia intestinalis, Schistosoma mansoni, Mus musculus, None, Homo sapiens, Saccharomyces cerevisiae 1
Trypanosoma cruzi 1
Influenza A virus 1
Escherichia coli K-12 1
Human herpesvirus 4 (strain B95-8) 1

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown (new image, 199 KiB).

File diff suppressed because one or more lines are too long

289579
Data/MacrolactoneDB/ring16/temp.sdf Executable file

File diff suppressed because one or more lines are too long

0
Data/ery_core.txt Normal file → Executable file

36
Data/fragment/README.md Normal file

@@ -0,0 +1,36 @@
## Screening data from the [Cell](https://www.cell.com/cell/abstract/S0092-8674(25)00855-4) paper
## Data input: raw fragment libraries
Frags-Enamine-18M.csv: 18M fragments from the Enamine REAL database (SMILES must be extracted)
GDB11-27M.csv: 27M fragments from the GDB-11 database (SMILES must be extracted)
Download: [Zenodo link](https://zenodo.org/records/15191826)
## Screening logic from the paper (targeting Neisseria gonorrhoeae)
1. Data input (raw fragment libraries)
File sources:
Frags-Enamine-18M.csv: 18M fragments from the Enamine REAL database (SMILES must be extracted)
GDB11-27M.csv: 27M fragments from the GDB-11 database (SMILES must be extracted)
2. Model prediction (pretrained Chemprop models)
Model purpose:
Use pretrained Chemprop models to predict each fragment's antibacterial activity against Neisseria gonorrhoeae or Staphylococcus aureus (scores range 0-1).
Model rationale:
Chemprop is based on a graph neural network (GNN) trained on large compound libraries (e.g. 38,765 compounds from the Broad Institute) and achieves high accuracy on structure-activity relationships.
The paper validates the model's predictions on known antibiotic fragments (see Figure S1A), supporting its reliability.
3. Multi-criteria filtering
The screening applies the following conditions (to be implemented in code):
1. Activity threshold:
GDB fragments: predicted score > 0.05;
Enamine fragments: predicted score > 0.1 (better synthesizability).
2. Toxicity filter:
Remove fragments whose predicted score > 0.5 under the pretrained HepG2, HSkMC, and IMR-90 cytotoxicity models.
3. Structure filter:
Exclude fragments containing PAINS/Brenk substructures (prone to false positives or metabolic instability);
Tanimoto similarity < 0.5 to the 559 known antibiotics (to ensure structural novelty).
4. Output
1,156,945 fragments were finally obtained (Neisseria gonorrhoeae-targeted), stored in the supplementary data or the Zenodo repository.
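Step 3 above (the structure filter) can be sketched with RDKit. The PAINS/Brenk catalogs come from `rdkit.Chem.FilterCatalog`; `known_fps` stands in for fingerprints of the 559 known antibiotics, and the Morgan fingerprint radius/size are assumptions, not values from the paper:

```python
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem.FilterCatalog import FilterCatalog, FilterCatalogParams

# Combined PAINS + Brenk substructure catalog
params = FilterCatalogParams()
params.AddCatalog(FilterCatalogParams.FilterCatalogs.PAINS)
params.AddCatalog(FilterCatalogParams.FilterCatalogs.BRENK)
catalog = FilterCatalog(params)

def passes_structure_filter(smiles, known_fps, sim_cutoff=0.5):
    """True if the fragment has no PAINS/Brenk hit and its max Tanimoto
    similarity to the known antibiotics is below sim_cutoff (novelty)."""
    mol = Chem.MolFromSmiles(smiles)
    if mol is None or catalog.HasMatch(mol):
        return False
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, 2048)
    sims = DataStructs.BulkTanimotoSimilarity(fp, known_fps)
    return max(sims, default=0.0) < sim_cutoff
```

A fragment identical to a reference compound is rejected (similarity 1.0), while a structurally unrelated, catalog-clean fragment passes.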

0
Data/image.png Normal file → Executable file (image, 94 KiB)

9
Data/my_sugars.txt Executable file

@@ -0,0 +1,9 @@
[*R*][C@@H](O[C@@H]1O[C@H](C)[C@@H](O[C@@H]2O[C@H](C)[C@@H](O)[C@@](O)(C)C2)[C@H](N(C)C)[C@H]1O)[*R*]
[*R*][C@@H](CO[C@@H]1O[C@H](C)[C@@H](O)[C@@H](OC)[C@H]1OC)[*R*]
[*R*][C@H](O[C@H]9C[C@@](C)(OC)[C@@H](O)[C@H](C)O9)[*R*]
[*R*][C@H](O[C@@H]9O[C@H](C)C[C@@H]([C@H]9O)N(C)C)[*R*]
[*R*][C@H](O[C@@H]9O[C@H](C)C[C@@H]([C@H]9OC(C)=O)N(C)C)[*R*]
[*R*][C@H](O[C@H]9C[C@H](OC)O[C@@H](C)[C@@H]9OC(C)=O)[*R*]
[*R*][C@H](O[C@H]9C[C@H](OC)[C@@H](O)[C@H](C)O9)[*R*]
[*R*][C@H](O[C@H]9C[C@@H](O)[C@H](O)[C@@H](C)O9)[*R*]
[*R*][C@H](O[C@@H]9O[C@H](C)C[C@H](NC)[C@H]9O)[*R*]

0
Data/selected_extenders.txt Normal file → Executable file

2
Data/split_position.md Executable file

@@ -0,0 +1,2 @@
Bond 31: 17(C) -> 32(O), bond type: SINGLE
Bond 6: 6(C) -> 7(C), bond type: SINGLE

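Entries like the two lines above can be reproduced by enumerating bonds with RDKit; the molecule used here is only a placeholder:

```python
from rdkit import Chem

def describe_bonds(smiles):
    """One description line per bond, in the split_position.md format."""
    mol = Chem.MolFromSmiles(smiles)
    lines = []
    for bond in mol.GetBonds():
        a, b = bond.GetBeginAtom(), bond.GetEndAtom()
        lines.append(f"Bond {bond.GetIdx()}: {a.GetIdx()}({a.GetSymbol()}) -> "
                     f"{b.GetIdx()}({b.GetSymbol()}), bond type: {bond.GetBondType()}")
    return lines

for line in describe_bonds("CC(=O)O"):  # placeholder molecule (acetic acid)
    print(line)
```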
0
Data/sugars Normal file → Executable file

0
README.md Normal file → Executable file

0
SIME.py Normal file → Executable file

0
V1B_full_database.txt Normal file → Executable file

0
analyse/selected_extenders.txt Normal file → Executable file

0
analyse/sugars Normal file → Executable file

0
analyse/sync_cmd.md Normal file → Executable file

0
analyse/tylo_core.txt Normal file → Executable file

0
docker/Dockerfile Normal file → Executable file

0
docker/SIME.def Normal file → Executable file

0
docker/docker-compose.yml Normal file → Executable file

0
macro_example/tylo_core.txt Normal file → Executable file

0
macro_example/tylo_core_draft.txt Normal file → Executable file

0
main.py Normal file → Executable file

0
requirements.txt Normal file → Executable file

0
static/styles/main_page.css Normal file → Executable file

0
templates/index.html Normal file → Executable file

0
templates/success.html Normal file → Executable file

376866
test/SIME-MacroValidator.ipynb Normal file

File diff suppressed because one or more lines are too long

20
test/SIME_chemplot_tSNE.ipynb Normal file → Executable file

@@ -1,5 +1,12 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"micromamba create -n qsar -c rdkit -c mordred-descriptor -c conda-forge rdkit numpy mordred scikit-learn pandas matplotlib padelpy fuzzywuzzy optuna hydra-core ipykernel loguru ipython joblib openbabel mopac rdkit jupyter ipykernel chemplot joblib -y"
]
},
{
"cell_type": "code",
"execution_count": 1,
@@ -94,7 +101,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -6091,6 +6098,17 @@
"[15:20:17] Explicit valence for atom # 29 C, 5, is greater than permitted\n",
"[15:20:17] Explicit valence for atom # 29 C, 5, is greater than permitted\n"
]
},
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell.\n",
"\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure.\n",
"\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info.\n",
"\u001b[1;31mView the Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
]
}
],
"source": [

6409
test/SIME_chemplot_tSNE100w.ipynb Executable file

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

88
test/macrocycles_core.ipynb Executable file

@@ -0,0 +1,88 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'H': 1, 'F': 1, 'Cl': 1, 'Br': 1, 'I': 1, 'B': 3, 'B+1': 2, 'B-1': 4, 'O': 2, 'O+1': 3, 'O-1': 1, 'N': 3, 'N+1': 4, 'N-1': 2, 'C': 4, 'C+1': 5, 'C-1': 3, 'P': 5, 'P+1': 6, 'P-1': 4, 'S': 6, 'S+1': 7, 'S-1': 5, '?': 8}\n"
]
}
],
"source": [
"import selfies as sf\n",
"\n",
"# Get the default semantic constraints dictionary\n",
"constraints = sf.get_preset_constraints(\"default\")\n",
"print(constraints)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import selfies as sf\n",
"new_constraints = sf.get_preset_constraints(\"default\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"sf.set_semantic_constraints(new_constraints)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"smiles_dataset = [\"COC\", \"FCF\", \"O=O\", \"O=Cc1ccccc1\"]\n",
"selfies_dataset = list(map(sf.encoder, smiles_dataset))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"alphabet = sf.get_alphabet_from_selfies(selfies_dataset)\n",
"alphabet.add(\"[nop]\")\n",
"\n",
"alphabet = list(sorted(alphabet))\n",
"alphabet"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "frage",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

483
test/test.ipynb Executable file

File diff suppressed because one or more lines are too long

668
test/tutorial.ipynb Executable file

@@ -0,0 +1,668 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tutorial"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "tM3wFk1e_COd",
"tags": []
},
"source": [
"## The Basics\n",
"We begin by importing `selfies`. "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 89
},
"colab_type": "code",
"id": "GH0DQxBN_Fei",
"outputId": "56aa043e-df48-4081-f938-49711a166d33"
},
"outputs": [],
"source": [
"import selfies as sf"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First, let's try translating between SMILES and SELFIES - as an example, we will use benzaldehyde. To translate from SMILES to SELFIES, use the `selfies.encoder` function, and to translate from SMILES back to SELFIES, use the `selfies.decoder` function."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"original_smiles = \"O=Cc1ccccc1\" # benzaldehyde\n",
"\n",
"try:\n",
" encoded_selfies = sf.encoder(original_smiles) # SMILES -> SELFIES\n",
" decoded_smiles = sf.decoder(encoded_selfies) # SELFIES -> SMILES\n",
"except sf.EncoderError as err: \n",
" pass # sf.encoder error...\n",
"except sf.DecoderError as err: \n",
" pass # sf.decoder error..."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'[O][=C][C][=C][C][=C][C][=C][Ring1][=Branch1]'"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"encoded_selfies"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'O=CC1=CC=CC=C1'"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"decoded_smiles"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Ng8PmMiB_RvJ"
},
"source": [
"Note that `original_smiles` and `decoded_smiles` are different strings, but they both represent benzaldehyde. Thus, when comparing the two SMILES strings, string equality should _not_ be used. Instead, use RDKit to check whether the SMILES strings represent the same molecule."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"id": "iAc5FVrP_XV6",
"outputId": "b503f896-a2a0-46a6-fc5b-9c474c01ba62"
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from rdkit import Chem\n",
"\n",
"Chem.CanonSmiles(original_smiles) == Chem.CanonSmiles(decoded_smiles)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "IKfNr5m6_h4f",
"tags": []
},
"source": [
"## Customizing SELFIES\n",
"The SELFIES grammar is derived dynamically from a set of semantic constraints, which assign bonding capacities to various atoms. Let's customize the semantic constraints that `selfies` operates on. By default, the following constraints are used:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 200
},
"colab_type": "code",
"id": "Xmce7wvV_t4Y",
"outputId": "8b10af2f-486e-4910-8a71-055b59a09746"
},
"outputs": [
{
"data": {
"text/plain": [
"{'H': 1,\n",
" 'F': 1,\n",
" 'Cl': 1,\n",
" 'Br': 1,\n",
" 'I': 1,\n",
" 'O': 2,\n",
" 'O+1': 3,\n",
" 'O-1': 1,\n",
" 'N': 3,\n",
" 'N+1': 4,\n",
" 'N-1': 2,\n",
" 'C': 4,\n",
" 'C+1': 5,\n",
" 'C-1': 3,\n",
" 'P': 5,\n",
" 'P+1': 6,\n",
" 'P-1': 4,\n",
" 'S': 6,\n",
" 'S+1': 7,\n",
" 'S-1': 5,\n",
" '?': 8}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sf.get_preset_constraints(\"default\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"These constraints map atoms (the keys) to their bonding capacities (the values). The special `?` key maps to the bonding capacity for all atoms that are not explicitly listed in the constraints. For example, S and Li are constrained to a maximum of 6 and 8 bonds, respectively. Every SELFIES string can be decoded into a molecule that obeys the current constraints."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'[Li]=CCS=CC#S'"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sf.decoder(\"[Li][=C][C][S][=C][C][#S]\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "KevVGyIEAVlu"
},
"source": [
"But suppose that we instead wanted to constrain S and Li to a maximum of 2 and 1 bond(s), respectively. To do so, we create a new set of constraints, and tell `selfies` to operate on them using `selfies.set_semantic_constraints`."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "y5EbmzkKATkD"
},
"outputs": [],
"source": [
"new_constraints = sf.get_preset_constraints(\"default\")\n",
"new_constraints['Li'] = 1\n",
"new_constraints['S'] = 2\n",
"\n",
"sf.set_semantic_constraints(new_constraints)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To check that the update was successful, we can use `selfies.get_semantic_constraints`, which returns the semantic constraints that `selfies` is currently operating on."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'H': 1,\n",
" 'F': 1,\n",
" 'Cl': 1,\n",
" 'Br': 1,\n",
" 'I': 1,\n",
" 'O': 2,\n",
" 'O+1': 3,\n",
" 'O-1': 1,\n",
" 'N': 3,\n",
" 'N+1': 4,\n",
" 'N-1': 2,\n",
" 'C': 4,\n",
" 'C+1': 5,\n",
" 'C-1': 3,\n",
" 'P': 5,\n",
" 'P+1': 6,\n",
" 'P-1': 4,\n",
" 'S': 2,\n",
" 'S+1': 7,\n",
" 'S-1': 5,\n",
" '?': 8,\n",
" 'Li': 1}"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sf.get_semantic_constraints()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "e_djGmr_AvM7"
},
"source": [
"Our previous SELFIES string is now decoded like so. Notice that the specified bonding capacities are met, with every S and Li making only 2 and 1 bonds, respectively."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "TCzbjZMAAxpo"
},
"outputs": [
{
"data": {
"text/plain": [
"'[Li]CCSCC=S'"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sf.decoder(\"[Li][=C][C][S][=C][C][#S]\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Ng1Lr_e6A3cB"
},
"source": [
"Finally, to revert back to the default constraints, simply call: "
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "zwC00Rx5A6eQ"
},
"outputs": [],
"source": [
"sf.set_semantic_constraints()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
" Please refer to the API reference for more details and more preset constraints.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## SELFIES in Practice \n",
"\n",
"Let's use a simple example to show how `selfies` can be used in practice, as well as highlight some convenient utility functions from the library. We start with a toy dataset of SMILES strings. As before, we can use `selfies.encoder` to convert the dataset into SELFIES form."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['[C][O][C]',\n",
" '[F][C][F]',\n",
" '[O][=O]',\n",
" '[O][=C][C][=C][C][=C][C][=C][Ring1][=Branch1]']"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"smiles_dataset = [\"COC\", \"FCF\", \"O=O\", \"O=Cc1ccccc1\"]\n",
"selfies_dataset = list(map(sf.encoder, smiles_dataset))\n",
"\n",
"selfies_dataset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The function `selfies.len_selfies` computes the symbol length of a SELFIES string. We can use it to find the maximum symbol length of the SELFIES strings in the dataset. "
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"10"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"max_len = max(sf.len_selfies(s) for s in selfies_dataset)\n",
"max_len"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To extract the SELFIES symbols that form the dataset, use `selfies.get_alphabet_from_selfies`. Here, we add `[nop]` to the alphabet, which is a special padding character that `selfies` recognizes."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['[=Branch1]', '[=C]', '[=O]', '[C]', '[F]', '[O]', '[Ring1]', '[nop]']"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"alphabet = sf.get_alphabet_from_selfies(selfies_dataset)\n",
"alphabet.add(\"[nop]\")\n",
"\n",
"alphabet = list(sorted(alphabet))\n",
"alphabet"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Then, create a mapping between the alphabet SELFIES symbols and indices."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'[=Branch1]': 0,\n",
" '[=C]': 1,\n",
" '[=O]': 2,\n",
" '[C]': 3,\n",
" '[F]': 4,\n",
" '[O]': 5,\n",
" '[Ring1]': 6,\n",
" '[nop]': 7}"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab_stoi = {symbol: idx for idx, symbol in enumerate(alphabet)}\n",
"vocab_itos = {idx: symbol for symbol, idx in vocab_stoi.items()}\n",
"\n",
"vocab_stoi"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"SELFIES provides some convenience methods to convert between SELFIES strings and label (integer) and one-hot encodings. Using the first entry of the dataset (dimethyl ether) as an example:"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"dimethyl_ether = selfies_dataset[0]\n",
"label, one_hot = sf.selfies_to_encoding(dimethyl_ether, vocab_stoi, pad_to_len=max_len)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[3, 5, 3, 7, 7, 7, 7, 7, 7, 7]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"label"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[[0, 0, 0, 1, 0, 0, 0, 0],\n",
" [0, 0, 0, 0, 0, 1, 0, 0],\n",
" [0, 0, 0, 1, 0, 0, 0, 0],\n",
" [0, 0, 0, 0, 0, 0, 0, 1],\n",
" [0, 0, 0, 0, 0, 0, 0, 1],\n",
" [0, 0, 0, 0, 0, 0, 0, 1],\n",
" [0, 0, 0, 0, 0, 0, 0, 1],\n",
" [0, 0, 0, 0, 0, 0, 0, 1],\n",
" [0, 0, 0, 0, 0, 0, 0, 1],\n",
" [0, 0, 0, 0, 0, 0, 0, 1]]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"one_hot"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'[C][O][C][nop][nop][nop][nop][nop][nop][nop]'"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dimethyl_ether = sf.encoding_to_selfies(one_hot, vocab_itos, enc_type=\"one_hot\")\n",
"dimethyl_ether"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'COC'"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sf.decoder(dimethyl_ether) # sf.decoder ignores [nop]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If different encoding strategies are desired, `selfies.split_selfies` can be used to tokenize a SELFIES string into its individual symbols."
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['[C]', '[O]', '[C]']"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(sf.split_selfies(\"[C][O][C]\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Please refer to the API reference for more details and utility functions."
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "selfies_example.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

0
utils/atom_show.ipynb Normal file → Executable file

0
utils/bond_show.ipynb Normal file → Executable file

0
utils/generate.ipynb Normal file → Executable file

278
utils/simemacrocycle_repair.py Executable file

@@ -0,0 +1,278 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file        : simemacrocycle_repair.py
@Description : Automated valence-repair tool for the 16-membered macrolactone compounds generated by the SIME tool
@Date        : 2025/03/29 18:29:52
@Author      : lyzeng
@Email       : pylyzeng@gmail.com
@version     : 1.0
# Install dependencies
pip install rdkit pandas swifter joblib tqdm matplotlib
# Run the script
python simemacrocycle_repair.py
'''
##############################
# Module description
##############################
"""
Main features:
1. Batch-repair valence errors in SIME-generated 16-membered macrolactones containing double bonds
2. Automatically detect and handle:
   - Carbon atoms exceeding their explicit valence limit (e.g. pentavalent carbon)
   - Unreasonable explicit-hydrogen configurations
   - Double-bond stereochemistry conflicts
3. Provide repair statistics and visual analysis
Input/output:
- Input: a text file of SMILES, one molecule per line
- Output:
  - A repaired CSV file (original/corrected SMILES and status)
  - A repair-statistics chart (PNG)
  - A summary report (TXT)
Dependencies:
- Python >= 3.7
- RDKit >= 2022.03
- pandas >= 1.3
- swifter >= 1.3
"""
import pandas as pd
from pathlib import Path
import swifter
from joblib import Parallel, delayed
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from collections import Counter
import re
from rdkit import Chem
import warnings
warnings.filterwarnings('ignore', category=UserWarning)
##############################
# Core repair functions
##############################
def safe_sanitize(mol):
    """
    Safely sanitize a molecule.
    Parameters:
        mol (rdkit.Chem.Mol): molecule to check
    Returns:
        int/None: index of the offending atom if a valence error exists, else None
    """
    try:
        Chem.SanitizeMol(mol)
        return None
    except Chem.AtomValenceException as e:
        match = re.search(r'atom # (\d+)', str(e))
        return int(match.group(1)) if match else None
    except:
        return None
def fix_valence_error(smi):
    """
    Repair valence errors in a single SMILES.
    Parameters:
        smi (str): input SMILES string
    Returns:
        tuple: (corrected SMILES, status, message)
        Possible status values:
        - 'valid': original already valid
        - 'invalid': SMILES could not be parsed
        - 'corrected': repair succeeded
        - 'kekule': Kekule form required
        - 'failed': repair failed
    """
    try:
        mol = Chem.MolFromSmiles(smi, sanitize=False)
        if not mol:
            return smi, "invalid", "cannot parse SMILES"
        error_atom_idx = safe_sanitize(Chem.Mol(mol))
        if error_atom_idx is None:
            return Chem.MolToSmiles(mol), "valid", "original valid"
        rw_mol = Chem.RWMol(mol)
        atom = rw_mol.GetAtomWithIdx(error_atom_idx)
        # Sequence of repair strategies, tried in order
        repair_actions = [
            lambda: atom.SetNumExplicitHs(0),
            lambda: (atom.SetFormalCharge(0), atom.SetNumRadicalElectrons(0)),
            lambda: rw_mol.RemoveBond(
                list(atom.GetBonds())[-1].GetBeginAtomIdx(),
                list(atom.GetBonds())[-1].GetEndAtomIdx()
            ) if atom.GetDegree() > 1 else None
        ]
        for action in repair_actions:
            action()
            if safe_sanitize(rw_mol.GetMol()) is None:
                return Chem.MolToSmiles(rw_mol.GetMol()), "corrected", f"repaired atom {error_atom_idx}"
        Chem.Kekulize(rw_mol)
        return Chem.MolToSmiles(rw_mol.GetMol(), kekuleSmiles=True), "kekule", "returned Kekule form"
    except Exception as e:
        return smi, "failed", str(e)
##############################
# Parallel processing
##############################
def batch_process(smi_chunk):
    """
    Process a chunk of SMILES (parallelization-friendly).
    Parameters:
        smi_chunk (list): list of SMILES strings
    Returns:
        list: list of (corrected SMILES, status, message) tuples
    """
    return [fix_valence_error(smi) for smi in smi_chunk]

def process_in_chunks(smi_list, chunk_size=50000, n_jobs=4):
    """
    Process large SMILES datasets in parallel, chunk by chunk.
    Parameters:
        smi_list (list): raw SMILES list
        chunk_size (int): molecules per chunk
        n_jobs (int): number of parallel processes
    Returns:
        tuple: (corrected SMILES list, status list, message list)
    """
    results = []
    for i in tqdm(range(0, len(smi_list), chunk_size),
                  desc=f"Processing {len(smi_list):,} molecules"):
        chunk = smi_list[i:i + chunk_size]
        chunk_results = Parallel(n_jobs=n_jobs)(
            delayed(batch_process)(chunk[j:j + 1000])
            for j in range(0, len(chunk), 1000)
        )
        results.extend([item for sublist in chunk_results for item in sublist])
    return list(zip(*results)) if results else ([], [], [])
##############################
# Statistics and analysis
##############################
def analyze_results(df):
    """
    Generate a statistical report of the repair results.
    Parameters:
        df (pd.DataFrame): DataFrame with repair results
    Returns:
        dict: key statistics
    """
    # Basic statistics
    stats = {
        'total_molecules': len(df),
        'valid_count': len(df[df['status'] == 'valid']),
        'corrected_count': len(df[df['status'] == 'corrected']),
        'kekule_count': len(df[df['status'] == 'kekule']),
        'failed_count': len(df[df['status'] == 'failed']),
        'success_rate': (len(df[df['status'].isin(['valid', 'corrected'])]) / len(df))
    }
    # Error analysis (only when failures exist)
    if stats['failed_count'] > 0:
        stats['common_errors'] = dict(df[df['status'] == 'failed']['message'].value_counts().head(5))
    else:
        stats['common_errors'] = {}
    # Visualization
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
    # Pie chart of status distribution
    status_counts = df['status'].value_counts()
    status_counts.plot.pie(ax=ax1, autopct='%1.1f%%', startangle=90)
    ax1.set_title('Repair status distribution')
    # Bar chart of error causes (only when errors exist)
    if stats['common_errors']:
        pd.Series(stats['common_errors']).plot.barh(ax=ax2)
        ax2.set_title('Top 5 error causes')
    else:
        ax2.axis('off')
        ax2.text(0.5, 0.5, 'No failed repairs',
                 ha='center', va='center', fontsize=12)
    plt.tight_layout()
    plt.savefig('repair_statistics.png', dpi=150)
    plt.close()
    return stats
##############################
# Main pipeline
##############################
def main(input_path, output_path="fixed_molecules.csv", n_jobs=4):
    """
    Main processing pipeline.
    Parameters:
        input_path (str): path to the input SMILES file
        output_path (str): path to the output CSV file
        n_jobs (int): number of parallel processes
    """
    print(f"\n{' SIME Macrolactone Repair Tool ':=^50}\n")
    # Load data
    smi_list = [s.strip() for s in Path(input_path).read_text().splitlines() if s.strip()]
    print(f"✅ Loaded {len(smi_list):,} molecules")
    # Repair molecules
    fixed_smiles, statuses, messages = process_in_chunks(smi_list, n_jobs=n_jobs)
    # Analyze results
    df = pd.DataFrame({
        'original_smiles': smi_list,
        'fixed_smiles': fixed_smiles,
        'status': statuses,
        'message': messages
    })
    stats = analyze_results(df)
    # Save results
    df.to_csv(output_path, index=False)
    print(f"\n{' Repair statistics ':=^50}")
    print(f"Total processed: {stats['total_molecules']:,}")
    print(f"Success rate: {stats['success_rate']:.2%}")
    print(f"\nOutput saved to: {output_path} and repair_statistics.png")

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="SIME macrolactone repair tool")
    parser.add_argument('input', help="path to the input SMILES file")
    parser.add_argument('-o', '--output', default="fixed_molecules.csv", help="output CSV path")
    parser.add_argument('-j', '--jobs', type=int, default=4, help="number of parallel processes")
    args = parser.parse_args()
    main(input_path=args.input, output_path=args.output, n_jobs=args.jobs)
'''
# Show help
python simemacrocycle_repair.py -h
# Example runs
python simemacrocycle_repair.py input.smi -o results.csv -j 8
python simemacrocycle_repair.py ../data/Macro16_SIME_Synthesis/2025-02-26-05-38-39_mcrl_1.smiles -o ../data/Macro16_SIME_Synthesis/fixed_macrolides_2025.csv -j 8
'''

159
utils/smiles_svg_show.py Normal file

@@ -0,0 +1,159 @@
import argparse
import json
import re
from datetime import datetime
from pathlib import Path
from rdkit import Chem
from rdkit.Chem.Draw import rdMolDraw2D
import boto3
# Object storage configuration (edit as needed)
BUCKET_NAME = "{Your_Bucket_Name}"
ACCESS_KEY = "{Your_Access_Key}"
SECRET_KEY = "{Your_Secret_Key}"
ENDPOINT_URL = "{Your_Endpoint_Url}"
S3_SVG_PREFIX = "svg_outputs/"
# Generate an SVG image with optional atom highlighting
def mol_to_svg(mol, highlight_atoms=None, size=(400, 400)):
    drawer = rdMolDraw2D.MolDraw2DSVG(size[0], size[1])
    drawer.SetFontSize(6)
    opts = drawer.drawOptions()
    opts.addAtomIndices = True
    atom_colors = {}
    if highlight_atoms:
        for idx in highlight_atoms:
            atom_colors[idx] = (1, 0, 0)
    drawer.DrawMolecule(
        mol,
        highlightAtoms=highlight_atoms or [],
        highlightAtomColors=atom_colors
    )
    drawer.FinishDrawing()
    return drawer.GetDrawingText()
# Upload to S3-compatible object storage
# (replaces the return value of the original upload_svg_to_s3)
def upload_svg_to_s3(svg_content, object_name):
    session = boto3.session.Session(
        aws_access_key_id=ACCESS_KEY,
        aws_secret_access_key=SECRET_KEY,
    )
    s3 = session.resource('s3', endpoint_url=ENDPOINT_URL)
    obj = s3.Object(BUCKET_NAME, object_name)
    obj.put(Body=svg_content, ContentType='image/svg+xml')
    # Return the R2.dev public URL
    return f"https://pub-389f446a01134875b8c7ced0572758de.r2.dev/{object_name}"
# Detect atoms with valence errors
def find_valence_error_atom(mol):
    try:
        Chem.SanitizeMol(mol)
        return None
    except Chem.AtomValenceException as e:
        match = re.search(r'atom # (\d+)', str(e))
        if match:
            return int(match.group(1))
        return None
# Helpers to save and load JSON
def save_json(data, filename):
    Path(filename).write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding='utf-8')

def load_json(filename):
    return json.loads(Path(filename).read_text(encoding='utf-8'))
# Get detailed status information for an atom
def get_atom_status(mol, atom_idx):
    atom = mol.GetAtomWithIdx(atom_idx)
    mol.UpdatePropertyCache(strict=False)
    connections = []
    for bond in atom.GetBonds():
        neighbor_idx = bond.GetOtherAtomIdx(atom_idx)
        connections.append({
            "connected_to": f"#{neighbor_idx} ({mol.GetAtomWithIdx(neighbor_idx).GetSymbol()})",
            "bond_type": str(bond.GetBondType())
        })
    return {
        "explicit_connections": atom.GetDegree(),
        "formal_charge": atom.GetFormalCharge(),
        "radical_electrons": atom.GetNumRadicalElectrons(),
        "implicit_hydrogens": atom.GetNumImplicitHs(),
        "explicit_hydrogens": atom.GetNumExplicitHs(),
        "connections_detail": connections
    }
# Main program
def main():
    parser = argparse.ArgumentParser(description="Process SMILES and optionally highlight atoms using atom index or SMARTS pattern.")
    parser.add_argument('--smiles', type=str, required=True, help='SMILES string of molecule')
    parser.add_argument('--atom_idx', type=int, help='Atom index to highlight')
    parser.add_argument('--smarts', type=str, help='SMARTS pattern to highlight matched atoms')
    parser.add_argument('--output', type=str, default="output.json", help='Output JSON filename')
    parser.add_argument('--no_s3', action='store_true', help='Save SVG locally instead of S3')
    args = parser.parse_args()
    mol = Chem.MolFromSmiles(args.smiles, sanitize=False)
    # Chem.SanitizeMol(mol)  # manual sanitization
    # Chem.MolToSmiles(mol)  # canonical=True by default
    error_atom_idx = find_valence_error_atom(mol)
    atom_state_info = "OK" if error_atom_idx is None else f"Valence error at atom #{error_atom_idx}"
    highlight_atoms = set()
    if args.atom_idx is not None:
        highlight_atoms.add(args.atom_idx)
    if args.smarts:
        patt = Chem.MolFromSmarts(args.smarts)
        matches = mol.GetSubstructMatches(patt)
        for match in matches:
            highlight_atoms.update(match)
    svg_str = mol_to_svg(mol, highlight_atoms=list(highlight_atoms))
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    svg_filename = f"molecule_{timestamp}.svg"
    output_path = Path(args.output)
    if not output_path.is_absolute():
        output_path = Path.cwd() / output_path
    if args.no_s3:
        svg_path = output_path.parent / svg_filename
        svg_path.write_text(svg_str, encoding='utf-8')
        svg_location = str(svg_path)
    else:
        object_name = f"{S3_SVG_PREFIX}{svg_filename}"
        svg_location = upload_svg_to_s3(svg_str, object_name)
    output_data = {
        "atom_state": atom_state_info,
        "svg_url": svg_location,
        "svg_filename": svg_filename
    }
    if args.atom_idx is not None:
        output_data["atom_status_detail"] = get_atom_status(mol, args.atom_idx)
    save_json(output_data, output_path)
    print(f"Results saved to {output_path}")

if __name__ == "__main__":
    main()
"""
# Inspect a valence error automatically (highlight the offending atom)
python smiles_svg_show.py --smiles "O=C1C[C@@H](O)C[C@H](O[C@H]9C[C@@](C)(OC)[C@@H](O)[C@H](C)O9)[C@@H](C)C[C@@H](C)C(=O)/C=C/[C@@H](CC)=C/[C@H](O[C@@H]9O[C@H](C)C[C@@H]([C@H]9O)N(C)C)[N@@](C)O1" --atom_idx 30
python smiles_svg_show.py --smiles "CCC1=C\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H](C)O2)[C@@H](CC=O)OC(=O)C[C@@H](O)C[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H](C)O2)[C@@H](C)C[C@@H](C)C(=O)\C=C\1" --atom_idx 30
# SMARTS matching requires a valid SMILES
python smiles_svg_show.py --smiles "CCC1=C\[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H](C)O2)[C@@H](CC=O)OC(=O)C[C@@H](O)C[C@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H](C)O2)[C@@H](C)C[C@@H](C)C(=O)\C=C\1" --smarts "[r16]([#8][#6](=[#8]))"
"""

0
utils/split_multi.ipynb Normal file → Executable file