update

2025-10-16 17:26:35 +08:00
parent b1d437a06d
commit ea218a3a39
49 changed files with 694742 additions and 2 deletions
--- a/utils/simemacrocycle_repair.py
+++ b/utils/simemacrocycle_repair.py
@@ -0,0 +1,278 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@file               :simemacrocycle_repair.py
+@Description:       :SIME工具生成的16元环大环内酯化合物的自动化价态修复工具
+@Date               :2025/03/29 18:29:52
+@Author             :lyzeng
+@Email              :pylyzeng@gmail.com
+@version            :1.0
+# 安装依赖
+pip install rdkit pandas swifter joblib tqdm matplotlib
+
+# 运行脚本
+python simemacrocycle_repair.py
+'''
+
+##############################
+# 模块说明
+##############################
+"""
+主要功能：
+1. 批量修复SIME生成的含双键16元环大环内酯的价态错误
+2. 自动检测并处理以下问题：
+   - 碳原子显式价态超限（如5价碳）
+   - 不合理的显式氢配置
+   - 双键立体化学冲突
+3. 提供修复结果统计和可视化分析
+
+输入输出：
+- 输入：包含SMILES的文本文件（每行一个分子）
+- 输出：
+  - 修复后的CSV文件（含原始/修正SMILES和状态）
+  - 修复统计图表（PNG）
+  - 摘要报告（TXT）
+
+依赖环境：
+- Python >= 3.7
+- RDKit >= 2022.03
+- pandas >= 1.3
+- swifter >= 1.3
+"""
+
+import pandas as pd
+from pathlib import Path
+import swifter
+from joblib import Parallel, delayed
+from tqdm.auto import tqdm
+import matplotlib.pyplot as plt
+from collections import Counter
+import re
+from rdkit import Chem
+import warnings
+warnings.filterwarnings('ignore', category=UserWarning)
+
+##############################
+# 核心修复函数
+##############################
+
+def safe_sanitize(mol):
+    """
+    安全标准化分子结构
+    
+    Parameters:
+        mol (rdkit.Chem.Mol): 待检测的分子对象
+    
+    Returns:
+        int/None: 返回错误原子索引（如存在价态错误），无错误则返回None
+    """
+    try:
+        Chem.SanitizeMol(mol)
+        return None
+    except Chem.AtomValenceException as e:
+        match = re.search(r'atom # (\d+)', str(e))
+        return int(match.group(1)) if match else None
+    except:
+        return None
+
+
+def fix_valence_error(smi):
+    """
+    修复单个SMILES的价态错误
+    
+    Parameters:
+        smi (str): 输入SMILES字符串
+    
+    Returns:
+        tuple: (修正后SMILES, 状态, 描述信息)
+        状态可能值:
+        - 'valid': 原始有效
+        - 'corrected': 修复成功
+        - 'kekule': 需Kekule形式
+        - 'failed': 修复失败
+    """
+    try:
+        mol = Chem.MolFromSmiles(smi, sanitize=False)
+        if not mol:
+            return smi, "invalid", "无法解析SMILES"
+        
+        error_atom_idx = safe_sanitize(Chem.Mol(mol))
+        if error_atom_idx is None:
+            return Chem.MolToSmiles(mol), "valid", "原始有效"
+        
+        rw_mol = Chem.RWMol(mol)
+        atom = rw_mol.GetAtomWithIdx(error_atom_idx)
+        
+        # 修复策略序列
+        repair_actions = [
+            lambda: atom.SetNumExplicitHs(0),
+            lambda: (atom.SetFormalCharge(0), atom.SetNumRadicalElectrons(0)),
+            lambda: rw_mol.RemoveBond(
+                list(atom.GetBonds())[-1].GetBeginAtomIdx(),
+                list(atom.GetBonds())[-1].GetEndAtomIdx()
+            ) if atom.GetDegree() > 1 else None
+        ]
+        
+        for action in repair_actions:
+            action()
+            if safe_sanitize(rw_mol.GetMol()) is None:
+                return Chem.MolToSmiles(rw_mol.GetMol()), "corrected", f"修复原子 {error_atom_idx}"
+        
+        Chem.Kekulize(rw_mol)
+        return Chem.MolToSmiles(rw_mol.GetMol(), kekuleSmiles=True), "kekule", "返回Kekule形式"
+    
+    except Exception as e:
+        return smi, "failed", str(e)
+
+##############################
+# 并行处理模块
+##############################
+
+def batch_process(smi_chunk):
+    """
+    分块处理SMILES列表（兼容并行化）
+    
+    Parameters:
+        smi_chunk (list): SMILES字符串列表
+    
+    Returns:
+        list: 包含(修正SMILES, 状态, 信息)的元组列表
+    """
+    return [fix_valence_error(smi) for smi in smi_chunk]
+
+
+def process_in_chunks(smi_list, chunk_size=50000, n_jobs=4):
+    """
+    分块并行处理大规模SMILES数据
+    
+    Parameters:
+        smi_list (list): 原始SMILES列表
+        chunk_size (int): 每块处理量
+        n_jobs (int): 并行进程数
+    
+    Returns:
+        tuple: (修正SMILES列表, 状态列表, 信息列表)
+    """
+    results = []
+    for i in tqdm(range(0, len(smi_list), chunk_size), 
+                 desc=f"Processing {len(smi_list):,} molecules"):
+        chunk = smi_list[i:i + chunk_size]
+        chunk_results = Parallel(n_jobs=n_jobs)(
+            delayed(batch_process)(chunk[i:i+1000]) 
+            for i in range(0, len(chunk), 1000)
+        )
+        results.extend([item for sublist in chunk_results for item in sublist])
+    
+    return list(zip(*results)) if results else ([], [], [])
+
+##############################
+# 统计分析模块
+##############################
+
+def analyze_results(df):
+    """
+    生成修复结果统计分析报告
+    
+    Parameters:
+        df (pd.DataFrame): 包含修复结果的DataFrame
+    
+    Returns:
+        dict: 包含关键统计指标的字典
+    """
+    # 计算基本统计量
+    stats = {
+        'total_molecules': len(df),
+        'valid_count': len(df[df['status'] == 'valid']),
+        'corrected_count': len(df[df['status'] == 'corrected']),
+        'kekule_count': len(df[df['status'] == 'kekule']),
+        'failed_count': len(df[df['status'] == 'failed']),
+        'success_rate': (len(df[df['status'].isin(['valid', 'corrected'])]) / len(df))
+    }
+    
+    # 错误分析（仅当存在失败时）
+    if stats['failed_count'] > 0:
+        stats['common_errors'] = dict(df[df['status'] == 'failed']['message'].value_counts().head(5))
+    else:
+        stats['common_errors'] = {}
+    
+    # 可视化
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
+    
+    # 状态分布饼图
+    status_counts = df['status'].value_counts()
+    status_counts.plot.pie(ax=ax1, autopct='%1.1f%%', startangle=90)
+    ax1.set_title('修复状态分布')
+    
+    # 错误原因柱状图（仅当存在错误时）
+    if stats['common_errors']:
+        pd.Series(stats['common_errors']).plot.barh(ax=ax2)
+        ax2.set_title('Top 5错误原因')
+    else:
+        ax2.axis('off')
+        ax2.text(0.5, 0.5, '无修复失败记录', 
+                ha='center', va='center', fontsize=12)
+    
+    plt.tight_layout()
+    plt.savefig('repair_statistics.png', dpi=150)
+    plt.close()
+    
+    return stats
+
+##############################
+# 主执行流程
+##############################
+
+def main(input_path, output_path="fixed_molecules.csv", n_jobs=4):
+    """
+    主处理流程
+    
+    Parameters:
+        input_path (str): 输入SMILES文件路径
+        output_path (str): 输出CSV文件路径
+        n_jobs (int): 并行进程数
+    """
+    print(f"\n{' SIME大环内酯修复工具 ':=^50}\n")
+    
+    # 数据加载
+    smi_list = [s.strip() for s in Path(input_path).read_text().splitlines() if s.strip()]
+    print(f"✅ 已加载 {len(smi_list):,} 个分子")
+    
+    # 分子修复
+    fixed_smiles, statuses, messages = process_in_chunks(smi_list, n_jobs=n_jobs)
+    
+    # 结果分析
+    df = pd.DataFrame({
+        'original_smiles': smi_list,
+        'fixed_smiles': fixed_smiles,
+        'status': statuses,
+        'message': messages
+    })
+    
+    stats = analyze_results(df)
+    
+    # 结果保存
+    df.to_csv(output_path, index=False)
+    print(f"\n{' 修复结果统计 ':=^50}")
+    print(f"总处理数: {stats['total_molecules']:,}")
+    print(f"成功率: {stats['success_rate']:.2%}")
+    print(f"\n输出文件已保存至: {output_path} 和 repair_statistics.png")
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description="SIME大环内酯修复工具")
+    parser.add_argument('input', help="输入SMILES文件路径")
+    parser.add_argument('-o', '--output', default="fixed_molecules.csv", help="输出CSV路径")
+    parser.add_argument('-j', '--jobs', type=int, default=4, help="并行进程数")
+    args = parser.parse_args()
+    
+    main(input_path=args.input, output_path=args.output, n_jobs=args.jobs)
+
+'''
+# 查看帮助
+python simemacrocycle_repair.py -h
+
+# 运行示例
+python simemacrocycle_repair.py input.smi -o results.csv -j 8
+
+python simemacrocycle_repair.py ../data/Macro16_SIME_Synthesis/2025-02-26-05-38-39_mcrl_1.smiles -o ../data/Macro16_SIME_Synthesis/fixed_macrolides_2025.csv -j 8
+'''