update
This commit is contained in:
278
utils/simemacrocycle_repair.py
Executable file
278
utils/simemacrocycle_repair.py
Executable file
@@ -0,0 +1,278 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- encoding: utf-8 -*-
|
||||
'''
|
||||
@file :simemacrocycle_repair.py
|
||||
@Description: :SIME工具生成的16元环大环内酯化合物的自动化价态修复工具
|
||||
@Date :2025/03/29 18:29:52
|
||||
@Author :lyzeng
|
||||
@Email :pylyzeng@gmail.com
|
||||
@version :1.0
|
||||
# 安装依赖
|
||||
pip install rdkit pandas swifter joblib tqdm matplotlib
|
||||
|
||||
# 运行脚本
|
||||
python simemacrocycle_repair.py
|
||||
'''
|
||||
|
||||
##############################
|
||||
# 模块说明
|
||||
##############################
|
||||
"""
|
||||
主要功能:
|
||||
1. 批量修复SIME生成的含双键16元环大环内酯的价态错误
|
||||
2. 自动检测并处理以下问题:
|
||||
- 碳原子显式价态超限(如5价碳)
|
||||
- 不合理的显式氢配置
|
||||
- 双键立体化学冲突
|
||||
3. 提供修复结果统计和可视化分析
|
||||
|
||||
输入输出:
|
||||
- 输入:包含SMILES的文本文件(每行一个分子)
|
||||
- 输出:
|
||||
- 修复后的CSV文件(含原始/修正SMILES和状态)
|
||||
- 修复统计图表(PNG)
|
||||
- 摘要报告(TXT)
|
||||
|
||||
依赖环境:
|
||||
- Python >= 3.7
|
||||
- RDKit >= 2022.03
|
||||
- pandas >= 1.3
|
||||
- swifter >= 1.3
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
import swifter
|
||||
from joblib import Parallel, delayed
|
||||
from tqdm.auto import tqdm
|
||||
import matplotlib.pyplot as plt
|
||||
from collections import Counter
|
||||
import re
|
||||
from rdkit import Chem
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore', category=UserWarning)
|
||||
|
||||
##############################
|
||||
# 核心修复函数
|
||||
##############################
|
||||
|
||||
def safe_sanitize(mol):
|
||||
"""
|
||||
安全标准化分子结构
|
||||
|
||||
Parameters:
|
||||
mol (rdkit.Chem.Mol): 待检测的分子对象
|
||||
|
||||
Returns:
|
||||
int/None: 返回错误原子索引(如存在价态错误),无错误则返回None
|
||||
"""
|
||||
try:
|
||||
Chem.SanitizeMol(mol)
|
||||
return None
|
||||
except Chem.AtomValenceException as e:
|
||||
match = re.search(r'atom # (\d+)', str(e))
|
||||
return int(match.group(1)) if match else None
|
||||
except:
|
||||
return None
|
||||
|
||||
|
||||
def fix_valence_error(smi):
|
||||
"""
|
||||
修复单个SMILES的价态错误
|
||||
|
||||
Parameters:
|
||||
smi (str): 输入SMILES字符串
|
||||
|
||||
Returns:
|
||||
tuple: (修正后SMILES, 状态, 描述信息)
|
||||
状态可能值:
|
||||
- 'valid': 原始有效
|
||||
- 'corrected': 修复成功
|
||||
- 'kekule': 需Kekule形式
|
||||
- 'failed': 修复失败
|
||||
"""
|
||||
try:
|
||||
mol = Chem.MolFromSmiles(smi, sanitize=False)
|
||||
if not mol:
|
||||
return smi, "invalid", "无法解析SMILES"
|
||||
|
||||
error_atom_idx = safe_sanitize(Chem.Mol(mol))
|
||||
if error_atom_idx is None:
|
||||
return Chem.MolToSmiles(mol), "valid", "原始有效"
|
||||
|
||||
rw_mol = Chem.RWMol(mol)
|
||||
atom = rw_mol.GetAtomWithIdx(error_atom_idx)
|
||||
|
||||
# 修复策略序列
|
||||
repair_actions = [
|
||||
lambda: atom.SetNumExplicitHs(0),
|
||||
lambda: (atom.SetFormalCharge(0), atom.SetNumRadicalElectrons(0)),
|
||||
lambda: rw_mol.RemoveBond(
|
||||
list(atom.GetBonds())[-1].GetBeginAtomIdx(),
|
||||
list(atom.GetBonds())[-1].GetEndAtomIdx()
|
||||
) if atom.GetDegree() > 1 else None
|
||||
]
|
||||
|
||||
for action in repair_actions:
|
||||
action()
|
||||
if safe_sanitize(rw_mol.GetMol()) is None:
|
||||
return Chem.MolToSmiles(rw_mol.GetMol()), "corrected", f"修复原子 {error_atom_idx}"
|
||||
|
||||
Chem.Kekulize(rw_mol)
|
||||
return Chem.MolToSmiles(rw_mol.GetMol(), kekuleSmiles=True), "kekule", "返回Kekule形式"
|
||||
|
||||
except Exception as e:
|
||||
return smi, "failed", str(e)
|
||||
|
||||
##############################
|
||||
# 并行处理模块
|
||||
##############################
|
||||
|
||||
def batch_process(smi_chunk):
|
||||
"""
|
||||
分块处理SMILES列表(兼容并行化)
|
||||
|
||||
Parameters:
|
||||
smi_chunk (list): SMILES字符串列表
|
||||
|
||||
Returns:
|
||||
list: 包含(修正SMILES, 状态, 信息)的元组列表
|
||||
"""
|
||||
return [fix_valence_error(smi) for smi in smi_chunk]
|
||||
|
||||
|
||||
def process_in_chunks(smi_list, chunk_size=50000, n_jobs=4):
|
||||
"""
|
||||
分块并行处理大规模SMILES数据
|
||||
|
||||
Parameters:
|
||||
smi_list (list): 原始SMILES列表
|
||||
chunk_size (int): 每块处理量
|
||||
n_jobs (int): 并行进程数
|
||||
|
||||
Returns:
|
||||
tuple: (修正SMILES列表, 状态列表, 信息列表)
|
||||
"""
|
||||
results = []
|
||||
for i in tqdm(range(0, len(smi_list), chunk_size),
|
||||
desc=f"Processing {len(smi_list):,} molecules"):
|
||||
chunk = smi_list[i:i + chunk_size]
|
||||
chunk_results = Parallel(n_jobs=n_jobs)(
|
||||
delayed(batch_process)(chunk[i:i+1000])
|
||||
for i in range(0, len(chunk), 1000)
|
||||
)
|
||||
results.extend([item for sublist in chunk_results for item in sublist])
|
||||
|
||||
return list(zip(*results)) if results else ([], [], [])
|
||||
|
||||
##############################
|
||||
# 统计分析模块
|
||||
##############################
|
||||
|
||||
def analyze_results(df):
|
||||
"""
|
||||
生成修复结果统计分析报告
|
||||
|
||||
Parameters:
|
||||
df (pd.DataFrame): 包含修复结果的DataFrame
|
||||
|
||||
Returns:
|
||||
dict: 包含关键统计指标的字典
|
||||
"""
|
||||
# 计算基本统计量
|
||||
stats = {
|
||||
'total_molecules': len(df),
|
||||
'valid_count': len(df[df['status'] == 'valid']),
|
||||
'corrected_count': len(df[df['status'] == 'corrected']),
|
||||
'kekule_count': len(df[df['status'] == 'kekule']),
|
||||
'failed_count': len(df[df['status'] == 'failed']),
|
||||
'success_rate': (len(df[df['status'].isin(['valid', 'corrected'])]) / len(df))
|
||||
}
|
||||
|
||||
# 错误分析(仅当存在失败时)
|
||||
if stats['failed_count'] > 0:
|
||||
stats['common_errors'] = dict(df[df['status'] == 'failed']['message'].value_counts().head(5))
|
||||
else:
|
||||
stats['common_errors'] = {}
|
||||
|
||||
# 可视化
|
||||
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
|
||||
|
||||
# 状态分布饼图
|
||||
status_counts = df['status'].value_counts()
|
||||
status_counts.plot.pie(ax=ax1, autopct='%1.1f%%', startangle=90)
|
||||
ax1.set_title('修复状态分布')
|
||||
|
||||
# 错误原因柱状图(仅当存在错误时)
|
||||
if stats['common_errors']:
|
||||
pd.Series(stats['common_errors']).plot.barh(ax=ax2)
|
||||
ax2.set_title('Top 5错误原因')
|
||||
else:
|
||||
ax2.axis('off')
|
||||
ax2.text(0.5, 0.5, '无修复失败记录',
|
||||
ha='center', va='center', fontsize=12)
|
||||
|
||||
plt.tight_layout()
|
||||
plt.savefig('repair_statistics.png', dpi=150)
|
||||
plt.close()
|
||||
|
||||
return stats
|
||||
|
||||
##############################
|
||||
# 主执行流程
|
||||
##############################
|
||||
|
||||
def main(input_path, output_path="fixed_molecules.csv", n_jobs=4):
|
||||
"""
|
||||
主处理流程
|
||||
|
||||
Parameters:
|
||||
input_path (str): 输入SMILES文件路径
|
||||
output_path (str): 输出CSV文件路径
|
||||
n_jobs (int): 并行进程数
|
||||
"""
|
||||
print(f"\n{' SIME大环内酯修复工具 ':=^50}\n")
|
||||
|
||||
# 数据加载
|
||||
smi_list = [s.strip() for s in Path(input_path).read_text().splitlines() if s.strip()]
|
||||
print(f"✅ 已加载 {len(smi_list):,} 个分子")
|
||||
|
||||
# 分子修复
|
||||
fixed_smiles, statuses, messages = process_in_chunks(smi_list, n_jobs=n_jobs)
|
||||
|
||||
# 结果分析
|
||||
df = pd.DataFrame({
|
||||
'original_smiles': smi_list,
|
||||
'fixed_smiles': fixed_smiles,
|
||||
'status': statuses,
|
||||
'message': messages
|
||||
})
|
||||
|
||||
stats = analyze_results(df)
|
||||
|
||||
# 结果保存
|
||||
df.to_csv(output_path, index=False)
|
||||
print(f"\n{' 修复结果统计 ':=^50}")
|
||||
print(f"总处理数: {stats['total_molecules']:,}")
|
||||
print(f"成功率: {stats['success_rate']:.2%}")
|
||||
print(f"\n输出文件已保存至: {output_path} 和 repair_statistics.png")
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser(description="SIME大环内酯修复工具")
|
||||
parser.add_argument('input', help="输入SMILES文件路径")
|
||||
parser.add_argument('-o', '--output', default="fixed_molecules.csv", help="输出CSV路径")
|
||||
parser.add_argument('-j', '--jobs', type=int, default=4, help="并行进程数")
|
||||
args = parser.parse_args()
|
||||
|
||||
main(input_path=args.input, output_path=args.output, n_jobs=args.jobs)
|
||||
|
||||
'''
|
||||
# 查看帮助
|
||||
python simemacrocycle_repair.py -h
|
||||
|
||||
# 运行示例
|
||||
python simemacrocycle_repair.py input.smi -o results.csv -j 8
|
||||
|
||||
python simemacrocycle_repair.py ../data/Macro16_SIME_Synthesis/2025-02-26-05-38-39_mcrl_1.smiles -o ../data/Macro16_SIME_Synthesis/fixed_macrolides_2025.csv -j 8
|
||||
'''
|
||||
Reference in New Issue
Block a user