Files
macro_split/scripts/batch_process_ring16.py
2025-11-14 20:34:58 +08:00

260 lines
9.1 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
批量处理16元环大环内酯分子
处理文件: ring16/temp_filtered_complete.csv (1241个分子)
输出文件夹: output/ring16_fragments/
"""
import sys
from pathlib import Path
# 添加项目根目录到 Python 路径
# 获取脚本所在目录的父目录(项目根目录)
script_dir = Path(__file__).resolve().parent
project_root = script_dir.parent
sys.path.insert(0, str(project_root))
from rdkit import Chem
import pandas as pd
import json
from tqdm import tqdm
from datetime import datetime
from src.ring_numbering import (
assign_ring_numbering,
validate_numbering,
get_ring_atoms
)
from src.fragment_cleaver import cleave_side_chains
def log_message(message, log_file, log_type='info'):
    """Append a timestamped entry to a log file and echo the message to stdout.

    Args:
        message: Text to record. It is written to the file as
            ``[YYYY-MM-DD HH:MM:SS] message`` followed by a newline.
        log_file: Path of the log file; opened in append mode as UTF-8.
        log_type: Severity label (callers pass ``'info'`` or ``'error'``).
            Kept for interface compatibility; the console echo is the same
            for every value.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    with open(log_file, 'a', encoding='utf-8') as f:
        f.write(f"[{timestamp}] {message}\n")
    # The former if/else on log_type printed the identical string in both
    # branches, so a single unconditional echo is equivalent.
    print(f"{message}")
def _ring_lactone_carbons(mol, ring_atoms):
    """Return the sorted, unique indices of lactone carbonyl carbons in ``ring_atoms``.

    Args:
        mol: RDKit molecule to search.
        ring_atoms: Container of atom indices belonging to the macrocycle,
            as returned by ``get_ring_atoms``.

    Returns:
        Ascending list of atom indices; empty when nothing matches.
    """
    # SMARTS: an oxygen flagged r16, bonded to a carbon flagged r16 that also
    # carries a double-bonded oxygen. Index 1 of each match is that carbon.
    lactone_pattern = Chem.MolFromSmarts("[r16;#8][r16;#6](=[#8])")
    if lactone_pattern is None:
        return []
    # The set removes carbons reported by more than one match; sorting keeps
    # the logged index list deterministic.
    return sorted({
        match[1]
        for match in mol.GetSubstructMatches(lactone_pattern)
        if match[1] in ring_atoms
    })


def process_single_molecule(idx, row, output_base_dir, log_file, error_log_file, multiple_lactone_log):
    """Number, cleave and save the fragments of one molecule.

    Args:
        idx: Position of the molecule in the dataset; used to build the
            ``ring16_mol_{idx}`` parent id and in log messages.
        row: Mapping-like record providing the ``'IDs'``,
            ``'molecule_pref_name'`` and ``'smiles'`` fields.
        output_base_dir: ``Path`` under which a per-molecule folder is created.
        log_file: General processing log path (not written to by this function).
        error_log_file: Log path that receives failure messages.
        multiple_lactone_log: Log path that receives molecules skipped because
            more than one lactone carbonyl carbon was found on the ring.

    Returns:
        ``(num_fragments, 'success')`` when the molecule was processed, otherwise
        ``(None, status)`` where ``status`` is one of ``'parse_failed'``,
        ``'multiple_lactones'``, ``'numbering_failed'``, ``'validation_failed'``,
        ``'cleavage_failed'`` or ``'unexpected_error'``.
    """
    try:
        # Pull the molecule description out of the record.
        molecule_id = row['IDs']
        molecule_name = row['molecule_pref_name']
        smiles = row['smiles']
        parent_id = f"ring16_mol_{idx}"

        # Parse the SMILES string.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            log_message(f"分子 {idx} ({molecule_id}) SMILES解析失败", error_log_file, 'error')
            return None, 'parse_failed'

        # Skip molecules whose ring carries more than one lactone bond.
        from src.ring_numbering import get_ring_atoms
        ring_atoms = get_ring_atoms(mol)
        lactone_carbons = _ring_lactone_carbons(mol, ring_atoms) if len(ring_atoms) > 0 else []
        if len(lactone_carbons) > 1:
            log_message(
                f"分子 {idx} ({molecule_id}, {molecule_name}) 环上有{len(lactone_carbons)}个内酯键,已过滤。环上内酯碳索引: {lactone_carbons}",
                multiple_lactone_log,
                'info'
            )
            return None, 'multiple_lactones'

        # Assign ring position numbers.
        numbering = assign_ring_numbering(mol)
        if len(numbering) == 0:
            log_message(f"分子 {idx} ({molecule_id}) 编号失败", error_log_file, 'error')
            return None, 'numbering_failed'

        # Validate the numbering before using it.
        if not validate_numbering(mol, numbering):
            log_message(f"分子 {idx} ({molecule_id}) 编号验证失败", error_log_file, 'error')
            return None, 'validation_failed'

        # Cleave the side chains; failures here get their own status.
        try:
            mol_fragments = cleave_side_chains(mol, smiles, parent_id, numbering)
        except Exception as e:
            log_message(f"分子 {idx} ({molecule_id}) 裂解失败: {str(e)}", error_log_file, 'error')
            return None, 'cleavage_failed'

        # Create the folder dedicated to this molecule.
        mol_output_dir = output_base_dir / parent_id
        mol_output_dir.mkdir(parents=True, exist_ok=True)

        # Save the whole fragments object.
        mol_fragments_path = mol_output_dir / f"{parent_id}_all_fragments.json"
        mol_fragments.to_json_file(str(mol_fragments_path))

        # Save every fragment in its own file.
        for frag in mol_fragments.fragments:
            frag_path = mol_output_dir / f"{frag.fragment_id}.json"
            frag.to_json_file(str(frag_path))

        # Save the molecule description as metadata.
        metadata = {
            'parent_id': parent_id,
            'molecule_id': molecule_id,
            # A missing name (pandas NaN/None/NA) would be serialised as the
            # non-standard JSON token NaN; store it as null instead.
            'molecule_name': None if pd.isna(molecule_name) else molecule_name,
            'smiles': smiles,
            'ring_size': 16,
            'num_fragments': len(mol_fragments.fragments),
            'processing_date': datetime.now().isoformat()
        }
        metadata_path = mol_output_dir / 'metadata.json'
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)

        return len(mol_fragments.fragments), 'success'
    except Exception as e:
        # Batch boundary: one bad molecule must not abort the whole run.
        log_message(f"分子 {idx} 未预期错误: {str(e)}", error_log_file, 'error')
        return None, 'unexpected_error'
def main():
print("="*80)
print("16元环大环内酯批量处理程序")
print("="*80)
print()
# 读取数据
input_file = Path('ring16/temp_filtered_complete.csv')
if not input_file.exists():
print(f"❌ 错误: 找不到输入文件 {input_file}")
return
df = pd.read_csv(input_file)
print(f"✓ 读取数据集: {len(df)} 个分子")
print(f" 文件: {input_file}")
print()
# 创建输出文件夹
output_base_dir = Path('output/ring16_fragments')
output_base_dir.mkdir(parents=True, exist_ok=True)
# 创建日志文件
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = output_base_dir / f'processing_log_{timestamp}.txt'
error_log_file = output_base_dir / f'error_log_{timestamp}.txt'
multiple_lactone_log = output_base_dir / f'multiple_lactone_log_{timestamp}.txt'
print(f"✓ 输出文件夹: {output_base_dir}")
print(f"✓ 日志文件: {log_file}")
print()
# 统计信息
stats = {
'total': len(df),
'success': 0,
'failed_parse': 0,
'failed_numbering': 0,
'failed_validation': 0,
'failed_cleavage': 0,
'failed_unexpected': 0,
'multiple_lactones': 0,
'total_fragments': 0
}
# 批量处理
print("开始批量处理...")
print()
log_message(f"开始批量处理 {len(df)} 个16元环分子", log_file)
for idx in tqdm(range(len(df)), desc="处理进度"):
num_fragments, status = process_single_molecule(
idx, df.iloc[idx], output_base_dir, log_file, error_log_file, multiple_lactone_log
)
if status == 'success':
stats['success'] += 1
stats['total_fragments'] += num_fragments
elif status == 'parse_failed':
stats['failed_parse'] += 1
elif status == 'numbering_failed':
stats['failed_numbering'] += 1
elif status == 'validation_failed':
stats['failed_validation'] += 1
elif status == 'cleavage_failed':
stats['failed_cleavage'] += 1
elif status == 'multiple_lactones':
stats['multiple_lactones'] += 1
else:
stats['failed_unexpected'] += 1
print()
print("="*80)
print("处理完成!")
print("="*80)
print()
# 打印统计结果
print("统计结果:")
print(f" 总分子数: {stats['total']}")
print(f" 成功处理: {stats['success']} ({stats['success']/stats['total']*100:.2f}%)")
print(f" 总碎片数: {stats['total_fragments']}")
if stats['success'] > 0:
print(f" 平均碎片: {stats['total_fragments']/stats['success']:.2f} 个/分子")
print()
print("失败情况:")
print(f" SMILES解析失败: {stats['failed_parse']}")
print(f" 编号失败: {stats['failed_numbering']}")
print(f" 验证失败: {stats['failed_validation']}")
print(f" 裂解失败: {stats['failed_cleavage']}")
print(f" 多个内酯键: {stats['multiple_lactones']}")
print(f" 其他错误: {stats['failed_unexpected']}")
total_failed = stats['total'] - stats['success']
print(f" 总失败: {total_failed} ({total_failed/stats['total']*100:.2f}%)")
print()
# 保存统计结果
stats_file = output_base_dir / 'processing_statistics.json'
with open(stats_file, 'w', encoding='utf-8') as f:
json.dump(stats, f, indent=2)
print(f"✓ 统计结果已保存: {stats_file}")
print(f"✓ 输出文件夹: {output_base_dir}")
print()
log_message(f"处理完成: 成功 {stats['success']}/{stats['total']}", log_file)
# Run the batch job only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()