feat(toolkit): ship macro_lactone_toolkit package

Unify macrolactone detection, numbering, fragmentation, and
splicing under the installable macro_lactone_toolkit package.

- replace legacy src.* modules with the new package layout
- add analyze/number/fragment CLI entrypoints and pixi tasks
- migrate tests, README, and scripts to the new package API
This commit is contained in:
2026-03-18 22:06:45 +08:00
parent a768d26e47
commit 5e7b236f31
45 changed files with 1302 additions and 6304 deletions

257
scripts/batch_process_ring16.py Executable file → Normal file
View File

@@ -1,259 +1,10 @@
#!/usr/bin/env python3
"""
批量处理16元环大环内酯分子
处理文件: ring16/temp_filtered_complete.csv (1241个分子)
输出文件夹: output/ring16_fragments/
"""
from __future__ import annotations
import sys
from pathlib import Path
# 添加项目根目录到 Python 路径
# 获取脚本所在目录的父目录(项目根目录)
script_dir = Path(__file__).resolve().parent
project_root = script_dir.parent
sys.path.insert(0, str(project_root))
from rdkit import Chem
import pandas as pd
import json
from tqdm import tqdm
from datetime import datetime
from src.ring_numbering import (
assign_ring_numbering,
validate_numbering,
get_ring_atoms
)
from src.fragment_cleaver import cleave_side_chains
from macro_lactone_toolkit.cli import main
def log_message(message: str, log_file, log_type: str = 'info') -> None:
    """Append a timestamped message to a log file and echo it to stdout.

    Args:
        message: Text to record.
        log_file: Path of the log file. It is opened in append mode with
            UTF-8 encoding, so previous entries are preserved.
        log_type: Severity label such as ``'info'`` or ``'error'``. It is
            accepted for backward compatibility; every severity is echoed
            to stdout in the same way.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    with open(log_file, 'a', encoding='utf-8') as f:
        f.write(f"[{timestamp}] {message}\n")
    # The previous if/else on log_type printed exactly the same text in both
    # branches, so a single print keeps the behaviour without the dead branch.
    print(message)
def _ring_lactone_carbons(mol, ring_atoms):
    """Return sorted indices of lactone carbonyl carbons found in ``ring_atoms``.

    A lactone is matched with the SMARTS ``[r16;#8][r16;#6](=[#8])``; the
    second atom of each match is the carbonyl carbon.
    """
    if len(ring_atoms) == 0:
        return []
    lactone_pattern = Chem.MolFromSmarts("[r16;#8][r16;#6](=[#8])")
    if lactone_pattern is None:
        return []
    ring_atom_set = set(ring_atoms)  # O(1) membership instead of scanning a list
    carbons = {
        match[1]
        for match in mol.GetSubstructMatches(lactone_pattern)
        if len(match) >= 2 and match[1] in ring_atom_set
    }
    # Sorted so the indices written to the log are in a deterministic order.
    return sorted(carbons)


def process_single_molecule(idx, row, output_base_dir, log_file, error_log_file, multiple_lactone_log):
    """Number, cleave and persist the fragments of one molecule.

    Args:
        idx: Position of the molecule in the dataset; used to build the
            ``ring16_mol_{idx}`` parent id and in log messages.
        row: Mapping-like record providing the ``'IDs'``,
            ``'molecule_pref_name'`` and ``'smiles'`` fields.
        output_base_dir: ``Path`` under which a per-molecule folder is created.
        log_file: General log file path (not written to by this function).
        error_log_file: Log file that receives failure messages.
        multiple_lactone_log: Log file that receives molecules skipped because
            more than one lactone carbonyl carbon was found on the ring.

    Returns:
        A ``(num_fragments, status)`` tuple. ``status`` is ``'success'`` with
        the fragment count, or one of ``'parse_failed'``,
        ``'multiple_lactones'``, ``'numbering_failed'``,
        ``'validation_failed'``, ``'cleavage_failed'``, ``'unexpected_error'``
        with ``None`` as the count.
    """
    try:
        # Read the molecule fields from the record.
        molecule_id = row['IDs']
        molecule_name = row['molecule_pref_name']
        smiles = row['smiles']
        parent_id = f"ring16_mol_{idx}"

        # Parse the SMILES string.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            log_message(f"分子 {idx} ({molecule_id}) SMILES解析失败", error_log_file, 'error')
            return None, 'parse_failed'

        # Skip molecules whose ring carries more than one lactone bond.
        lactone_carbons_on_ring = _ring_lactone_carbons(mol, get_ring_atoms(mol))
        if len(lactone_carbons_on_ring) > 1:
            log_message(
                f"分子 {idx} ({molecule_id}, {molecule_name}) 环上有{len(lactone_carbons_on_ring)}个内酯键,已过滤。环上内酯碳索引: {lactone_carbons_on_ring}",
                multiple_lactone_log,
                'info'
            )
            return None, 'multiple_lactones'

        # Assign ring numbering.
        numbering = assign_ring_numbering(mol)
        if len(numbering) == 0:
            log_message(f"分子 {idx} ({molecule_id}) 编号失败", error_log_file, 'error')
            return None, 'numbering_failed'

        # Validate the numbering.
        if not validate_numbering(mol, numbering):
            log_message(f"分子 {idx} ({molecule_id}) 编号验证失败", error_log_file, 'error')
            return None, 'validation_failed'

        # Cleave the side chains; a failure here gets its own status.
        try:
            mol_fragments = cleave_side_chains(mol, smiles, parent_id, numbering)
        except Exception as e:
            log_message(f"分子 {idx} ({molecule_id}) 裂解失败: {e}", error_log_file, 'error')
            return None, 'cleavage_failed'

        # Create the per-molecule output folder.
        mol_output_dir = output_base_dir / parent_id
        mol_output_dir.mkdir(parents=True, exist_ok=True)

        # Save the whole fragments object, then each fragment individually.
        mol_fragments_path = mol_output_dir / f"{parent_id}_all_fragments.json"
        mol_fragments.to_json_file(str(mol_fragments_path))
        for frag in mol_fragments.fragments:
            frag_path = mol_output_dir / f"{frag.fragment_id}.json"
            frag.to_json_file(str(frag_path))

        # Save the molecule information as metadata.
        metadata = {
            'parent_id': parent_id,
            'molecule_id': molecule_id,
            'molecule_name': molecule_name,
            'smiles': smiles,
            'ring_size': 16,
            'num_fragments': len(mol_fragments.fragments),
            'processing_date': datetime.now().isoformat()
        }
        metadata_path = mol_output_dir / 'metadata.json'
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)

        return len(mol_fragments.fragments), 'success'
    except Exception as e:
        # Per-molecule boundary: record the failure and let the batch continue.
        log_message(f"分子 {idx} 未预期错误: {e}", error_log_file, 'error')
        return None, 'unexpected_error'
def _percentage(part, total):
    """Return ``part`` as a percentage of ``total``; 0.0 when ``total`` is zero."""
    return part / total * 100 if total else 0.0


def main():
    """Batch-process the 16-membered-ring dataset and report statistics.

    Reads ``ring16/temp_filtered_complete.csv``, runs
    ``process_single_molecule`` on every row, writes per-run log files and
    ``processing_statistics.json`` under ``output/ring16_fragments`` and
    prints a summary. Returns ``None``; if the input file is missing an
    error is printed and nothing else is done.
    """
    print("="*80)
    print("16元环大环内酯批量处理程序")
    print("="*80)
    print()

    # Load the dataset.
    input_file = Path('ring16/temp_filtered_complete.csv')
    if not input_file.exists():
        print(f"❌ 错误: 找不到输入文件 {input_file}")
        return
    df = pd.read_csv(input_file)
    print(f"✓ 读取数据集: {len(df)} 个分子")
    print(f" 文件: {input_file}")
    print()

    # Create the output folder.
    output_base_dir = Path('output/ring16_fragments')
    output_base_dir.mkdir(parents=True, exist_ok=True)

    # Create the per-run log file paths.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = output_base_dir / f'processing_log_{timestamp}.txt'
    error_log_file = output_base_dir / f'error_log_{timestamp}.txt'
    multiple_lactone_log = output_base_dir / f'multiple_lactone_log_{timestamp}.txt'
    print(f"✓ 输出文件夹: {output_base_dir}")
    print(f"✓ 日志文件: {log_file}")
    print()

    # Counters for the run summary.
    stats = {
        'total': len(df),
        'success': 0,
        'failed_parse': 0,
        'failed_numbering': 0,
        'failed_validation': 0,
        'failed_cleavage': 0,
        'failed_unexpected': 0,
        'multiple_lactones': 0,
        'total_fragments': 0
    }
    # Status returned by process_single_molecule -> counter to increment.
    # Any status not listed here is counted as 'failed_unexpected'.
    status_to_key = {
        'success': 'success',
        'parse_failed': 'failed_parse',
        'numbering_failed': 'failed_numbering',
        'validation_failed': 'failed_validation',
        'cleavage_failed': 'failed_cleavage',
        'multiple_lactones': 'multiple_lactones',
    }

    # Batch processing.
    print("开始批量处理...")
    print()
    log_message(f"开始批量处理 {len(df)} 个16元环分子", log_file)
    for idx in tqdm(range(len(df)), desc="处理进度"):
        num_fragments, status = process_single_molecule(
            idx, df.iloc[idx], output_base_dir, log_file, error_log_file, multiple_lactone_log
        )
        stats[status_to_key.get(status, 'failed_unexpected')] += 1
        if status == 'success':
            stats['total_fragments'] += num_fragments

    print()
    print("="*80)
    print("处理完成!")
    print("="*80)
    print()

    # Print the summary. Percentages go through _percentage so that a
    # dataset with zero rows does not raise ZeroDivisionError.
    print("统计结果:")
    print(f" 总分子数: {stats['total']}")
    print(f" 成功处理: {stats['success']} ({_percentage(stats['success'], stats['total']):.2f}%)")
    print(f" 总碎片数: {stats['total_fragments']}")
    if stats['success'] > 0:
        print(f" 平均碎片: {stats['total_fragments']/stats['success']:.2f} 个/分子")
    print()
    print("失败情况:")
    print(f" SMILES解析失败: {stats['failed_parse']}")
    print(f" 编号失败: {stats['failed_numbering']}")
    print(f" 验证失败: {stats['failed_validation']}")
    print(f" 裂解失败: {stats['failed_cleavage']}")
    print(f" 多个内酯键: {stats['multiple_lactones']}")
    print(f" 其他错误: {stats['failed_unexpected']}")
    total_failed = stats['total'] - stats['success']
    print(f" 总失败: {total_failed} ({_percentage(total_failed, stats['total']):.2f}%)")
    print()

    # Save the statistics.
    stats_file = output_base_dir / 'processing_statistics.json'
    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2)
    print(f"✓ 统计结果已保存: {stats_file}")
    print(f"✓ 输出文件夹: {output_base_dir}")
    print()
    log_message(f"处理完成: 成功 {stats['success']}/{stats['total']}", log_file)
# Script entry point: runs only when the file is executed directly.
# NOTE(review): two `__main__` guards appear back to back below. This looks
# like the pre-change and post-change guard lines of a diff shown together
# with their indentation stripped -- confirm that only one should remain.
if __name__ == '__main__':
if __name__ == "__main__":
# Replace argv with the program name "macro-lactone-toolkit", the "fragment"
# subcommand and "--ring-size 16", followed by the arguments the user passed.
# Presumably this is read by `main` imported from macro_lactone_toolkit.cli;
# verify which `main` is bound at this point.
sys.argv = ["macro-lactone-toolkit", "fragment", "--ring-size", "16", *sys.argv[1:]]
main()