first add

2025-11-14 20:34:58 +08:00
commit 0d99f7d12c
46 changed files with 698209 additions and 0 deletions
--- a/scripts/batch_process_ring16.py
+++ b/scripts/batch_process_ring16.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+"""
+批量处理16元环大环内酯分子
+处理文件: ring16/temp_filtered_complete.csv (1241个分子)
+输出文件夹: output/ring16_fragments/
+"""
+
+import sys
+from pathlib import Path
+
+# 添加项目根目录到 Python 路径
+# 获取脚本所在目录的父目录（项目根目录）
+script_dir = Path(__file__).resolve().parent
+project_root = script_dir.parent
+sys.path.insert(0, str(project_root))
+
+from rdkit import Chem
+import pandas as pd
+import json
+from tqdm import tqdm
+from datetime import datetime
+
+from src.ring_numbering import (
+    assign_ring_numbering,
+    validate_numbering,
+    get_ring_atoms
+)
+from src.fragment_cleaver import cleave_side_chains
+
+
+def log_message(message, log_file, log_type='info'):
+    """Append a timestamped entry to a log file and echo the message to stdout.
+
+    Args:
+        message: Text to record.
+        log_file: Path of the log file; opened in append mode with UTF-8 encoding.
+        log_type: ``'error'`` echoes the message with a cross marker; any other
+            value echoes it with a check mark.
+    """
+    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+
+    with open(log_file, 'a', encoding='utf-8') as handle:
+        handle.write(f"[{stamp}] {message}\n")
+
+    # The console echo omits the timestamp; only the file entry carries it.
+    print(f"❌ {message}" if log_type == 'error' else f"✓ {message}")
+
+
+# SMARTS: a ring oxygen bonded to a ring carbon that carries a double-bonded
+# oxygen; both ring atoms carry the r16 ring-size primitive. Compiled once at
+# import time because the pattern is identical for every molecule.
+_RING16_LACTONE_PATTERN = Chem.MolFromSmarts("[r16;#8][r16;#6](=[#8])")
+
+
+def _ring_lactone_carbons(mol, ring_atoms):
+    """Return sorted, de-duplicated carbonyl-carbon indices of lactone matches in ring_atoms."""
+    return sorted({
+        match[1]  # SMARTS atom order: ring O, carbonyl C, carbonyl O
+        for match in mol.GetSubstructMatches(_RING16_LACTONE_PATTERN)
+        if match[1] in ring_atoms
+    })
+
+
+def _none_if_missing(value):
+    """Map a pandas missing value (NaN/None/NA) to None; return anything else unchanged."""
+    return None if pd.isna(value) else value
+
+
+def process_single_molecule(idx, row, output_base_dir, log_file, error_log_file, multiple_lactone_log):
+    """Fragment one molecule record and write its results under output_base_dir.
+
+    Steps: parse the SMILES, reject molecules with more than one lactone
+    carbonyl carbon on the ring, assign and validate ring numbering, cleave the
+    side chains, then write the fragment JSON files and a ``metadata.json``
+    into ``output_base_dir / f"ring16_mol_{idx}"``.
+
+    Args:
+        idx: Position of the record; used to build the ``ring16_mol_{idx}`` id.
+        row: Mapping-like record providing ``'IDs'``, ``'molecule_pref_name'``
+            and ``'smiles'``.
+        output_base_dir: ``pathlib.Path`` under which the per-molecule folder
+            is created.
+        log_file: Accepted for interface compatibility; this function does not
+            write to it.
+        error_log_file: Log file that receives every failure message.
+        multiple_lactone_log: Log file that receives molecules filtered out for
+            having several ring lactones.
+
+    Returns:
+        ``(fragment_count, 'success')`` on success, otherwise ``(None, status)``
+        where status is one of ``'parse_failed'``, ``'multiple_lactones'``,
+        ``'numbering_failed'``, ``'validation_failed'``, ``'cleavage_failed'``
+        or ``'unexpected_error'``. Exceptions are logged, never propagated.
+    """
+    try:
+        molecule_id = row['IDs']
+        molecule_name = row['molecule_pref_name']
+        smiles = row['smiles']
+        parent_id = f"ring16_mol_{idx}"
+
+        mol = Chem.MolFromSmiles(smiles)
+        if mol is None:
+            log_message(f"分子 {idx} ({molecule_id}) SMILES解析失败", error_log_file, 'error')
+            return None, 'parse_failed'
+
+        # Filter only when the ring itself carries more than one lactone;
+        # get_ring_atoms comes from the file-level import.
+        ring_atoms = get_ring_atoms(mol)
+        if len(ring_atoms) > 0:
+            lactone_carbons = _ring_lactone_carbons(mol, ring_atoms)
+            if len(lactone_carbons) > 1:
+                log_message(
+                    f"分子 {idx} ({molecule_id}, {molecule_name}) 环上有{len(lactone_carbons)}个内酯键，已过滤。环上内酯碳索引: {lactone_carbons}",
+                    multiple_lactone_log,
+                    'info'
+                )
+                return None, 'multiple_lactones'
+
+        numbering = assign_ring_numbering(mol)
+        if len(numbering) == 0:
+            log_message(f"分子 {idx} ({molecule_id}) 编号失败", error_log_file, 'error')
+            return None, 'numbering_failed'
+
+        if not validate_numbering(mol, numbering):
+            log_message(f"分子 {idx} ({molecule_id}) 编号验证失败", error_log_file, 'error')
+            return None, 'validation_failed'
+
+        # Cleavage gets its own handler so its failures are counted separately
+        # from the catch-all below.
+        try:
+            mol_fragments = cleave_side_chains(mol, smiles, parent_id, numbering)
+        except Exception as e:
+            log_message(f"分子 {idx} ({molecule_id}) 裂解失败: {str(e)}", error_log_file, 'error')
+            return None, 'cleavage_failed'
+
+        # One folder per molecule, created only once cleavage has succeeded.
+        mol_output_dir = output_base_dir / parent_id
+        mol_output_dir.mkdir(parents=True, exist_ok=True)
+
+        # The aggregate object first, then one JSON file per fragment.
+        mol_fragments_path = mol_output_dir / f"{parent_id}_all_fragments.json"
+        mol_fragments.to_json_file(str(mol_fragments_path))
+
+        for frag in mol_fragments.fragments:
+            frag_path = mol_output_dir / f"{frag.fragment_id}.json"
+            frag.to_json_file(str(frag_path))
+
+        # Missing CSV cells arrive as NaN, which json.dump would emit as the
+        # non-standard token NaN; store them as null instead.
+        metadata = {
+            'parent_id': parent_id,
+            'molecule_id': _none_if_missing(molecule_id),
+            'molecule_name': _none_if_missing(molecule_name),
+            'smiles': smiles,
+            'ring_size': 16,
+            'num_fragments': len(mol_fragments.fragments),
+            'processing_date': datetime.now().isoformat()
+        }
+
+        metadata_path = mol_output_dir / 'metadata.json'
+        with open(metadata_path, 'w', encoding='utf-8') as f:
+            json.dump(metadata, f, indent=2, ensure_ascii=False)
+
+        return len(mol_fragments.fragments), 'success'
+
+    except Exception as e:
+        # Batch boundary: one bad record must not abort the whole run.
+        log_message(f"分子 {idx} 未预期错误: {str(e)}", error_log_file, 'error')
+        return None, 'unexpected_error'
+
+
+def _percent(part, total):
+    """Return part/total as a percentage, or 0.0 when total is zero."""
+    return part / total * 100 if total else 0.0
+
+
+def main():
+    """Run the batch: read the CSV, process every row, print and save statistics.
+
+    Input and output paths are relative, so they resolve against the current
+    working directory. If the input file is missing, an error is printed and
+    the function returns without creating any output. Three timestamped log
+    files and ``processing_statistics.json`` are written to
+    ``output/ring16_fragments``.
+    """
+    print("="*80)
+    print("16元环大环内酯批量处理程序")
+    print("="*80)
+    print()
+
+    input_file = Path('ring16/temp_filtered_complete.csv')
+    if not input_file.exists():
+        print(f"❌ 错误: 找不到输入文件 {input_file}")
+        return
+
+    df = pd.read_csv(input_file)
+    print(f"✓ 读取数据集: {len(df)} 个分子")
+    print(f"  文件: {input_file}")
+    print()
+
+    output_base_dir = Path('output/ring16_fragments')
+    output_base_dir.mkdir(parents=True, exist_ok=True)
+
+    # A per-run timestamp keeps log files from different runs apart.
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    log_file = output_base_dir / f'processing_log_{timestamp}.txt'
+    error_log_file = output_base_dir / f'error_log_{timestamp}.txt'
+    multiple_lactone_log = output_base_dir / f'multiple_lactone_log_{timestamp}.txt'
+
+    print(f"✓ 输出文件夹: {output_base_dir}")
+    print(f"✓ 日志文件: {log_file}")
+    print()
+
+    stats = {
+        'total': len(df),
+        'success': 0,
+        'failed_parse': 0,
+        'failed_numbering': 0,
+        'failed_validation': 0,
+        'failed_cleavage': 0,
+        'failed_unexpected': 0,
+        'multiple_lactones': 0,
+        'total_fragments': 0
+    }
+
+    # Status string returned by process_single_molecule -> counter in stats.
+    # Any status not listed here is counted as 'failed_unexpected'.
+    counter_for_status = {
+        'success': 'success',
+        'parse_failed': 'failed_parse',
+        'numbering_failed': 'failed_numbering',
+        'validation_failed': 'failed_validation',
+        'cleavage_failed': 'failed_cleavage',
+        'multiple_lactones': 'multiple_lactones',
+    }
+
+    print("开始批量处理...")
+    print()
+
+    log_message(f"开始批量处理 {len(df)} 个16元环分子", log_file)
+
+    # idx is the row position; it also names the per-molecule output folder.
+    for idx in tqdm(range(len(df)), desc="处理进度"):
+        num_fragments, status = process_single_molecule(
+            idx, df.iloc[idx], output_base_dir, log_file, error_log_file, multiple_lactone_log
+        )
+
+        stats[counter_for_status.get(status, 'failed_unexpected')] += 1
+        if status == 'success':
+            stats['total_fragments'] += num_fragments
+
+    print()
+    print("="*80)
+    print("处理完成！")
+    print("="*80)
+    print()
+
+    # _percent guards the ratios so a CSV with zero rows reports 0.00%
+    # instead of raising ZeroDivisionError before the statistics are saved.
+    print("统计结果:")
+    print(f"  总分子数: {stats['total']}")
+    print(f"  成功处理: {stats['success']} ({_percent(stats['success'], stats['total']):.2f}%)")
+    print(f"  总碎片数: {stats['total_fragments']}")
+    if stats['success'] > 0:
+        print(f"  平均碎片: {stats['total_fragments']/stats['success']:.2f} 个/分子")
+    print()
+
+    print("失败情况:")
+    print(f"  SMILES解析失败: {stats['failed_parse']}")
+    print(f"  编号失败: {stats['failed_numbering']}")
+    print(f"  验证失败: {stats['failed_validation']}")
+    print(f"  裂解失败: {stats['failed_cleavage']}")
+    print(f"  多个内酯键: {stats['multiple_lactones']}")
+    print(f"  其他错误: {stats['failed_unexpected']}")
+    # Everything that is not a success counts here, including molecules
+    # filtered out for having multiple lactones.
+    total_failed = stats['total'] - stats['success']
+    print(f"  总失败: {total_failed} ({_percent(total_failed, stats['total']):.2f}%)")
+    print()
+
+    # Unlike the logs, this file name is not timestamped, so each run
+    # overwrites the previous statistics.
+    stats_file = output_base_dir / 'processing_statistics.json'
+    with open(stats_file, 'w', encoding='utf-8') as f:
+        json.dump(stats, f, indent=2)
+
+    print(f"✓ 统计结果已保存: {stats_file}")
+    print(f"✓ 输出文件夹: {output_base_dir}")
+    print()
+
+    log_message(f"处理完成: 成功 {stats['success']}/{stats['total']}", log_file)
+
+if __name__ == '__main__':
+    main()
+