first add

2025-11-14 20:34:58 +08:00
commit 0d99f7d12c
46 changed files with 698209 additions and 0 deletions
--- a/scripts/generate_sdf_and_statistics.py
+++ b/scripts/generate_sdf_and_statistics.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python3
+"""
+统计分析795个成功处理的分子，并生成SDF文件
+
+功能：
+1. 统计哪些位置被替换（断裂）最多
+2. 为每个分子生成3D SDF文件（使用ETKDGv3）
+"""
+
+import sys
+from pathlib import Path
+from collections import Counter, defaultdict
+import json
+from tqdm import tqdm
+
+# 添加项目根目录到 Python 路径
+script_dir = Path(__file__).resolve().parent
+project_root = script_dir.parent
+sys.path.insert(0, str(project_root))
+
+from rdkit import Chem
+from rdkit.Chem import AllChem
+
+from src.fragment_dataclass import MoleculeFragments
+
+
+def load_all_fragments(output_dir):
+    """加载所有分子的碎片信息"""
+    fragments_dir = Path(output_dir) / "ring16_fragments"
+    mol_dirs = sorted([d for d in fragments_dir.glob("ring16_mol_*") if d.is_dir()])
+    
+    all_fragments = []
+    for mol_dir in tqdm(mol_dirs, desc="加载碎片信息"):
+        fragments_file = mol_dir / f"{mol_dir.name}_all_fragments.json"
+        if fragments_file.exists():
+            try:
+                mol_fragments = MoleculeFragments.from_json_file(str(fragments_file))
+                all_fragments.append((mol_dir, mol_fragments))
+            except Exception as e:
+                print(f"警告: 无法加载 {fragments_file}: {e}")
+    
+    return all_fragments
+
+
+def analyze_cleavage_positions(all_fragments):
+    """统计分析哪些位置被替换最多"""
+    position_counter = Counter()
+    position_molecules = defaultdict(list)  # 记录每个位置出现在哪些分子中
+    
+    for mol_dir, mol_fragments in all_fragments:
+        positions = mol_fragments.get_cleavage_positions()
+        for pos in positions:
+            position_counter[pos] += 1
+            position_molecules[pos].append(mol_dir.name)
+    
+    # 打印统计结果
+    print("\n" + "="*80)
+    print("位置替换统计（按替换次数排序）")
+    print("="*80)
+    print(f"{'位置':<8} {'替换次数':<12} {'占比':<10} {'分子数':<10}")
+    print("-"*80)
+    
+    total_replacements = sum(position_counter.values())
+    for pos in sorted(position_counter.keys()):
+        count = position_counter[pos]
+        percentage = count / total_replacements * 100 if total_replacements > 0 else 0
+        mol_count = len(set(position_molecules[pos]))
+        print(f"{pos:<8} {count:<12} {percentage:>6.2f}%   {mol_count:<10}")
+    
+    print("-"*80)
+    print(f"{'总计':<8} {total_replacements:<12} {'100.00%':<10} {len(all_fragments):<10}")
+    print("="*80)
+    
+    # 保存统计结果
+    stats = {
+        'position_counts': dict(position_counter),
+        'position_molecules': {str(k): list(set(v)) for k, v in position_molecules.items()},
+        'total_replacements': total_replacements,
+        'total_molecules': len(all_fragments)
+    }
+    
+    stats_file = Path(output_dir) / "ring16_fragments" / "cleavage_position_statistics.json"
+    with open(stats_file, 'w', encoding='utf-8') as f:
+        json.dump(stats, f, indent=2, ensure_ascii=False)
+    
+    print(f"\n✓ 统计结果已保存: {stats_file}")
+    
+    return position_counter, position_molecules
+
+
+def generate_3d_sdf(mol, output_path, max_attempts=100):
+    """使用ETKDGv3生成3D SDF文件"""
+    try:
+        # 添加氢原子
+        mol = Chem.AddHs(mol)
+        
+        # 使用ETKDGv3参数
+        params = AllChem.ETKDGv3()
+        
+        for attempt in range(max_attempts):
+            try:
+                status = AllChem.EmbedMolecule(mol, params)
+                if status == 0:
+                    # UFF优化
+                    AllChem.UFFOptimizeMolecule(mol)
+                    
+                    writer = Chem.SDWriter(str(output_path))
+                    writer.write(mol)
+                    writer.close()
+                    return mol, True
+            except Exception as e:
+                if attempt == max_attempts - 1:
+                    raise e
+                continue
+        
+        return None, False
+    except Exception as e:
+        print(f"生成3D SDF失败: {e}")
+        return None, False
+
+
+def generate_sdf_files(all_fragments, output_dir):
+    """为所有分子生成3D SDF文件"""
+    fragments_dir = Path(output_dir) / "ring16_fragments"
+    
+    print("\n" + "="*80)
+    print("生成3D SDF文件")
+    print("="*80)
+    
+    success_count = 0
+    failed_count = 0
+    failed_molecules = []  # 记录失败的分子
+    
+    for mol_dir, mol_fragments in tqdm(all_fragments, desc="生成3D SDF"):
+        smiles = mol_fragments.parent_smiles
+        
+        # 先检查能否从SMILES创建分子
+        test_mol = Chem.MolFromSmiles(smiles)
+        if test_mol is None:
+            failed_count += 1
+            failed_molecules.append((mol_dir.name, "无法从SMILES创建分子"))
+            continue
+        
+        base_name = mol_dir.name
+        sdf_3d_path = mol_dir / f"{base_name}_3d.sdf"
+        
+        # 重试机制：最多尝试10次，每次从原始SMILES重新创建分子
+        max_retries = 10
+        success = False
+        
+        for retry in range(max_retries):
+            # 每次重试都从原始SMILES重新创建分子
+            mol = Chem.MolFromSmiles(smiles)
+            if mol is None:
+            continue
+        
+            mol_3d, success = generate_3d_sdf(mol, sdf_3d_path, max_attempts=100)
+            if success and mol_3d is not None:
+                success_count += 1
+                break
+        
+        if not success:
+            failed_count += 1
+            failed_molecules.append((mol_dir.name, f"10次重试后仍失败"))
+    
+    print(f"\n✓ 成功生成3D SDF: {success_count}/{len(all_fragments)}")
+    if failed_count > 0:
+        print(f"⚠ 失败: {failed_count}")
+        print("\n失败的分子列表:")
+        for mol_name, reason in failed_molecules:
+            print(f"  - {mol_name}: {reason}")
+
+
+def main():
+    global output_dir
+    output_dir = Path('output')
+    
+    print("="*80)
+    print("16元环大环内酯统计分析及SDF生成程序")
+    print("="*80)
+    print()
+    
+    # 1. 加载所有碎片信息
+    print("步骤1: 加载所有分子的碎片信息...")
+    all_fragments = load_all_fragments(output_dir)
+    print(f"✓ 加载了 {len(all_fragments)} 个分子的碎片信息")
+    
+    # 2. 统计分析位置替换
+    print("\n步骤2: 统计分析位置替换...")
+    position_counter, position_molecules = analyze_cleavage_positions(all_fragments)
+    
+    # 3. 生成3D SDF文件
+    print("\n步骤3: 生成3D SDF文件...")
+    generate_sdf_files(all_fragments, output_dir)
+    
+    print("\n" + "="*80)
+    print("处理完成！")
+    print("="*80)
+
+
+if __name__ == '__main__':
+    main()
+