#!/usr/bin/env python3
"""
Analyze the successfully processed molecules and generate SDF files.

Features:
1. Count which ring positions are replaced (cleaved) most often.
2. Generate a 3D SDF file for every molecule (using ETKDGv3).
"""

import sys
from pathlib import Path
from collections import Counter, defaultdict
import json
from tqdm import tqdm

# Add the project root to the Python path so that `src` can be imported
# when this script is run directly.
script_dir = Path(__file__).resolve().parent
project_root = script_dir.parent
sys.path.insert(0, str(project_root))

from rdkit import Chem
from rdkit.Chem import AllChem
from src.fragment_dataclass import MoleculeFragments

# Output directory used when a caller does not pass one explicitly.
DEFAULT_OUTPUT_DIR = Path('output')


def load_all_fragments(output_dir):
    """Load the fragment information of every molecule.

    Scans ``<output_dir>/ring16_fragments`` for ``ring16_mol_*`` directories
    and loads ``<dir name>_all_fragments.json`` from each one that has it.

    Args:
        output_dir: Root output directory (str or Path).

    Returns:
        A list of ``(mol_dir, mol_fragments)`` tuples, ordered by directory
        name. Files that fail to load are reported with a warning and skipped.
    """
    fragments_dir = Path(output_dir) / "ring16_fragments"
    mol_dirs = sorted(d for d in fragments_dir.glob("ring16_mol_*") if d.is_dir())

    all_fragments = []
    for mol_dir in tqdm(mol_dirs, desc="加载碎片信息"):
        fragments_file = mol_dir / f"{mol_dir.name}_all_fragments.json"
        if not fragments_file.exists():
            continue
        try:
            mol_fragments = MoleculeFragments.from_json_file(str(fragments_file))
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            print(f"警告: 无法加载 {fragments_file}: {e}")
        else:
            all_fragments.append((mol_dir, mol_fragments))

    return all_fragments


def analyze_cleavage_positions(all_fragments, output_dir=None):
    """Count how often each position is replaced and save the statistics.

    Prints a table sorted by replacement count (descending, ties broken by
    position) and writes the result to
    ``<output_dir>/ring16_fragments/cleavage_position_statistics.json``.

    Args:
        all_fragments: List of ``(mol_dir, mol_fragments)`` tuples as returned
            by ``load_all_fragments``.
        output_dir: Root output directory. Defaults to ``DEFAULT_OUTPUT_DIR``
            when omitted, so the function no longer depends on a global set
            by ``main``.

    Returns:
        ``(position_counter, position_molecules)``: a Counter of replacements
        per position and a dict mapping each position to the list of molecule
        directory names in which it occurs.
    """
    if output_dir is None:
        output_dir = DEFAULT_OUTPUT_DIR

    position_counter = Counter()
    position_molecules = defaultdict(list)  # molecules in which each position occurs

    for mol_dir, mol_fragments in all_fragments:
        for pos in mol_fragments.get_cleavage_positions():
            position_counter[pos] += 1
            position_molecules[pos].append(mol_dir.name)

    # Print the statistics table.
    print("\n" + "="*80)
    print("位置替换统计(按替换次数排序)")
    print("="*80)
    print(f"{'位置':<8} {'替换次数':<12} {'占比':<10} {'分子数':<10}")
    print("-"*80)

    total_replacements = sum(position_counter.values())
    # Sort by count (descending) as the header announces; position breaks ties
    # so the output is deterministic.
    ranked = sorted(position_counter.items(), key=lambda item: (-item[1], item[0]))
    for pos, count in ranked:
        percentage = count / total_replacements * 100 if total_replacements > 0 else 0
        mol_count = len(set(position_molecules[pos]))
        print(f"{pos:<8} {count:<12} {percentage:>6.2f}% {mol_count:<10}")

    print("-"*80)
    # Do not claim 100% when nothing was counted.
    total_percentage = '100.00%' if total_replacements > 0 else '0.00%'
    print(f"{'总计':<8} {total_replacements:<12} {total_percentage:<10} {len(all_fragments):<10}")
    print("="*80)

    # Save the statistics. Keys are stringified consistently in both mappings
    # and molecule names are sorted so the JSON file is reproducible.
    stats = {
        'position_counts': {str(k): v for k, v in position_counter.items()},
        'position_molecules': {str(k): sorted(set(v)) for k, v in position_molecules.items()},
        'total_replacements': total_replacements,
        'total_molecules': len(all_fragments)
    }

    stats_file = Path(output_dir) / "ring16_fragments" / "cleavage_position_statistics.json"
    # The directory may be missing when no fragments were found.
    stats_file.parent.mkdir(parents=True, exist_ok=True)
    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)

    print(f"\n✓ 统计结果已保存: {stats_file}")

    return position_counter, position_molecules


def generate_3d_sdf(mol, output_path, max_attempts=100):
    """Generate a 3D SDF file with ETKDGv3 embedding and UFF optimization.

    Args:
        mol: RDKit molecule; it is not modified (hydrogens are added to a copy).
        output_path: Destination path of the SDF file.
        max_attempts: Maximum number of embedding attempts.

    Returns:
        ``(mol_3d, True)`` with the hydrogen-added, embedded molecule on
        success, or ``(None, False)`` if every attempt failed. Errors are
        printed, never raised.
    """
    try:
        # Add hydrogens; AddHs returns a new molecule.
        mol = Chem.AddHs(mol)

        # Use ETKDGv3 parameters.
        params = AllChem.ETKDGv3()

        for attempt in range(max_attempts):
            try:
                if AllChem.EmbedMolecule(mol, params) != 0:
                    # Embedding failed; try again.
                    continue

                # UFF optimization.
                AllChem.UFFOptimizeMolecule(mol)

                writer = Chem.SDWriter(str(output_path))
                try:
                    writer.write(mol)
                finally:
                    # Always release the file handle, even if write() fails.
                    writer.close()

                return mol, True
            except Exception:
                # Only the last attempt's error is surfaced to the outer handler.
                if attempt == max_attempts - 1:
                    raise

        return None, False
    except Exception as e:
        print(f"生成3D SDF失败: {e}")
        return None, False


def generate_sdf_files(all_fragments, output_dir):
    """Generate a 3D SDF file for every molecule.

    For each molecule the file ``<mol_dir>/<mol_dir name>_3d.sdf`` is written.
    A summary and the list of failed molecules are printed at the end.

    Args:
        all_fragments: List of ``(mol_dir, mol_fragments)`` tuples as returned
            by ``load_all_fragments``.
        output_dir: Root output directory (kept for interface compatibility;
            the SDF files are written into each molecule's own directory).
    """
    print("\n" + "="*80)
    print("生成3D SDF文件")
    print("="*80)

    success_count = 0
    failed_count = 0
    failed_molecules = []  # (molecule name, reason) of every failure

    # Retry mechanism: up to this many rounds of generate_3d_sdf per molecule.
    max_retries = 10

    for mol_dir, mol_fragments in tqdm(all_fragments, desc="生成3D SDF"):
        smiles = mol_fragments.parent_smiles

        # Check first that the SMILES can be parsed at all.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            failed_count += 1
            failed_molecules.append((mol_dir.name, "无法从SMILES创建分子"))
            continue

        base_name = mol_dir.name
        sdf_3d_path = mol_dir / f"{base_name}_3d.sdf"

        success = False
        for _ in range(max_retries):
            # generate_3d_sdf works on a hydrogen-added copy, so the parsed
            # molecule can be reused across retries without re-parsing.
            mol_3d, success = generate_3d_sdf(mol, sdf_3d_path, max_attempts=100)
            if success and mol_3d is not None:
                success_count += 1
                break

        if not success:
            failed_count += 1
            failed_molecules.append((mol_dir.name, f"{max_retries}次重试后仍失败"))

    print(f"\n✓ 成功生成3D SDF: {success_count}/{len(all_fragments)}")
    if failed_count > 0:
        print(f"⚠ 失败: {failed_count}")
        print("\n失败的分子列表:")
        for mol_name, reason in failed_molecules:
            print(f" - {mol_name}: {reason}")


def main():
    """Run the full pipeline: load fragments, analyze positions, write SDFs."""
    output_dir = DEFAULT_OUTPUT_DIR

    print("="*80)
    print("16元环大环内酯统计分析及SDF生成程序")
    print("="*80)
    print()

    # 1. Load the fragment information of all molecules.
    print("步骤1: 加载所有分子的碎片信息...")
    all_fragments = load_all_fragments(output_dir)
    print(f"✓ 加载了 {len(all_fragments)} 个分子的碎片信息")

    # 2. Analyze the replaced positions.
    print("\n步骤2: 统计分析位置替换...")
    analyze_cleavage_positions(all_fragments, output_dir)

    # 3. Generate the 3D SDF files.
    print("\n步骤3: 生成3D SDF文件...")
    generate_sdf_files(all_fragments, output_dir)

    print("\n" + "="*80)
    print("处理完成!")
    print("="*80)


if __name__ == '__main__':
    main()