204 lines
6.5 KiB
Python
204 lines
6.5 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
统计分析795个成功处理的分子,并生成SDF文件
|
||
|
||
功能:
|
||
1. 统计哪些位置被替换(断裂)最多
|
||
2. 为每个分子生成3D SDF文件(使用ETKDGv3)
|
||
"""
|
||
|
||
import sys
|
||
from pathlib import Path
|
||
from collections import Counter, defaultdict
|
||
import json
|
||
from tqdm import tqdm
|
||
|
||
# 添加项目根目录到 Python 路径
|
||
script_dir = Path(__file__).resolve().parent
|
||
project_root = script_dir.parent
|
||
sys.path.insert(0, str(project_root))
|
||
|
||
from rdkit import Chem
|
||
from rdkit.Chem import AllChem
|
||
|
||
from src.fragment_dataclass import MoleculeFragments
|
||
|
||
|
||
def load_all_fragments(output_dir):
|
||
"""加载所有分子的碎片信息"""
|
||
fragments_dir = Path(output_dir) / "ring16_fragments"
|
||
mol_dirs = sorted([d for d in fragments_dir.glob("ring16_mol_*") if d.is_dir()])
|
||
|
||
all_fragments = []
|
||
for mol_dir in tqdm(mol_dirs, desc="加载碎片信息"):
|
||
fragments_file = mol_dir / f"{mol_dir.name}_all_fragments.json"
|
||
if fragments_file.exists():
|
||
try:
|
||
mol_fragments = MoleculeFragments.from_json_file(str(fragments_file))
|
||
all_fragments.append((mol_dir, mol_fragments))
|
||
except Exception as e:
|
||
print(f"警告: 无法加载 {fragments_file}: {e}")
|
||
|
||
return all_fragments
|
||
|
||
|
||
def analyze_cleavage_positions(all_fragments):
|
||
"""统计分析哪些位置被替换最多"""
|
||
position_counter = Counter()
|
||
position_molecules = defaultdict(list) # 记录每个位置出现在哪些分子中
|
||
|
||
for mol_dir, mol_fragments in all_fragments:
|
||
positions = mol_fragments.get_cleavage_positions()
|
||
for pos in positions:
|
||
position_counter[pos] += 1
|
||
position_molecules[pos].append(mol_dir.name)
|
||
|
||
# 打印统计结果
|
||
print("\n" + "="*80)
|
||
print("位置替换统计(按替换次数排序)")
|
||
print("="*80)
|
||
print(f"{'位置':<8} {'替换次数':<12} {'占比':<10} {'分子数':<10}")
|
||
print("-"*80)
|
||
|
||
total_replacements = sum(position_counter.values())
|
||
for pos in sorted(position_counter.keys()):
|
||
count = position_counter[pos]
|
||
percentage = count / total_replacements * 100 if total_replacements > 0 else 0
|
||
mol_count = len(set(position_molecules[pos]))
|
||
print(f"{pos:<8} {count:<12} {percentage:>6.2f}% {mol_count:<10}")
|
||
|
||
print("-"*80)
|
||
print(f"{'总计':<8} {total_replacements:<12} {'100.00%':<10} {len(all_fragments):<10}")
|
||
print("="*80)
|
||
|
||
# 保存统计结果
|
||
stats = {
|
||
'position_counts': dict(position_counter),
|
||
'position_molecules': {str(k): list(set(v)) for k, v in position_molecules.items()},
|
||
'total_replacements': total_replacements,
|
||
'total_molecules': len(all_fragments)
|
||
}
|
||
|
||
stats_file = Path(output_dir) / "ring16_fragments" / "cleavage_position_statistics.json"
|
||
with open(stats_file, 'w', encoding='utf-8') as f:
|
||
json.dump(stats, f, indent=2, ensure_ascii=False)
|
||
|
||
print(f"\n✓ 统计结果已保存: {stats_file}")
|
||
|
||
return position_counter, position_molecules
|
||
|
||
|
||
def generate_3d_sdf(mol, output_path, max_attempts=100):
|
||
"""使用ETKDGv3生成3D SDF文件"""
|
||
try:
|
||
# 添加氢原子
|
||
mol = Chem.AddHs(mol)
|
||
|
||
# 使用ETKDGv3参数
|
||
params = AllChem.ETKDGv3()
|
||
|
||
for attempt in range(max_attempts):
|
||
try:
|
||
status = AllChem.EmbedMolecule(mol, params)
|
||
if status == 0:
|
||
# UFF优化
|
||
AllChem.UFFOptimizeMolecule(mol)
|
||
|
||
writer = Chem.SDWriter(str(output_path))
|
||
writer.write(mol)
|
||
writer.close()
|
||
return mol, True
|
||
except Exception as e:
|
||
if attempt == max_attempts - 1:
|
||
raise e
|
||
continue
|
||
|
||
return None, False
|
||
except Exception as e:
|
||
print(f"生成3D SDF失败: {e}")
|
||
return None, False
|
||
|
||
|
||
def generate_sdf_files(all_fragments, output_dir):
|
||
"""为所有分子生成3D SDF文件"""
|
||
fragments_dir = Path(output_dir) / "ring16_fragments"
|
||
|
||
print("\n" + "="*80)
|
||
print("生成3D SDF文件")
|
||
print("="*80)
|
||
|
||
success_count = 0
|
||
failed_count = 0
|
||
failed_molecules = [] # 记录失败的分子
|
||
|
||
for mol_dir, mol_fragments in tqdm(all_fragments, desc="生成3D SDF"):
|
||
smiles = mol_fragments.parent_smiles
|
||
|
||
# 先检查能否从SMILES创建分子
|
||
test_mol = Chem.MolFromSmiles(smiles)
|
||
if test_mol is None:
|
||
failed_count += 1
|
||
failed_molecules.append((mol_dir.name, "无法从SMILES创建分子"))
|
||
continue
|
||
|
||
base_name = mol_dir.name
|
||
sdf_3d_path = mol_dir / f"{base_name}_3d.sdf"
|
||
|
||
# 重试机制:最多尝试10次,每次从原始SMILES重新创建分子
|
||
max_retries = 10
|
||
success = False
|
||
|
||
for retry in range(max_retries):
|
||
# 每次重试都从原始SMILES重新创建分子
|
||
mol = Chem.MolFromSmiles(smiles)
|
||
if mol is None:
|
||
continue
|
||
|
||
mol_3d, success = generate_3d_sdf(mol, sdf_3d_path, max_attempts=100)
|
||
if success and mol_3d is not None:
|
||
success_count += 1
|
||
break
|
||
|
||
if not success:
|
||
failed_count += 1
|
||
failed_molecules.append((mol_dir.name, f"10次重试后仍失败"))
|
||
|
||
print(f"\n✓ 成功生成3D SDF: {success_count}/{len(all_fragments)}")
|
||
if failed_count > 0:
|
||
print(f"⚠ 失败: {failed_count}")
|
||
print("\n失败的分子列表:")
|
||
for mol_name, reason in failed_molecules:
|
||
print(f" - {mol_name}: {reason}")
|
||
|
||
|
||
def main():
|
||
global output_dir
|
||
output_dir = Path('output')
|
||
|
||
print("="*80)
|
||
print("16元环大环内酯统计分析及SDF生成程序")
|
||
print("="*80)
|
||
print()
|
||
|
||
# 1. 加载所有碎片信息
|
||
print("步骤1: 加载所有分子的碎片信息...")
|
||
all_fragments = load_all_fragments(output_dir)
|
||
print(f"✓ 加载了 {len(all_fragments)} 个分子的碎片信息")
|
||
|
||
# 2. 统计分析位置替换
|
||
print("\n步骤2: 统计分析位置替换...")
|
||
position_counter, position_molecules = analyze_cleavage_positions(all_fragments)
|
||
|
||
# 3. 生成3D SDF文件
|
||
print("\n步骤3: 生成3D SDF文件...")
|
||
generate_sdf_files(all_fragments, output_dir)
|
||
|
||
print("\n" + "="*80)
|
||
print("处理完成!")
|
||
print("="*80)
|
||
|
||
|
||
if __name__ == '__main__':
|
||
main()
|
||
|