first add

This commit is contained in:
2025-11-14 20:34:58 +08:00
commit 0d99f7d12c
46 changed files with 698209 additions and 0 deletions

View File

@@ -0,0 +1,203 @@
#!/usr/bin/env python3
"""
统计分析795个成功处理的分子并生成SDF文件
功能:
1. 统计哪些位置被替换(断裂)最多
2. 为每个分子生成3D SDF文件使用ETKDGv3
"""
import sys
from pathlib import Path
from collections import Counter, defaultdict
import json
from tqdm import tqdm
# 添加项目根目录到 Python 路径
script_dir = Path(__file__).resolve().parent
project_root = script_dir.parent
sys.path.insert(0, str(project_root))
from rdkit import Chem
from rdkit.Chem import AllChem
from src.fragment_dataclass import MoleculeFragments
def load_all_fragments(output_dir):
"""加载所有分子的碎片信息"""
fragments_dir = Path(output_dir) / "ring16_fragments"
mol_dirs = sorted([d for d in fragments_dir.glob("ring16_mol_*") if d.is_dir()])
all_fragments = []
for mol_dir in tqdm(mol_dirs, desc="加载碎片信息"):
fragments_file = mol_dir / f"{mol_dir.name}_all_fragments.json"
if fragments_file.exists():
try:
mol_fragments = MoleculeFragments.from_json_file(str(fragments_file))
all_fragments.append((mol_dir, mol_fragments))
except Exception as e:
print(f"警告: 无法加载 {fragments_file}: {e}")
return all_fragments
def analyze_cleavage_positions(all_fragments):
"""统计分析哪些位置被替换最多"""
position_counter = Counter()
position_molecules = defaultdict(list) # 记录每个位置出现在哪些分子中
for mol_dir, mol_fragments in all_fragments:
positions = mol_fragments.get_cleavage_positions()
for pos in positions:
position_counter[pos] += 1
position_molecules[pos].append(mol_dir.name)
# 打印统计结果
print("\n" + "="*80)
print("位置替换统计(按替换次数排序)")
print("="*80)
print(f"{'位置':<8} {'替换次数':<12} {'占比':<10} {'分子数':<10}")
print("-"*80)
total_replacements = sum(position_counter.values())
for pos in sorted(position_counter.keys()):
count = position_counter[pos]
percentage = count / total_replacements * 100 if total_replacements > 0 else 0
mol_count = len(set(position_molecules[pos]))
print(f"{pos:<8} {count:<12} {percentage:>6.2f}% {mol_count:<10}")
print("-"*80)
print(f"{'总计':<8} {total_replacements:<12} {'100.00%':<10} {len(all_fragments):<10}")
print("="*80)
# 保存统计结果
stats = {
'position_counts': dict(position_counter),
'position_molecules': {str(k): list(set(v)) for k, v in position_molecules.items()},
'total_replacements': total_replacements,
'total_molecules': len(all_fragments)
}
stats_file = Path(output_dir) / "ring16_fragments" / "cleavage_position_statistics.json"
with open(stats_file, 'w', encoding='utf-8') as f:
json.dump(stats, f, indent=2, ensure_ascii=False)
print(f"\n✓ 统计结果已保存: {stats_file}")
return position_counter, position_molecules
def generate_3d_sdf(mol, output_path, max_attempts=100):
"""使用ETKDGv3生成3D SDF文件"""
try:
# 添加氢原子
mol = Chem.AddHs(mol)
# 使用ETKDGv3参数
params = AllChem.ETKDGv3()
for attempt in range(max_attempts):
try:
status = AllChem.EmbedMolecule(mol, params)
if status == 0:
# UFF优化
AllChem.UFFOptimizeMolecule(mol)
writer = Chem.SDWriter(str(output_path))
writer.write(mol)
writer.close()
return mol, True
except Exception as e:
if attempt == max_attempts - 1:
raise e
continue
return None, False
except Exception as e:
print(f"生成3D SDF失败: {e}")
return None, False
def generate_sdf_files(all_fragments, output_dir):
"""为所有分子生成3D SDF文件"""
fragments_dir = Path(output_dir) / "ring16_fragments"
print("\n" + "="*80)
print("生成3D SDF文件")
print("="*80)
success_count = 0
failed_count = 0
failed_molecules = [] # 记录失败的分子
for mol_dir, mol_fragments in tqdm(all_fragments, desc="生成3D SDF"):
smiles = mol_fragments.parent_smiles
# 先检查能否从SMILES创建分子
test_mol = Chem.MolFromSmiles(smiles)
if test_mol is None:
failed_count += 1
failed_molecules.append((mol_dir.name, "无法从SMILES创建分子"))
continue
base_name = mol_dir.name
sdf_3d_path = mol_dir / f"{base_name}_3d.sdf"
# 重试机制最多尝试10次每次从原始SMILES重新创建分子
max_retries = 10
success = False
for retry in range(max_retries):
# 每次重试都从原始SMILES重新创建分子
mol = Chem.MolFromSmiles(smiles)
if mol is None:
continue
mol_3d, success = generate_3d_sdf(mol, sdf_3d_path, max_attempts=100)
if success and mol_3d is not None:
success_count += 1
break
if not success:
failed_count += 1
failed_molecules.append((mol_dir.name, f"10次重试后仍失败"))
print(f"\n✓ 成功生成3D SDF: {success_count}/{len(all_fragments)}")
if failed_count > 0:
print(f"⚠ 失败: {failed_count}")
print("\n失败的分子列表:")
for mol_name, reason in failed_molecules:
print(f" - {mol_name}: {reason}")
def main():
global output_dir
output_dir = Path('output')
print("="*80)
print("16元环大环内酯统计分析及SDF生成程序")
print("="*80)
print()
# 1. 加载所有碎片信息
print("步骤1: 加载所有分子的碎片信息...")
all_fragments = load_all_fragments(output_dir)
print(f"✓ 加载了 {len(all_fragments)} 个分子的碎片信息")
# 2. 统计分析位置替换
print("\n步骤2: 统计分析位置替换...")
position_counter, position_molecules = analyze_cleavage_positions(all_fragments)
# 3. 生成3D SDF文件
print("\n步骤3: 生成3D SDF文件...")
generate_sdf_files(all_fragments, output_dir)
print("\n" + "="*80)
print("处理完成!")
print("="*80)
if __name__ == '__main__':
main()