Update ring_visualization.py with enhanced filtering and batch processing capabilities
- Add batch_visualize_macrolactones function for processing multiple molecules
- Add test_all_ring_sizes_with_filtering function for comprehensive testing
- Add analyze_ring_atom_composition function for composition analysis
- Improve error handling and progress reporting
- Add support for carbon-only and carbon-nitrogen filtering
- Enhance SVG output with better file organization
This commit is contained in:
@@ -7,11 +7,99 @@ This module provides functions for:
|
||||
3. Supporting both general molecules and macrolactones (12-20 membered rings)
|
||||
"""
|
||||
from rdkit import Chem
|
||||
from rdkit.Chem import Draw
|
||||
from rdkit.Chem import Draw, AllChem
|
||||
from rdkit.Chem.Draw import rdMolDraw2D
|
||||
from IPython.display import SVG, display
|
||||
from typing import Dict, Tuple, Optional, List, Union
|
||||
from collections import deque
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
|
||||
# Import the analyzer class; when the project package cannot be imported,
# fall back to a minimal stand-in so this module stays usable on its own.
try:
    from src.macro_lactone_analyzer import MacroLactoneAnalyzer
except ImportError:

    class MacroLactoneAnalyzer:
        """Simplified replacement used only when the real analyzer is unavailable."""

        def is_valid_macrolactone(self, mol, ring_size):
            """Return True when ``mol`` contains a ring of ``ring_size`` atoms.

            This fallback only checks that a ring of the requested size
            exists (via ``get_ring_atoms_by_size``); it does not look for
            an ester bond.
            """
            ring_atoms = get_ring_atoms_by_size(mol, ring_size)
            return ring_atoms is not None
|
||||
|
||||
# Element symbol -> atomic number lookup. Keys use the canonical spelling
# (first letter upper case, second letter lower case).
ATOMIC_NUMBERS = {
    "H": 1, "C": 6, "N": 7, "O": 8, "S": 16, "P": 15,
    "F": 9, "Cl": 17, "Br": 35, "I": 53,
}


def normalize_atom_types(atom_types: Optional[List[str]]) -> Optional[List[int]]:
    """Convert a list of atom type strings into atomic numbers.

    Args:
        atom_types: Element symbols such as ``["C", "N"]`` or atomic numbers
            given as digit strings such as ``["6", "7"]``. Symbols are
            matched case-insensitively.

    Returns:
        The atomic numbers in input order, or ``None`` when ``atom_types``
        is ``None``.

    Raises:
        ValueError: If an entry is neither a digit string nor a symbol
            listed in ``ATOMIC_NUMBERS``.
    """
    if atom_types is None:
        return None

    normalized = []
    for atom_type in atom_types:
        if atom_type.isdigit():
            # Already an atomic number given as a string.
            normalized.append(int(atom_type))
            continue

        # Normalise to canonical symbol casing. Upper-casing the whole string
        # would turn "Cl"/"Br" into "CL"/"BR", which never match the table.
        symbol = atom_type.capitalize()
        if symbol not in ATOMIC_NUMBERS:
            raise ValueError(f"不支持的原子类型: {atom_type}")
        normalized.append(ATOMIC_NUMBERS[symbol])

    return normalized
|
||||
|
||||
def validate_ring_atom_composition(
    mol: Chem.Mol,
    ring_atoms: List[int],
    carbonyl_carbon_idx: int,
    ester_oxygen_idx: int,
    allowed_atom_types: Optional[List[int]] = None
) -> Tuple[bool, str]:
    """Check that every ring atom (except the ester oxygen) is an allowed element.

    Args:
        mol: RDKit molecule object.
        ring_atoms: Atom indices that make up the ring.
        carbonyl_carbon_idx: Index of the carbonyl carbon (position 1).
            Accepted for interface symmetry; not read by this function.
        ester_oxygen_idx: Index of the ester oxygen (position 2); this atom
            is exempt from the check.
        allowed_atom_types: Allowed atomic numbers, or ``None`` for no
            restriction.

    Returns:
        ``(is_valid, reason)`` where ``reason`` is a human-readable message.
    """
    if allowed_atom_types is None:
        return True, "不限制原子类型"

    # Build the membership set once rather than scanning the list per atom.
    allowed = set(allowed_atom_types)

    invalid_atoms = []
    for atom_idx in ring_atoms:
        # The ester oxygen (position 2) is always permitted.
        if atom_idx == ester_oxygen_idx:
            continue

        atom = mol.GetAtomWithIdx(atom_idx)
        if atom.GetAtomicNum() not in allowed:
            invalid_atoms.append(f"{atom.GetSymbol()}(索引:{atom_idx})")

    if invalid_atoms:
        return False, f"环中包含不允许的原子类型: {', '.join(invalid_atoms)}"

    periodic_table = Chem.GetPeriodicTable()
    allowed_symbols = [periodic_table.GetElementSymbol(num) for num in allowed_atom_types]
    return True, f"环原子组成符合要求,只允许: {allowed_symbols}"
|
||||
|
||||
|
||||
def get_ring_atoms_by_size(mol: Chem.Mol, ring_size: int) -> Optional[List[int]]:
|
||||
@@ -223,16 +311,85 @@ def create_ring_numbering(
|
||||
|
||||
def get_macrolactone_numbering(
    mol: Union[Chem.Mol, str],
    ring_size: int = 16,
    allowed_ring_atom_types: Optional[List[str]] = None
) -> Tuple[Optional[List[int]], Optional[Dict[int, int]], Optional[List[int]], Optional[int], Optional[int], Tuple[bool, str]]:
    """
    Get ring numbering for a macrolactone molecule with optional atom type filtering.

    This function performs the complete numbering workflow:
    1. Find ring of specified size
    2. Find ester group
    3. Find carbonyl carbon and ester oxygen
    4. Validate ring atom composition (if filtering requested)
    5. Create numbering mapping

    Args:
        mol: RDKit molecule object or SMILES string
        ring_size: Size of the macrolactone ring (12-20, default 16)
        allowed_ring_atom_types: Allowed atom types for ring atoms (excluding ester oxygen)
            - None: No restriction (default behavior)
            - ["C"]: Only carbon atoms allowed (except ester oxygen)
            - ["C", "N"]: Carbon and nitrogen atoms allowed (except ester oxygen)
            - Can use element symbols or atomic number strings: ["6", "7"]

    Returns:
        Tuple of (ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, validation_reason))
        Returns (None, None, None, None, None, (False, reason)) if numbering fails
    """

    def _failure(reason: str):
        # Uniform failure shape shared by every early exit below.
        return None, None, None, None, None, (False, reason)

    # Convert SMILES to molecule if needed
    if isinstance(mol, str):
        mol = Chem.MolFromSmiles(mol)
        if mol is None:
            return _failure("无法解析SMILES字符串")

    # Normalise the requested atom types to atomic numbers
    try:
        allowed_atom_numbers = normalize_atom_types(allowed_ring_atom_types)
    except ValueError as e:
        return _failure(f"原子类型参数错误: {str(e)}")

    # Find ring of specified size
    ring_atoms = get_ring_atoms_by_size(mol, ring_size)
    if ring_atoms is None:
        return _failure(f"未找到{ring_size}元环")

    # Find ester group (the matched pattern itself is not needed here)
    ester_atoms, _pattern = find_ester_smarts(mol, ring_atoms)
    if ester_atoms is None:
        return _failure("未在环中找到酯键")

    # Find carbonyl carbon and ester oxygen
    carbonyl_carbon = get_carbonyl_carbon_in_ring(mol, ester_atoms, ring_atoms)
    ester_oxygen = get_ester_oxygen_in_ring(mol, ester_atoms, ring_atoms)
    if carbonyl_carbon is None or ester_oxygen is None:
        return _failure("无法识别羰基碳或酯氧原子")

    # Validate ring atom composition only when a filter was requested
    if allowed_atom_numbers is not None:
        is_valid, validation_reason = validate_ring_atom_composition(
            mol, ring_atoms, carbonyl_carbon, ester_oxygen, allowed_atom_numbers
        )
        if not is_valid:
            return _failure(validation_reason)
    else:
        validation_reason = "不限制原子类型"

    # Create numbering
    ring_numbering, ordered_atoms = create_ring_numbering(
        mol, ring_atoms, carbonyl_carbon, ester_oxygen
    )

    return ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (True, validation_reason)
|
||||
|
||||
|
||||
def get_macrolactone_numbering_legacy(
    mol: Union[Chem.Mol, str],
    ring_size: int = 16
) -> Tuple[Optional[List[int]], Optional[Dict[int, int]], Optional[List[int]], Optional[int], Optional[int]]:
    """
    Backward-compatible variant of get_macrolactone_numbering.

    Calls get_macrolactone_numbering without atom type filtering and drops
    the trailing (is_valid, reason) element from its result.

    Args:
        mol: RDKit molecule object or SMILES string
        ring_size: Size of the macrolactone ring (default 16)

    Returns:
        Tuple of (ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen)
        Returns (None, None, None, None, None) if numbering fails
    """
    ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, _) = \
        get_macrolactone_numbering(mol, ring_size, allowed_ring_atom_types=None)

    if not is_valid:
        return None, None, None, None, None

    return ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen
|
||||
|
||||
|
||||
def is_pure_carbon_macrolactone(mol: Union[Chem.Mol, str], ring_size: int) -> Tuple[bool, str]:
    """
    Convenience check for a carbon-only macrolactone ring.

    Runs get_macrolactone_numbering with the ring restricted to carbon
    (the ester oxygen is exempt) and returns only the validation result.

    Args:
        mol: RDKit molecule object or SMILES string
        ring_size: Size of the macrolactone ring

    Returns:
        (is_valid, reason): whether the ring qualifies, and the reason text
    """
    numbering_result = get_macrolactone_numbering(
        mol, ring_size, allowed_ring_atom_types=["C"]
    )
    is_valid, reason = numbering_result[5]
    return is_valid, reason
|
||||
|
||||
|
||||
def is_carbon_nitrogen_macrolactone(mol: Union[Chem.Mol, str], ring_size: int) -> Tuple[bool, str]:
    """
    Convenience check for a macrolactone ring built from carbon and nitrogen.

    Runs get_macrolactone_numbering with the ring restricted to carbon and
    nitrogen (the ester oxygen is exempt) and returns only the validation
    result.

    Args:
        mol: RDKit molecule object or SMILES string
        ring_size: Size of the macrolactone ring

    Returns:
        (is_valid, reason): whether the ring qualifies, and the reason text
    """
    numbering_result = get_macrolactone_numbering(
        mol, ring_size, allowed_ring_atom_types=["C", "N"]
    )
    is_valid, reason = numbering_result[5]
    return is_valid, reason
|
||||
|
||||
|
||||
def visualize_macrolactone_with_auto_coloring(
    mol: Union[Chem.Mol, str],
    ring_size: Optional[int] = None,
    allowed_ring_atom_types: Optional[List[str]] = None,
    size: Tuple[int, int] = (800, 800),
    title: str = "",
    return_svg: bool = False,
    show_atom_labels: bool = True
) -> Union[str, None]:
    """
    Draw a macrolactone with its ring atoms highlighted by element.

    Args:
        mol: RDKit molecule object or SMILES string.
        ring_size: Ring size (12-20). When None, sizes 12..20 are probed and
            the first valid one is used.
        allowed_ring_atom_types: Allowed ring atom types (ester oxygen excluded).
            - None: no restriction
            - ["C"]: carbon only
            - ["C", "N"]: carbon and nitrogen
        size: Image size as (width, height); a single int or a one-element
            sequence is treated as a square, anything else falls back to
            (800, 800).
        title: Image title. Currently not rendered, because some RDKit
            versions do not support DrawTitle.
        return_svg: When True, return the SVG text and print nothing.
        show_atom_labels: When True, annotate ring atoms with their ring
            position number.

    Returns:
        The SVG string when ``return_svg`` is True; otherwise None (the image
        and a summary are displayed/printed). None is also returned on any
        failure.
    """
    # Convert SMILES to a molecule object
    if isinstance(mol, str):
        mol_obj = Chem.MolFromSmiles(mol)
        if mol_obj is None:
            if not return_svg:
                print("❌ 无法解析SMILES字符串")
            return None
    else:
        mol_obj = mol

    # Auto-detect the ring size when it was not given
    if ring_size is None:
        analyzer = MacroLactoneAnalyzer()
        # The loop variable must not be called ``size``: that would overwrite
        # the image-size parameter and shrink the drawing to 20x20 pixels.
        ring_sizes = [
            candidate_size
            for candidate_size in range(12, 21)
            if get_ring_atoms_by_size(mol_obj, candidate_size)
            and analyzer.is_valid_macrolactone(mol_obj, candidate_size)
        ]

        if not ring_sizes:
            if not return_svg:
                print("❌ 未找到12-20元大环内酯")
            return None

        # Use the first (smallest) ring size found
        ring_size = ring_sizes[0]
        if len(ring_sizes) > 1:
            print(f"⚠️ 找到多个环大小: {ring_sizes},使用 {ring_size}")

    # Ring numbering information
    ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, validation_reason) = \
        get_macrolactone_numbering(mol_obj, ring_size, allowed_ring_atom_types)

    if not is_valid or not ring_atoms:
        if not return_svg:
            print(f"❌ 无法获取环编号: {validation_reason}")
        return None

    # Work on a copy so the caller's molecule is not annotated
    mol_copy = Chem.Mol(mol_obj)
    AllChem.Compute2DCoords(mol_copy)

    if show_atom_labels:
        for atom_idx in ring_atoms:
            if atom_idx in ring_numbering:
                mol_copy.GetAtomWithIdx(atom_idx).SetProp(
                    "atomNote", str(ring_numbering[atom_idx])
                )

    # Highlight colour scheme (RGB in 0..1)
    ester_oxygen_color = (1.0, 0.4, 0.4)   # deep red
    element_colors = {
        'C': (0.7, 0.8, 1.0),              # blue
        'N': (1.0, 0.8, 1.0),              # pink
        'O': (1.0, 0.7, 0.7),              # red
        'S': (1.0, 1.0, 0.6),              # yellow
        'P': (0.8, 0.6, 1.0),              # purple
    }
    other_color = (0.8, 1.0, 0.8)          # green

    atom_colors = {}
    atom_type_stats = {}
    for atom_idx in ring_atoms:
        symbol = mol_obj.GetAtomWithIdx(atom_idx).GetSymbol()
        if atom_idx == ester_oxygen:
            atom_colors[atom_idx] = ester_oxygen_color
            continue
        atom_colors[atom_idx] = element_colors.get(symbol, other_color)
        # Composition statistics exclude the ester oxygen
        atom_type_stats[symbol] = atom_type_stats.get(symbol, 0) + 1

    # Make sure ``size`` is a (width, height) pair
    if isinstance(size, int):
        size = (size, size)
    elif isinstance(size, (list, tuple)) and len(size) == 1:
        size = (size[0], size[0])
    elif not isinstance(size, (list, tuple)) or len(size) != 2:
        size = (800, 800)  # default size

    drawer = rdMolDraw2D.MolDraw2DSVG(int(size[0]), int(size[1]))
    drawer.SetFontSize(12)

    drawer.DrawMolecule(mol_copy,
                        highlightAtoms=ring_atoms,
                        highlightAtomColors=atom_colors)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText()

    if return_svg:
        return svg

    display(SVG(svg))

    # NOTE(review): the carbonyl-carbon line below says "deep red marker",
    # but only the ester oxygen gets the deep red highlight in this function.
    print(f"\n📊 分子信息:")
    print(f" 环大小: {ring_size} 元环")
    print(f" 羰基碳位置: {ring_numbering.get(carbonyl_carbon, 'N/A')} (深红色标记)")
    print(f" 酯氧位置: {ring_numbering.get(ester_oxygen, 'N/A')} (深红色标记)")
    print(f" 环原子组成: {atom_type_stats}")
    print(f" 筛选条件: {validation_reason}")

    # Colour legend
    print(f"\n🎨 颜色说明:")
    print(f" 深红色: 酯键氧原子 (位置2)")
    print(f" 蓝色: 碳原子")
    print(f" 粉色: 氮原子")
    print(f" 红色: 氧原子")
    print(f" 黄色: 硫原子")
    print(f" 紫色: 磷原子")
    print(f" 绿色: 其他原子")

    return None
|
||||
|
||||
|
||||
def batch_visualize_macrolactones(
    data_file: Path,
    ring_sizes: Optional[List[int]] = None,
    allowed_ring_atom_types: Optional[List[str]] = None,
    max_examples_per_size: int = 3,
    output_dir: Optional[Path] = None
) -> Dict[int, List[Dict]]:
    """
    Collect (and optionally render) example macrolactones per ring size.

    Args:
        data_file: Path to a CSV file; SMILES are read from its ``smiles``
            column and an optional ``molecule_id`` column is used as the id.
        ring_sizes: Ring sizes to scan; defaults to 12..20.
        allowed_ring_atom_types: Allowed ring atom types, or None for no filter.
        max_examples_per_size: Maximum number of examples kept per ring size.
        output_dir: When given, an SVG is written there for every example.

    Returns:
        Dict mapping ring size to a list of example records. An empty dict is
        returned when ``data_file`` does not exist.
    """
    if ring_sizes is None:
        ring_sizes = list(range(12, 21))

    print(f"🔍 开始批量可视化大环内酯")
    print(f" 数据文件: {data_file}")
    print(f" 环大小范围: {ring_sizes}")
    print(f" 筛选条件: {allowed_ring_atom_types or '无限制'}")

    # Load data
    if not data_file.exists():
        print(f"❌ 数据文件不存在: {data_file}")
        return {}

    df = pd.read_csv(data_file)
    print(f"✓ 加载数据: {len(df)} 个分子")

    # One analyzer is enough for the whole run.
    analyzer = MacroLactoneAnalyzer()
    results = {}

    for ring_size in ring_sizes:
        print(f"\n🔄 处理 {ring_size} 元环...")

        size_results = []
        found_count = 0

        for idx, row in df.iterrows():
            if found_count >= max_examples_per_size:
                break

            smiles = row.get('smiles', '')
            # Skip empty cells and non-string values (pandas yields NaN
            # floats for missing entries, which RDKit cannot parse).
            if not isinstance(smiles, str) or not smiles:
                continue

            try:
                mol = Chem.MolFromSmiles(smiles)
                if not mol:
                    continue

                # Must be a valid macrolactone of the requested size
                if not analyzer.is_valid_macrolactone(mol, ring_size):
                    continue

                # A single numbering call both applies the optional atom
                # filter and yields the details needed for the record.
                ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, validation_reason) = \
                    get_macrolactone_numbering(mol, ring_size, allowed_ring_atom_types)
                if not is_valid:
                    continue

                composition = analyze_ring_atom_composition(smiles, ring_size)

                size_results.append({
                    'index': idx,
                    'smiles': smiles,
                    'molecule_id': row.get('molecule_id', f'mol_{idx}'),
                    'ring_size': ring_size,
                    'composition': composition,
                    'validation_reason': validation_reason,
                    'carbonyl_carbon_pos': ring_numbering.get(carbonyl_carbon, 'N/A'),
                    'ester_oxygen_pos': ring_numbering.get(ester_oxygen, 'N/A')
                })
                found_count += 1

                print(f" ✓ 找到示例 {found_count}: 分子{idx} ({composition})")

                # Save the drawing when an output directory was given
                if output_dir:
                    output_dir.mkdir(parents=True, exist_ok=True)
                    filename = f"ring{ring_size}_example{found_count}_mol{idx}.svg"
                    output_path = output_dir / filename

                    svg = visualize_macrolactone_with_auto_coloring(
                        mol, ring_size, allowed_ring_atom_types,
                        return_svg=True,
                        title=f"{ring_size}-元环示例 {found_count}"
                    )

                    if svg:
                        with open(output_path, 'w', encoding='utf-8') as f:
                            f.write(svg)
                        print(f" 💾 保存到: {output_path}")

            except Exception as e:
                # Best effort: one bad molecule must not abort the batch.
                print(f" ⚠️ 处理分子 {idx} 时出错: {str(e)}")
                continue

        results[ring_size] = size_results
        print(f" 📊 {ring_size} 元环: 找到 {len(size_results)} 个示例")

    # Overall summary
    print(f"\n📈 总体统计:")
    total_found = sum(len(results[size]) for size in ring_sizes)
    print(f" 总示例数: {total_found}")

    for ring_size in ring_sizes:
        count = len(results[ring_size])
        print(f" {ring_size} 元环: {count} 个示例")

    return results
|
||||
|
||||
|
||||
def test_all_ring_sizes_with_filtering(
    filtered_data_dir: Path,
    allowed_ring_atom_types: Optional[List[str]] = ["C"],  # default: carbon only (never mutated here)
    output_dir: Optional[Path] = None
) -> Dict[int, Dict]:
    """
    Evaluate the atom-type filter for every ring size from 12 to 20.

    For each ring size the file
    ``macrolactone_ring{ring_size}_filtered.csv`` is looked up in
    ``filtered_data_dir``; up to five examples are collected through
    batch_visualize_macrolactones, then the whole file is scanned to count
    valid macrolactones and those passing the filter.

    Args:
        filtered_data_dir: Directory containing the filtered CSV files.
        allowed_ring_atom_types: Allowed ring atom types; a falsy value
            disables filtering.
        output_dir: Base output directory; examples for each ring size go to
            a ``ring{ring_size}_examples`` subdirectory.

    Returns:
        Dict mapping ring size to a statistics record. When counting fails,
        the record holds ``error`` in place of the count fields.
    """
    print(f"🧪 开始测试所有环大小的筛选效果")
    print(f" 数据目录: {filtered_data_dir}")
    print(f" 筛选条件: {allowed_ring_atom_types or '无限制'}")

    all_results = {}

    for ring_size in range(12, 21):
        print(f"\n{'='*60}")
        print(f"🔍 测试 {ring_size} 元环")
        print(f"{'='*60}")

        # Locate the data file for this ring size
        data_file = filtered_data_dir / f'macrolactone_ring{ring_size}_filtered.csv'

        if not data_file.exists():
            print(f"⚠️ 未找到 {ring_size} 元环数据文件: {data_file}")
            all_results[ring_size] = {
                'data_file_exists': False,
                'total_molecules': 0,
                'valid_macrolactones': 0,
                'filtered_molecules': 0,
                'examples': []
            }
            continue

        # Collect examples
        batch_results = batch_visualize_macrolactones(
            data_file,
            ring_sizes=[ring_size],
            allowed_ring_atom_types=allowed_ring_atom_types,
            max_examples_per_size=5,
            output_dir=output_dir / f'ring{ring_size}_examples' if output_dir else None
        )
        size_results = batch_results.get(ring_size, [])

        # Scan the complete file for the statistics
        try:
            df = pd.read_csv(data_file)
            total_molecules = len(df)

            analyzer = MacroLactoneAnalyzer()
            valid_count = 0
            filtered_count = 0

            for _, row in df.iterrows():
                smiles = row.get('smiles', '')
                if not smiles:
                    continue

                try:
                    mol = Chem.MolFromSmiles(smiles)
                    if not mol:
                        continue
                    if not analyzer.is_valid_macrolactone(mol, ring_size):
                        continue
                    valid_count += 1

                    if allowed_ring_atom_types:
                        is_valid, _ = get_macrolactone_numbering(
                            mol, ring_size, allowed_ring_atom_types
                        )[5]
                        if is_valid:
                            filtered_count += 1
                    else:
                        # No filter: every valid macrolactone passes
                        filtered_count += 1
                except Exception:
                    # Best effort per molecule. ``Exception`` rather than a
                    # bare ``except`` so Ctrl-C and SystemExit still propagate.
                    continue

            all_results[ring_size] = {
                'data_file_exists': True,
                'total_molecules': total_molecules,
                'valid_macrolactones': valid_count,
                'filtered_molecules': filtered_count,
                'filter_rate': filtered_count / valid_count * 100 if valid_count > 0 else 0,
                'examples': size_results
            }

            print(f"\n📊 {ring_size} 元环统计:")
            print(f" 总分子数: {total_molecules}")
            print(f" 有效大环内酯: {valid_count}")
            print(f" 通过筛选: {filtered_count}")
            print(f" 筛选通过率: {filtered_count/valid_count*100:.1f}%" if valid_count > 0 else " 筛选通过率: 0%")

        except Exception as e:
            print(f"❌ 统计 {ring_size} 元环时出错: {str(e)}")
            all_results[ring_size] = {
                'data_file_exists': True,
                'error': str(e),
                'examples': size_results
            }

    # Overall summary
    print(f"\n{'='*60}")
    print(f"📈 所有环大小测试总结")
    print(f"{'='*60}")

    total_molecules = sum(result.get('total_molecules', 0) for result in all_results.values())
    total_valid = sum(result.get('valid_macrolactones', 0) for result in all_results.values())
    total_filtered = sum(result.get('filtered_molecules', 0) for result in all_results.values())

    print(f"总分子数: {total_molecules}")
    print(f"总有效大环内酯: {total_valid}")
    print(f"总通过筛选: {total_filtered}")
    print(f"总体筛选通过率: {total_filtered/total_valid*100:.1f}%" if total_valid > 0 else "总体筛选通过率: 0%")

    return all_results
|
||||
|
||||
|
||||
def analyze_ring_atom_composition(mol: Union[Chem.Mol, str], ring_size: int) -> Dict[str, int]:
    """
    Count the ring atoms of a macrolactone by element symbol.

    The ester oxygen is excluded from the counts.

    Args:
        mol: RDKit molecule object or SMILES string
        ring_size: Size of the macrolactone ring

    Returns:
        Dict mapping element symbol to the number of ring atoms of that
        element; an empty dict when ring numbering fails or the SMILES
        cannot be parsed.
    """
    # Ring numbering supplies the ring atoms and the ester oxygen index
    ring_atoms, _ring_numbering, _ordered_atoms, _carbonyl_carbon, ester_oxygen, (is_valid, _) = \
        get_macrolactone_numbering(mol, ring_size)

    if not is_valid or not ring_atoms:
        return {}

    # Atom lookups below need a molecule object, so parse SMILES input
    if isinstance(mol, str):
        mol = Chem.MolFromSmiles(mol)
        if mol is None:
            return {}

    composition = {}
    for atom_idx in ring_atoms:
        # Skip the ester oxygen
        if atom_idx == ester_oxygen:
            continue

        symbol = mol.GetAtomWithIdx(atom_idx).GetSymbol()
        composition[symbol] = composition.get(symbol, 0) + 1

    return composition
|
||||
|
||||
|
||||
def draw_mol_with_ring_numbering(
|
||||
|
||||
Reference in New Issue
Block a user