Files
search_macro/src/ring_visualization.py
hotwa c9ef531d9b Update ring_visualization.py with enhanced filtering and batch processing capabilities
- Add batch_visualize_macrolactones function for processing multiple molecules
- Add test_all_ring_sizes_with_filtering function for comprehensive testing
- Add analyze_ring_atom_composition function for composition analysis
- Improve error handling and progress reporting
- Add support for carbon-only and carbon-nitrogen filtering
- Enhanced SVG output with better file organization
2025-11-14 21:34:45 +08:00

1176 lines
41 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Ring numbering and visualization utilities for macrolactones.
This module provides functions for:
1. Assigning ring numbering starting from C=O carbonyl carbon
2. Visualizing molecules with ring numbering in Jupyter notebooks
3. Supporting both general molecules and macrolactones (12-20 membered rings)
"""
from rdkit import Chem
from rdkit.Chem import Draw, AllChem
from rdkit.Chem.Draw import rdMolDraw2D
from IPython.display import SVG, display
from typing import Dict, Tuple, Optional, List, Union
from collections import deque
from pathlib import Path
import pandas as pd
# 导入分析器类
try:
from src.macro_lactone_analyzer import MacroLactoneAnalyzer
except ImportError:
# 如果导入失败,定义一个简单的替代类
class MacroLactoneAnalyzer:
def is_valid_macrolactone(self, mol, ring_size):
# 简单检查是否有指定大小的环
ring_atoms = get_ring_atoms_by_size(mol, ring_size)
return ring_atoms is not None
# 原子序数映射,支持元素符号和原子序数字符串
ATOMIC_NUMBERS = {
"H": 1, "C": 6, "N": 7, "O": 8, "S": 16, "P": 15,
"F": 9, "Cl": 17, "Br": 35, "I": 53
}
def normalize_atom_types(atom_types: Optional[List[str]]) -> Optional[List[int]]:
"""
标准化原子类型列表,将元素符号转换为原子序数
Args:
atom_types: 原子类型列表,可以是元素符号["C", "N"]或原子序数["6", "7"]
Returns:
标准化的原子序数列表如果输入为None则返回None
"""
if atom_types is None:
return None
normalized = []
for atom_type in atom_types:
if atom_type.isdigit():
# 已经是原子序数字符串
normalized.append(int(atom_type))
elif atom_type.upper() in ATOMIC_NUMBERS:
# 元素符号,转换为原子序数
normalized.append(ATOMIC_NUMBERS[atom_type.upper()])
else:
raise ValueError(f"不支持的原子类型: {atom_type}")
return normalized
def validate_ring_atom_composition(
mol: Chem.Mol,
ring_atoms: List[int],
carbonyl_carbon_idx: int,
ester_oxygen_idx: int,
allowed_atom_types: Optional[List[int]] = None
) -> Tuple[bool, str]:
"""
验证环原子组成是否符合要求
Args:
mol: RDKit分子对象
ring_atoms: 环原子索引列表
carbonyl_carbon_idx: 羰基碳索引位置1
ester_oxygen_idx: 酯氧索引位置2
allowed_atom_types: 允许的原子类型列表(原子序数)
Returns:
(is_valid, reason): 是否有效及原因说明
"""
if allowed_atom_types is None:
return True, "不限制原子类型"
# 检查环中每个原子(除了酯键氧)
invalid_atoms = []
for atom_idx in ring_atoms:
# 跳过酯键氧原子位置2
if atom_idx == ester_oxygen_idx:
continue
atom = mol.GetAtomWithIdx(atom_idx)
atomic_num = atom.GetAtomicNum()
if atomic_num not in allowed_atom_types:
symbol = atom.GetSymbol()
invalid_atoms.append(f"{symbol}(索引:{atom_idx})")
if invalid_atoms:
return False, f"环中包含不允许的原子类型: {', '.join(invalid_atoms)}"
return True, f"环原子组成符合要求,只允许: {[Chem.GetPeriodicTable().GetElementSymbol(num) for num in allowed_atom_types]}"
def get_ring_atoms_by_size(mol: Chem.Mol, ring_size: int) -> Optional[List[int]]:
"""
Find atoms in a ring of specified size.
Args:
mol: RDKit molecule object
ring_size: Size of the ring to find (e.g., 16 for 16-membered ring)
Returns:
List of atom indices in the ring, or None if not found
"""
ring_info = mol.GetRingInfo()
rings = ring_info.AtomRings()
for ring in rings:
if len(ring) == ring_size:
return list(ring)
return None
def find_ester_smarts(mol: Chem.Mol, ring_atoms: List[int]) -> Tuple[Optional[List[int]], Optional[str]]:
"""
Find ester group atoms using SMARTS patterns.
Args:
mol: RDKit molecule object
ring_atoms: List of atom indices in the ring
Returns:
Tuple of (ester_atoms_list, pattern_string) or (None, None) if not found
"""
ring_atoms_set = set(ring_atoms)
# Try different SMARTS patterns
patterns = [
"[r16][C](=O)[O]", # Original pattern for 16-membered ring
"[C](=O)[O]", # General pattern
"C(=O)O", # Simple pattern
]
for pattern_str in patterns:
pattern = Chem.MolFromSmarts(pattern_str)
if pattern is None:
continue
matches = mol.GetSubstructMatches(pattern)
for match in matches:
# Check if first atom is in the ring
if match[0] in ring_atoms_set:
return list(match), pattern_str
return None, None
def get_carbonyl_carbon_in_ring(mol: Chem.Mol, ester_atoms: List[int], ring_atoms: List[int]) -> Optional[int]:
"""
Find the C=O carbonyl carbon atom in the ring.
Args:
mol: RDKit molecule object
ester_atoms: List of ester group atom indices
ring_atoms: List of ring atom indices
Returns:
Atom index of carbonyl carbon, or None if not found
"""
ring_atoms_set = set(ring_atoms)
for idx in ester_atoms:
atom = mol.GetAtomWithIdx(idx)
# Find C atom in the ring
if atom.GetSymbol() == 'C' and idx in ring_atoms_set:
# Check if this C has a double bond to O (carbonyl feature)
for neighbor in atom.GetNeighbors():
neighbor_idx = neighbor.GetIdx()
bond = mol.GetBondBetweenAtoms(idx, neighbor_idx)
if bond.GetBondType() == Chem.BondType.DOUBLE and neighbor.GetSymbol() == 'O':
return idx
return None
def get_ester_oxygen_in_ring(mol: Chem.Mol, ester_atoms: List[int], ring_atoms: List[int]) -> Optional[int]:
"""
Find the ester oxygen atom in the ring.
Args:
mol: RDKit molecule object
ester_atoms: List of ester group atom indices
ring_atoms: List of ring atom indices
Returns:
Atom index of ester oxygen in ring, or None if not found
"""
ring_atoms_set = set(ring_atoms)
for idx in ester_atoms:
atom = mol.GetAtomWithIdx(idx)
if atom.GetSymbol() == 'O' and idx in ring_atoms_set:
return idx
return None
def order_ring_atoms_from_start(
mol: Chem.Mol,
ring_atoms: List[int],
start_atom_idx: int,
target_atom_idx: Optional[int] = None
) -> List[int]:
"""
Order ring atoms starting from a specific atom, optionally prioritizing path to target atom.
Args:
mol: RDKit molecule object
ring_atoms: List of ring atom indices
start_atom_idx: Starting atom index
target_atom_idx: Optional target atom index to prioritize path towards
Returns:
Ordered list of atom indices
"""
ring_atoms_set = set(ring_atoms)
if start_atom_idx not in ring_atoms_set:
return ring_atoms
ordered = [start_atom_idx]
remaining = ring_atoms_set - {start_atom_idx}
current = start_atom_idx
# If target atom specified, prioritize path towards it
if target_atom_idx and target_atom_idx in ring_atoms_set:
queue = deque([(start_atom_idx, [start_atom_idx])])
visited = {start_atom_idx}
while queue:
node, path = queue.popleft()
if node == target_atom_idx:
# Found path, add atoms in path
for atom_idx in path[1:]: # Skip first (already in ordered)
if atom_idx in remaining:
ordered.append(atom_idx)
remaining.remove(atom_idx)
break
atom = mol.GetAtomWithIdx(node)
for neighbor in atom.GetNeighbors():
neighbor_idx = neighbor.GetIdx()
if neighbor_idx in ring_atoms_set and neighbor_idx not in visited:
visited.add(neighbor_idx)
queue.append((neighbor_idx, path + [neighbor_idx]))
# Continue adding remaining ring atoms
while remaining:
atom = mol.GetAtomWithIdx(current)
found_next = False
for neighbor in atom.GetNeighbors():
neighbor_idx = neighbor.GetIdx()
if neighbor_idx in remaining:
ordered.append(neighbor_idx)
remaining.remove(neighbor_idx)
current = neighbor_idx
found_next = True
break
if not found_next:
if remaining:
next_atom = remaining.pop()
ordered.append(next_atom)
current = next_atom
return ordered
def create_ring_numbering(
mol: Chem.Mol,
ring_atoms: List[int],
carbonyl_carbon_idx: int,
ester_oxygen_idx: int
) -> Tuple[Dict[int, int], List[int]]:
"""
Create ring numbering mapping starting from C=O carbonyl carbon.
Args:
mol: RDKit molecule object
ring_atoms: List of ring atom indices
carbonyl_carbon_idx: Index of C=O carbonyl carbon (will be position 1)
ester_oxygen_idx: Index of ester oxygen in ring (for ordering direction)
Returns:
Tuple of (numbering_dict, ordered_atoms_list)
numbering_dict: Maps atom index to position (1-N)
ordered_atoms_list: Ordered list of atom indices
"""
ordered_atoms = order_ring_atoms_from_start(
mol, ring_atoms, carbonyl_carbon_idx, ester_oxygen_idx
)
numbering = {}
for i, atom_idx in enumerate(ordered_atoms, start=1):
numbering[atom_idx] = i
return numbering, ordered_atoms
def get_macrolactone_numbering(
mol: Union[Chem.Mol, str],
ring_size: int = 16,
allowed_ring_atom_types: Optional[List[str]] = None
) -> Tuple[Optional[List[int]], Optional[Dict[int, int]], Optional[List[int]], Optional[int], Optional[int], Tuple[bool, str]]:
"""
Get ring numbering for a macrolactone molecule with optional atom type filtering.
This function performs the complete numbering workflow:
1. Find ring of specified size
2. Find ester group
3. Find carbonyl carbon and ester oxygen
4. Validate ring atom composition (if filtering requested)
5. Create numbering mapping
Args:
mol: RDKit molecule object or SMILES string
ring_size: Size of the macrolactone ring (12-20, default 16)
allowed_ring_atom_types: Allowed atom types for ring atoms (excluding ester oxygen)
- None: No restriction (default behavior)
- ["C"]: Only carbon atoms allowed (except ester oxygen)
- ["C", "N"]: Carbon and nitrogen atoms allowed (except ester oxygen)
- Can use element symbols or atomic number strings: ["6", "7"]
Returns:
Tuple of (ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, validation_reason))
Returns (None, None, None, None, None, (False, reason)) if numbering fails
"""
# Convert SMILES to molecule if needed
if isinstance(mol, str):
mol = Chem.MolFromSmiles(mol)
if mol is None:
return None, None, None, None, None, (False, "无法解析SMILES字符串")
# 标准化原子类型
try:
allowed_atom_numbers = normalize_atom_types(allowed_ring_atom_types)
except ValueError as e:
return None, None, None, None, None, (False, f"原子类型参数错误: {str(e)}")
# Find ring of specified size
ring_atoms = get_ring_atoms_by_size(mol, ring_size)
if ring_atoms is None:
return None, None, None, None, None, (False, f"未找到{ring_size}元环")
# Find ester group
ester_atoms, pattern = find_ester_smarts(mol, ring_atoms)
if ester_atoms is None:
return None, None, None, None, None, (False, "未在环中找到酯键")
# Find carbonyl carbon and ester oxygen
carbonyl_carbon = get_carbonyl_carbon_in_ring(mol, ester_atoms, ring_atoms)
ester_oxygen = get_ester_oxygen_in_ring(mol, ester_atoms, ring_atoms)
if carbonyl_carbon is None or ester_oxygen is None:
return None, None, None, None, None, (False, "无法识别羰基碳或酯氧原子")
# 验证环原子组成(如果指定了允许的原子类型)
if allowed_atom_numbers is not None:
is_valid, validation_reason = validate_ring_atom_composition(
mol, ring_atoms, carbonyl_carbon, ester_oxygen, allowed_atom_numbers
)
if not is_valid:
return None, None, None, None, None, (False, validation_reason)
else:
validation_reason = "不限制原子类型"
# Create numbering
ring_numbering, ordered_atoms = create_ring_numbering(
mol, ring_atoms, carbonyl_carbon, ester_oxygen
)
return ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (True, validation_reason)
def get_macrolactone_numbering_legacy(
mol: Union[Chem.Mol, str],
ring_size: int = 16
) -> Tuple[Optional[List[int]], Optional[Dict[int, int]], Optional[List[int]], Optional[int], Optional[int]]:
"""
向后兼容版本的get_macrolactone_numbering函数
Args:
mol: RDKit molecule object or SMILES string
ring_size: Size of the macrolactone ring (12-20, default 16)
Returns:
Tuple of (ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen)
Returns (None, None, None, None, None) if numbering fails
"""
ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, _) = \
get_macrolactone_numbering(mol, ring_size, allowed_ring_atom_types=None)
if not is_valid:
return None, None, None, None, None
return ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen
def is_pure_carbon_macrolactone(mol: Union[Chem.Mol, str], ring_size: int) -> Tuple[bool, str]:
"""
便捷函数:检查是否为纯碳大环内酯(除酯键氧外只含碳原子)
Args:
mol: RDKit molecule object or SMILES string
ring_size: Size of the macrolactone ring
Returns:
(is_valid, reason): 是否为纯碳大环内酯及原因
"""
_, _, _, _, _, (is_valid, reason) = get_macrolactone_numbering(
mol, ring_size, allowed_ring_atom_types=["C"]
)
return is_valid, reason
def is_carbon_nitrogen_macrolactone(mol: Union[Chem.Mol, str], ring_size: int) -> Tuple[bool, str]:
"""
便捷函数:检查是否为碳氮大环内酯(除酯键氧外只含碳和氮原子)
Args:
mol: RDKit molecule object or SMILES string
ring_size: Size of the macrolactone ring
Returns:
(is_valid, reason): 是否为碳氮大环内酯及原因
"""
_, _, _, _, _, (is_valid, reason) = get_macrolactone_numbering(
mol, ring_size, allowed_ring_atom_types=["C", "N"]
)
return is_valid, reason
def visualize_macrolactone_with_auto_coloring(
mol: Union[Chem.Mol, str],
ring_size: Optional[int] = None,
allowed_ring_atom_types: Optional[List[str]] = None,
size: Tuple[int, int] = (800, 800),
title: str = "",
return_svg: bool = False,
show_atom_labels: bool = True
) -> Union[str, None]:
"""
可视化大环内酯,自动根据原子类型进行颜色编码
Args:
mol: RDKit分子对象或SMILES字符串
ring_size: 环大小12-20如果为None则自动检测
allowed_ring_atom_types: 允许的环原子类型(除酯氧外)
- None: 不限制,显示所有原子类型
- ["C"]: 只允许碳原子(默认筛选条件)
- ["C", "N"]: 允许碳和氮原子
size: 图像大小
title: 图像标题
return_svg: 是否返回SVG字符串
show_atom_labels: 是否显示原子编号标签
Returns:
SVG字符串如果return_svg=True或显示图像
"""
# 转换SMILES到分子对象
if isinstance(mol, str):
mol_obj = Chem.MolFromSmiles(mol)
if mol_obj is None:
if return_svg:
return None
else:
print("❌ 无法解析SMILES字符串")
return None
else:
mol_obj = mol
# 自动检测环大小(如果未指定)
if ring_size is None:
ring_sizes = []
for size in range(12, 21):
ring_atoms = get_ring_atoms_by_size(mol_obj, size)
if ring_atoms:
# 检查是否为有效大环内酯
analyzer = MacroLactoneAnalyzer()
if analyzer.is_valid_macrolactone(mol_obj, size):
ring_sizes.append(size)
if not ring_sizes:
if return_svg:
return None
else:
print("❌ 未找到12-20元大环内酯")
return None
# 使用找到的第一个环大小
ring_size = ring_sizes[0]
if len(ring_sizes) > 1:
print(f"⚠️ 找到多个环大小: {ring_sizes},使用 {ring_size}")
# 获取环编号信息
ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, validation_reason) = \
get_macrolactone_numbering(mol_obj, ring_size, allowed_ring_atom_types)
if not is_valid or not ring_atoms:
if return_svg:
return None
else:
print(f"❌ 无法获取环编号: {validation_reason}")
return None
# 创建分子副本并设置原子标签
mol_copy = Chem.Mol(mol_obj)
AllChem.Compute2DCoords(mol_copy)
if show_atom_labels:
for atom_idx in ring_atoms:
if atom_idx in ring_numbering:
atom = mol_copy.GetAtomWithIdx(atom_idx)
atom.SetProp("atomNote", str(ring_numbering[atom_idx]))
# 定义原子颜色方案
def get_atom_color(symbol: str, is_ester_oxygen: bool = False) -> Tuple[float, float, float]:
"""获取原子颜色"""
if is_ester_oxygen:
return (1.0, 0.4, 0.4) # 酯氧用深红色
elif symbol == 'C':
return (0.7, 0.8, 1.0) # 碳用蓝色
elif symbol == 'N':
return (1.0, 0.8, 1.0) # 氮用粉色
elif symbol == 'O':
return (1.0, 0.7, 0.7) # 氧用红色
elif symbol == 'S':
return (1.0, 1.0, 0.6) # 硫用黄色
elif symbol == 'P':
return (0.8, 0.6, 1.0) # 磷用紫色
else:
return (0.8, 1.0, 0.8) # 其他用绿色
# 设置原子颜色
atom_colors = {}
atom_type_stats = {}
for atom_idx in ring_atoms:
atom = mol_obj.GetAtomWithIdx(atom_idx)
symbol = atom.GetSymbol()
is_ester_oxygen = (atom_idx == ester_oxygen)
# 设置颜色
color = get_atom_color(symbol, is_ester_oxygen)
atom_colors[atom_idx] = color
# 统计原子类型(不包括酯氧)
if not is_ester_oxygen:
atom_type_stats[symbol] = atom_type_stats.get(symbol, 0) + 1
# 绘制分子
# 确保size是元组格式
if isinstance(size, int):
size = (size, size)
elif isinstance(size, (list, tuple)) and len(size) == 1:
size = (size[0], size[0])
elif not isinstance(size, (list, tuple)) or len(size) != 2:
size = (800, 800) # 默认大小
drawer = rdMolDraw2D.MolDraw2DSVG(int(size[0]), int(size[1]))
drawer.SetFontSize(12) # 设置合适的字体大小最小为6
# 注意某些RDKit版本不支持DrawTitle暂时注释掉
# if title:
# drawer.DrawTitle(title)
drawer.DrawMolecule(mol_copy,
highlightAtoms=ring_atoms,
highlightAtomColors=atom_colors)
drawer.FinishDrawing()
svg = drawer.GetDrawingText()
# 显示统计信息
if not return_svg:
display(SVG(svg))
print(f"\n📊 分子信息:")
print(f" 环大小: {ring_size} 元环")
print(f" 羰基碳位置: {ring_numbering.get(carbonyl_carbon, 'N/A')} (深红色标记)")
print(f" 酯氧位置: {ring_numbering.get(ester_oxygen, 'N/A')} (深红色标记)")
print(f" 环原子组成: {atom_type_stats}")
print(f" 筛选条件: {validation_reason}")
# 显示颜色说明
print(f"\n🎨 颜色说明:")
print(f" 深红色: 酯键氧原子 (位置2)")
print(f" 蓝色: 碳原子")
print(f" 粉色: 氮原子")
print(f" 红色: 氧原子")
print(f" 黄色: 硫原子")
print(f" 紫色: 磷原子")
print(f" 绿色: 其他原子")
return svg if return_svg else None
def batch_visualize_macrolactones(
data_file: Path,
ring_sizes: List[int] = None,
allowed_ring_atom_types: Optional[List[str]] = None,
max_examples_per_size: int = 3,
output_dir: Optional[Path] = None
) -> Dict[int, List[Dict]]:
"""
批量可视化大环内酯分子
Args:
data_file: CSV数据文件路径
ring_sizes: 要测试的环大小列表默认为12-20
allowed_ring_atom_types: 允许的环原子类型
max_examples_per_size: 每种环大小最大示例数
output_dir: 输出目录如果指定则保存SVG文件
Returns:
按环大小分组的可视化结果字典
"""
if ring_sizes is None:
ring_sizes = list(range(12, 21))
print(f"🔍 开始批量可视化大环内酯")
print(f" 数据文件: {data_file}")
print(f" 环大小范围: {ring_sizes}")
print(f" 筛选条件: {allowed_ring_atom_types or '无限制'}")
# 加载数据
if not data_file.exists():
print(f"❌ 数据文件不存在: {data_file}")
return {}
df = pd.read_csv(data_file)
print(f"✓ 加载数据: {len(df)} 个分子")
results = {}
for ring_size in ring_sizes:
print(f"\n🔄 处理 {ring_size} 元环...")
size_results = []
found_count = 0
for idx, row in df.iterrows():
if found_count >= max_examples_per_size:
break
smiles = row.get('smiles', '')
if not smiles:
continue
try:
# 测试分子
mol = Chem.MolFromSmiles(smiles)
if not mol:
continue
# 检查是否为指定大小的有效大环内酯
analyzer = MacroLactoneAnalyzer()
if not analyzer.is_valid_macrolactone(mol, ring_size):
continue
# 应用筛选条件(如果指定)
if allowed_ring_atom_types:
is_valid, validation_reason = get_macrolactone_numbering(
mol, ring_size, allowed_ring_atom_types
)[5]
if not is_valid:
continue
# 获取详细信息
ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, validation_reason) = \
get_macrolactone_numbering(mol, ring_size, allowed_ring_atom_types)
if is_valid:
# 分析原子组成
composition = analyze_ring_atom_composition(smiles, ring_size)
result = {
'index': idx,
'smiles': smiles,
'molecule_id': row.get('molecule_id', f'mol_{idx}'),
'ring_size': ring_size,
'composition': composition,
'validation_reason': validation_reason,
'carbonyl_carbon_pos': ring_numbering.get(carbonyl_carbon, 'N/A'),
'ester_oxygen_pos': ring_numbering.get(ester_oxygen, 'N/A')
}
size_results.append(result)
found_count += 1
print(f" ✓ 找到示例 {found_count}: 分子{idx} ({composition})")
# 保存可视化(如果指定了输出目录)
if output_dir:
output_dir.mkdir(parents=True, exist_ok=True)
filename = f"ring{ring_size}_example{found_count}_mol{idx}.svg"
output_path = output_dir / filename
svg = visualize_macrolactone_with_auto_coloring(
mol, ring_size, allowed_ring_atom_types,
return_svg=True,
title=f"{ring_size}-元环示例 {found_count}"
)
if svg:
with open(output_path, 'w', encoding='utf-8') as f:
f.write(svg)
print(f" 💾 保存到: {output_path}")
except Exception as e:
print(f" ⚠️ 处理分子 {idx} 时出错: {str(e)}")
continue
results[ring_size] = size_results
print(f" 📊 {ring_size} 元环: 找到 {len(size_results)} 个示例")
# 显示总体统计
print(f"\n📈 总体统计:")
total_found = sum(len(results[size]) for size in ring_sizes)
print(f" 总示例数: {total_found}")
for ring_size in ring_sizes:
count = len(results[ring_size])
print(f" {ring_size} 元环: {count} 个示例")
return results
def test_all_ring_sizes_with_filtering(
filtered_data_dir: Path,
allowed_ring_atom_types: Optional[List[str]] = ["C"], # 默认只允许碳原子
output_dir: Optional[Path] = None
) -> Dict[int, Dict]:
"""
测试所有环大小12-20的筛选效果
Args:
filtered_data_dir: 包含filtered CSV文件的目录
allowed_ring_atom_types: 允许的环原子类型
output_dir: 输出目录
Returns:
详细的测试结果
"""
print(f"🧪 开始测试所有环大小的筛选效果")
print(f" 数据目录: {filtered_data_dir}")
print(f" 筛选条件: {allowed_ring_atom_types or '无限制'}")
all_results = {}
for ring_size in range(12, 21):
print(f"\n{'='*60}")
print(f"🔍 测试 {ring_size} 元环")
print(f"{'='*60}")
# 查找对应的数据文件
data_file = filtered_data_dir / f'macrolactone_ring{ring_size}_filtered.csv'
if not data_file.exists():
print(f"⚠️ 未找到 {ring_size} 元环数据文件: {data_file}")
all_results[ring_size] = {
'data_file_exists': False,
'total_molecules': 0,
'valid_macrolactones': 0,
'filtered_molecules': 0,
'examples': []
}
continue
# 批量测试
batch_results = batch_visualize_macrolactones(
data_file,
ring_sizes=[ring_size],
allowed_ring_atom_types=allowed_ring_atom_types,
max_examples_per_size=5,
output_dir=output_dir / f'ring{ring_size}_examples' if output_dir else None
)
# 统计结果
size_results = batch_results.get(ring_size, [])
# 加载完整数据进行统计
try:
df = pd.read_csv(data_file)
total_molecules = len(df)
# 统计有效大环内酯数量
valid_count = 0
filtered_count = 0
for _, row in df.iterrows():
smiles = row.get('smiles', '')
if not smiles:
continue
try:
mol = Chem.MolFromSmiles(smiles)
if mol:
analyzer = MacroLactoneAnalyzer()
if analyzer.is_valid_macrolactone(mol, ring_size):
valid_count += 1
if allowed_ring_atom_types:
is_valid, _ = get_macrolactone_numbering(
mol, ring_size, allowed_ring_atom_types
)[5]
if is_valid:
filtered_count += 1
else:
filtered_count += 1
except:
continue
all_results[ring_size] = {
'data_file_exists': True,
'total_molecules': total_molecules,
'valid_macrolactones': valid_count,
'filtered_molecules': filtered_count,
'filter_rate': filtered_count / valid_count * 100 if valid_count > 0 else 0,
'examples': size_results
}
print(f"\n📊 {ring_size} 元环统计:")
print(f" 总分子数: {total_molecules}")
print(f" 有效大环内酯: {valid_count}")
print(f" 通过筛选: {filtered_count}")
print(f" 筛选通过率: {filtered_count/valid_count*100:.1f}%" if valid_count > 0 else " 筛选通过率: 0%")
except Exception as e:
print(f"❌ 统计 {ring_size} 元环时出错: {str(e)}")
all_results[ring_size] = {
'data_file_exists': True,
'error': str(e),
'examples': size_results
}
# 显示总体统计
print(f"\n{'='*60}")
print(f"📈 所有环大小测试总结")
print(f"{'='*60}")
total_molecules = sum(result.get('total_molecules', 0) for result in all_results.values())
total_valid = sum(result.get('valid_macrolactones', 0) for result in all_results.values())
total_filtered = sum(result.get('filtered_molecules', 0) for result in all_results.values())
print(f"总分子数: {total_molecules}")
print(f"总有效大环内酯: {total_valid}")
print(f"总通过筛选: {total_filtered}")
print(f"总体筛选通过率: {total_filtered/total_valid*100:.1f}%" if total_valid > 0 else "总体筛选通过率: 0%")
return all_results
def analyze_ring_atom_composition(mol: Union[Chem.Mol, str], ring_size: int) -> Dict[str, int]:
# 首先获取环编号信息
ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, _) = \
get_macrolactone_numbering(mol, ring_size)
if not is_valid or not ring_atoms:
return {}
# 转换SMILES到分子对象如果需要
if isinstance(mol, str):
mol = Chem.MolFromSmiles(mol)
if mol is None:
return {}
# 统计原子类型
composition = {}
for atom_idx in ring_atoms:
# 跳过酯键氧原子
if atom_idx == ester_oxygen:
continue
atom = mol.GetAtomWithIdx(atom_idx)
symbol = atom.GetSymbol()
composition[symbol] = composition.get(symbol, 0) + 1
return composition
def draw_mol_with_ring_numbering(
mol: Union[Chem.Mol, str],
ring_numbering: Optional[Dict[int, int]] = None,
ring_atoms: Optional[List[int]] = None,
size: Tuple[int, int] = (800, 800),
title: str = "",
ring_size: int = 16,
return_svg: bool = False
) -> Optional[str]:
"""
Draw molecule with ring numbering displayed.
This function can work in two modes:
1. If ring_numbering is provided, use it directly
2. If ring_numbering is None, automatically compute it for macrolactones
Args:
mol: RDKit molecule object or SMILES string
ring_numbering: Optional pre-computed numbering dictionary
ring_atoms: Optional pre-computed ring atoms list
size: Image size (width, height)
title: Optional title for the image
ring_size: Ring size for auto-numbering (default 16)
return_svg: If True, return SVG string instead of displaying
Returns:
SVG string if return_svg=True, None otherwise (displays in notebook)
"""
# Convert SMILES to molecule if needed
if isinstance(mol, str):
mol = Chem.MolFromSmiles(mol)
if mol is None:
print("Error: Could not parse SMILES")
return None
# Auto-compute numbering if not provided
if ring_numbering is None:
ring_atoms, ring_numbering, _, _, _ = get_macrolactone_numbering(mol, ring_size)
if ring_numbering is None:
print("Error: Could not compute ring numbering")
return None
# Get ring atoms if not provided
if ring_atoms is None:
ring_atoms = list(ring_numbering.keys())
# Create drawer
drawer = rdMolDraw2D.MolDraw2DSVG(size[0], size[1])
drawer.SetFontSize(6) # Minimum font size is 6
draw_options = drawer.drawOptions()
draw_options.addAtomIndices = False
# Highlight ring atoms
highlight_atoms = list(ring_atoms)
atom_colors = {}
for atom_idx in ring_atoms:
atom_colors[atom_idx] = (0.8, 0.9, 1.0) # Light blue
# Create copy and set atom notes (ring numbering)
mol_copy = Chem.Mol(mol)
for atom_idx in ring_atoms:
if atom_idx in ring_numbering:
atom = mol_copy.GetAtomWithIdx(atom_idx)
atom.SetProp("atomNote", str(ring_numbering[atom_idx]))
# Draw molecule
drawer.DrawMolecule(
mol_copy,
highlightAtoms=highlight_atoms,
highlightAtomColors=atom_colors
)
drawer.FinishDrawing()
svg = drawer.GetDrawingText()
if return_svg:
return svg
else:
display(SVG(svg))
return None
def visualize_molecule_with_numbering(
mol: Union[Chem.Mol, str],
ring_size: int = 16,
size: Tuple[int, int] = (800, 800),
title: str = ""
) -> Tuple[Optional[Dict[int, int]], Optional[List[int]]]:
"""
Convenience function to visualize a molecule with automatic ring numbering.
This is the main function to use in Jupyter notebooks for quick visualization.
Args:
mol: RDKit molecule object or SMILES string
ring_size: Ring size for macrolactones (default 16)
size: Image size (width, height)
title: Optional title
Returns:
Tuple of (ring_numbering_dict, ring_atoms_list)
"""
# Get numbering
ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen = \
get_macrolactone_numbering(mol, ring_size)
if ring_numbering is None:
print("Error: Could not compute ring numbering")
return None, None
# Display
print(f"Ring size: {ring_size}")
print(f"Carbonyl C position: {ring_numbering.get(carbonyl_carbon, 'N/A')}")
print(f"Ester O position: {ring_numbering.get(ester_oxygen, 'N/A')}")
print(f"Numbering range: 1-{len(ring_numbering)}")
draw_mol_with_ring_numbering(
mol, ring_numbering, ring_atoms, size, title, ring_size
)
return ring_numbering, ring_atoms
def identify_side_chains(mol: Chem.Mol, ring_atoms: List[int]) -> List[Tuple[int, int]]:
"""
Identify side chains attached to the ring.
Args:
mol: RDKit molecule object
ring_atoms: List of atom indices in the ring
Returns:
List of tuples (ring_atom_idx, side_chain_first_atom_idx)
"""
side_chains = []
ring_atom_set = set(ring_atoms)
for ring_atom_idx in ring_atoms:
atom = mol.GetAtomWithIdx(ring_atom_idx)
for neighbor in atom.GetNeighbors():
neighbor_idx = neighbor.GetIdx()
# If neighbor is not in the ring, it's a side chain
if neighbor_idx not in ring_atom_set:
side_chains.append((ring_atom_idx, neighbor_idx))
return side_chains
def extract_side_chain_fragment(
mol: Chem.Mol,
ring_atom_idx: int,
side_chain_start_idx: int,
ring_atoms: List[int]
) -> Optional[str]:
"""
Extract a side chain fragment as a SMILES string with dummy atom (*) at attachment point.
Args:
mol: RDKit molecule object
ring_atom_idx: Index of the ring atom where side chain is attached
side_chain_start_idx: Index of the first atom in the side chain
ring_atoms: List of all ring atom indices
Returns:
SMILES string of the fragment (contains dummy atom *), or None if extraction failed
"""
ring_atom_set = set(ring_atoms)
visited = set()
queue = [side_chain_start_idx]
side_chain_atoms = []
# Use BFS to collect all atoms in the side chain
while queue:
current_idx = queue.pop(0)
if current_idx in visited:
continue
visited.add(current_idx)
side_chain_atoms.append(current_idx)
atom = mol.GetAtomWithIdx(current_idx)
for neighbor in atom.GetNeighbors():
neighbor_idx = neighbor.GetIdx()
# Only continue into non-ring atoms
if neighbor_idx not in ring_atom_set and neighbor_idx not in visited:
queue.append(neighbor_idx)
if not side_chain_atoms:
return None
# Get the bond type to the ring
bond_to_ring = mol.GetBondBetweenAtoms(ring_atom_idx, side_chain_start_idx)
if bond_to_ring is None:
return None
bond_type = bond_to_ring.GetBondType()
# Create a new molecule with only the side chain atoms
fragment_mol = Chem.RWMol()
old_to_new = {}
# Add atoms
for old_idx in side_chain_atoms:
atom = mol.GetAtomWithIdx(old_idx)
new_atom = Chem.Atom(atom.GetAtomicNum())
new_atom.SetFormalCharge(atom.GetFormalCharge())
new_atom.SetIsAromatic(atom.GetIsAromatic())
new_idx = fragment_mol.AddAtom(new_atom)
old_to_new[old_idx] = new_idx
# Add bonds (within side chain)
for old_idx in side_chain_atoms:
atom = mol.GetAtomWithIdx(old_idx)
for neighbor in atom.GetNeighbors():
neighbor_idx = neighbor.GetIdx()
if neighbor_idx in old_to_new and old_idx < neighbor_idx:
bond = mol.GetBondBetweenAtoms(old_idx, neighbor_idx)
fragment_mol.AddBond(
old_to_new[old_idx],
old_to_new[neighbor_idx],
bond.GetBondType()
)
# Add dummy atom (atomic number 0, displays as * in SMILES)
# Dummy atom connects to the atom that was originally connected to the ring
attachment_point = old_to_new[side_chain_start_idx]
dummy_atom_idx = fragment_mol.AddAtom(Chem.Atom(0))
# Add bond between dummy atom and attachment point, keeping original bond type
fragment_mol.AddBond(attachment_point, dummy_atom_idx, bond_type)
# Convert to molecule and get SMILES
try:
fragment_mol = fragment_mol.GetMol()
Chem.SanitizeMol(fragment_mol)
fragment_smiles = Chem.MolToSmiles(fragment_mol)
return fragment_smiles
except Exception as e:
return None
def cleave_side_chain_at_position(
mol: Chem.Mol,
ring_atoms: List[int],
ring_numbering: Dict[int, int],
position: int
) -> List[str]:
"""
Cleave side chains at a specified ring position.
Args:
mol: RDKit molecule object
ring_atoms: List of ring atom indices
ring_numbering: Ring numbering dictionary mapping atom index to position
position: Position number to cleave (1-N, where N is ring size)
Returns:
List of SMILES strings, one for each side chain (contains dummy atom *,
compatible with molzip for reconstruction)
"""
# Find the ring atom index for this position
ring_atom_idx = None
for atom_idx, pos in ring_numbering.items():
if pos == position:
ring_atom_idx = atom_idx
break
if ring_atom_idx is None:
return []
# Find all side chains attached to this ring atom
side_chains = identify_side_chains(mol, ring_atoms)
fragments = []
for ring_atom, side_start in side_chains:
if ring_atom == ring_atom_idx:
fragment_smiles = extract_side_chain_fragment(mol, ring_atom, side_start, ring_atoms)
if fragment_smiles:
fragments.append(fragment_smiles)
return fragments