- Add batch_visualize_macrolactones function for processing multiple molecules - Add test_all_ring_sizes_with_filtering function for comprehensive testing - Add analyze_ring_atom_composition function for composition analysis - Improve error handling and progress reporting - Add support for carbon-only and carbon-nitrogen filtering - Enhanced SVG output with better file organization
1176 lines
41 KiB
Python
1176 lines
41 KiB
Python
"""
|
||
Ring numbering and visualization utilities for macrolactones.
|
||
|
||
This module provides functions for:
|
||
1. Assigning ring numbering starting from C=O carbonyl carbon
|
||
2. Visualizing molecules with ring numbering in Jupyter notebooks
|
||
3. Supporting both general molecules and macrolactones (12-20 membered rings)
|
||
"""
|
||
from rdkit import Chem
|
||
from rdkit.Chem import Draw, AllChem
|
||
from rdkit.Chem.Draw import rdMolDraw2D
|
||
from IPython.display import SVG, display
|
||
from typing import Dict, Tuple, Optional, List, Union
|
||
from collections import deque
|
||
from pathlib import Path
|
||
import pandas as pd
|
||
|
||
# 导入分析器类
|
||
try:
|
||
from src.macro_lactone_analyzer import MacroLactoneAnalyzer
|
||
except ImportError:
|
||
# 如果导入失败,定义一个简单的替代类
|
||
class MacroLactoneAnalyzer:
|
||
def is_valid_macrolactone(self, mol, ring_size):
|
||
# 简单检查是否有指定大小的环
|
||
ring_atoms = get_ring_atoms_by_size(mol, ring_size)
|
||
return ring_atoms is not None
|
||
|
||
# 原子序数映射,支持元素符号和原子序数字符串
|
||
ATOMIC_NUMBERS = {
|
||
"H": 1, "C": 6, "N": 7, "O": 8, "S": 16, "P": 15,
|
||
"F": 9, "Cl": 17, "Br": 35, "I": 53
|
||
}
|
||
|
||
def normalize_atom_types(atom_types: Optional[List[str]]) -> Optional[List[int]]:
|
||
"""
|
||
标准化原子类型列表,将元素符号转换为原子序数
|
||
|
||
Args:
|
||
atom_types: 原子类型列表,可以是元素符号["C", "N"]或原子序数["6", "7"]
|
||
|
||
Returns:
|
||
标准化的原子序数列表,如果输入为None则返回None
|
||
"""
|
||
if atom_types is None:
|
||
return None
|
||
|
||
normalized = []
|
||
for atom_type in atom_types:
|
||
if atom_type.isdigit():
|
||
# 已经是原子序数字符串
|
||
normalized.append(int(atom_type))
|
||
elif atom_type.upper() in ATOMIC_NUMBERS:
|
||
# 元素符号,转换为原子序数
|
||
normalized.append(ATOMIC_NUMBERS[atom_type.upper()])
|
||
else:
|
||
raise ValueError(f"不支持的原子类型: {atom_type}")
|
||
|
||
return normalized
|
||
|
||
def validate_ring_atom_composition(
|
||
mol: Chem.Mol,
|
||
ring_atoms: List[int],
|
||
carbonyl_carbon_idx: int,
|
||
ester_oxygen_idx: int,
|
||
allowed_atom_types: Optional[List[int]] = None
|
||
) -> Tuple[bool, str]:
|
||
"""
|
||
验证环原子组成是否符合要求
|
||
|
||
Args:
|
||
mol: RDKit分子对象
|
||
ring_atoms: 环原子索引列表
|
||
carbonyl_carbon_idx: 羰基碳索引(位置1)
|
||
ester_oxygen_idx: 酯氧索引(位置2)
|
||
allowed_atom_types: 允许的原子类型列表(原子序数)
|
||
|
||
Returns:
|
||
(is_valid, reason): 是否有效及原因说明
|
||
"""
|
||
if allowed_atom_types is None:
|
||
return True, "不限制原子类型"
|
||
|
||
# 检查环中每个原子(除了酯键氧)
|
||
invalid_atoms = []
|
||
|
||
for atom_idx in ring_atoms:
|
||
# 跳过酯键氧原子(位置2)
|
||
if atom_idx == ester_oxygen_idx:
|
||
continue
|
||
|
||
atom = mol.GetAtomWithIdx(atom_idx)
|
||
atomic_num = atom.GetAtomicNum()
|
||
|
||
if atomic_num not in allowed_atom_types:
|
||
symbol = atom.GetSymbol()
|
||
invalid_atoms.append(f"{symbol}(索引:{atom_idx})")
|
||
|
||
if invalid_atoms:
|
||
return False, f"环中包含不允许的原子类型: {', '.join(invalid_atoms)}"
|
||
|
||
return True, f"环原子组成符合要求,只允许: {[Chem.GetPeriodicTable().GetElementSymbol(num) for num in allowed_atom_types]}"
|
||
|
||
|
||
def get_ring_atoms_by_size(mol: Chem.Mol, ring_size: int) -> Optional[List[int]]:
|
||
"""
|
||
Find atoms in a ring of specified size.
|
||
|
||
Args:
|
||
mol: RDKit molecule object
|
||
ring_size: Size of the ring to find (e.g., 16 for 16-membered ring)
|
||
|
||
Returns:
|
||
List of atom indices in the ring, or None if not found
|
||
"""
|
||
ring_info = mol.GetRingInfo()
|
||
rings = ring_info.AtomRings()
|
||
|
||
for ring in rings:
|
||
if len(ring) == ring_size:
|
||
return list(ring)
|
||
|
||
return None
|
||
|
||
|
||
def find_ester_smarts(mol: Chem.Mol, ring_atoms: List[int]) -> Tuple[Optional[List[int]], Optional[str]]:
|
||
"""
|
||
Find ester group atoms using SMARTS patterns.
|
||
|
||
Args:
|
||
mol: RDKit molecule object
|
||
ring_atoms: List of atom indices in the ring
|
||
|
||
Returns:
|
||
Tuple of (ester_atoms_list, pattern_string) or (None, None) if not found
|
||
"""
|
||
ring_atoms_set = set(ring_atoms)
|
||
|
||
# Try different SMARTS patterns
|
||
patterns = [
|
||
"[r16][C](=O)[O]", # Original pattern for 16-membered ring
|
||
"[C](=O)[O]", # General pattern
|
||
"C(=O)O", # Simple pattern
|
||
]
|
||
|
||
for pattern_str in patterns:
|
||
pattern = Chem.MolFromSmarts(pattern_str)
|
||
if pattern is None:
|
||
continue
|
||
|
||
matches = mol.GetSubstructMatches(pattern)
|
||
for match in matches:
|
||
# Check if first atom is in the ring
|
||
if match[0] in ring_atoms_set:
|
||
return list(match), pattern_str
|
||
|
||
return None, None
|
||
|
||
|
||
def get_carbonyl_carbon_in_ring(mol: Chem.Mol, ester_atoms: List[int], ring_atoms: List[int]) -> Optional[int]:
|
||
"""
|
||
Find the C=O carbonyl carbon atom in the ring.
|
||
|
||
Args:
|
||
mol: RDKit molecule object
|
||
ester_atoms: List of ester group atom indices
|
||
ring_atoms: List of ring atom indices
|
||
|
||
Returns:
|
||
Atom index of carbonyl carbon, or None if not found
|
||
"""
|
||
ring_atoms_set = set(ring_atoms)
|
||
|
||
for idx in ester_atoms:
|
||
atom = mol.GetAtomWithIdx(idx)
|
||
# Find C atom in the ring
|
||
if atom.GetSymbol() == 'C' and idx in ring_atoms_set:
|
||
# Check if this C has a double bond to O (carbonyl feature)
|
||
for neighbor in atom.GetNeighbors():
|
||
neighbor_idx = neighbor.GetIdx()
|
||
bond = mol.GetBondBetweenAtoms(idx, neighbor_idx)
|
||
if bond.GetBondType() == Chem.BondType.DOUBLE and neighbor.GetSymbol() == 'O':
|
||
return idx
|
||
|
||
return None
|
||
|
||
|
||
def get_ester_oxygen_in_ring(mol: Chem.Mol, ester_atoms: List[int], ring_atoms: List[int]) -> Optional[int]:
|
||
"""
|
||
Find the ester oxygen atom in the ring.
|
||
|
||
Args:
|
||
mol: RDKit molecule object
|
||
ester_atoms: List of ester group atom indices
|
||
ring_atoms: List of ring atom indices
|
||
|
||
Returns:
|
||
Atom index of ester oxygen in ring, or None if not found
|
||
"""
|
||
ring_atoms_set = set(ring_atoms)
|
||
|
||
for idx in ester_atoms:
|
||
atom = mol.GetAtomWithIdx(idx)
|
||
if atom.GetSymbol() == 'O' and idx in ring_atoms_set:
|
||
return idx
|
||
|
||
return None
|
||
|
||
|
||
def order_ring_atoms_from_start(
|
||
mol: Chem.Mol,
|
||
ring_atoms: List[int],
|
||
start_atom_idx: int,
|
||
target_atom_idx: Optional[int] = None
|
||
) -> List[int]:
|
||
"""
|
||
Order ring atoms starting from a specific atom, optionally prioritizing path to target atom.
|
||
|
||
Args:
|
||
mol: RDKit molecule object
|
||
ring_atoms: List of ring atom indices
|
||
start_atom_idx: Starting atom index
|
||
target_atom_idx: Optional target atom index to prioritize path towards
|
||
|
||
Returns:
|
||
Ordered list of atom indices
|
||
"""
|
||
ring_atoms_set = set(ring_atoms)
|
||
|
||
if start_atom_idx not in ring_atoms_set:
|
||
return ring_atoms
|
||
|
||
ordered = [start_atom_idx]
|
||
remaining = ring_atoms_set - {start_atom_idx}
|
||
current = start_atom_idx
|
||
|
||
# If target atom specified, prioritize path towards it
|
||
if target_atom_idx and target_atom_idx in ring_atoms_set:
|
||
queue = deque([(start_atom_idx, [start_atom_idx])])
|
||
visited = {start_atom_idx}
|
||
|
||
while queue:
|
||
node, path = queue.popleft()
|
||
if node == target_atom_idx:
|
||
# Found path, add atoms in path
|
||
for atom_idx in path[1:]: # Skip first (already in ordered)
|
||
if atom_idx in remaining:
|
||
ordered.append(atom_idx)
|
||
remaining.remove(atom_idx)
|
||
break
|
||
|
||
atom = mol.GetAtomWithIdx(node)
|
||
for neighbor in atom.GetNeighbors():
|
||
neighbor_idx = neighbor.GetIdx()
|
||
if neighbor_idx in ring_atoms_set and neighbor_idx not in visited:
|
||
visited.add(neighbor_idx)
|
||
queue.append((neighbor_idx, path + [neighbor_idx]))
|
||
|
||
# Continue adding remaining ring atoms
|
||
while remaining:
|
||
atom = mol.GetAtomWithIdx(current)
|
||
found_next = False
|
||
|
||
for neighbor in atom.GetNeighbors():
|
||
neighbor_idx = neighbor.GetIdx()
|
||
if neighbor_idx in remaining:
|
||
ordered.append(neighbor_idx)
|
||
remaining.remove(neighbor_idx)
|
||
current = neighbor_idx
|
||
found_next = True
|
||
break
|
||
|
||
if not found_next:
|
||
if remaining:
|
||
next_atom = remaining.pop()
|
||
ordered.append(next_atom)
|
||
current = next_atom
|
||
|
||
return ordered
|
||
|
||
|
||
def create_ring_numbering(
|
||
mol: Chem.Mol,
|
||
ring_atoms: List[int],
|
||
carbonyl_carbon_idx: int,
|
||
ester_oxygen_idx: int
|
||
) -> Tuple[Dict[int, int], List[int]]:
|
||
"""
|
||
Create ring numbering mapping starting from C=O carbonyl carbon.
|
||
|
||
Args:
|
||
mol: RDKit molecule object
|
||
ring_atoms: List of ring atom indices
|
||
carbonyl_carbon_idx: Index of C=O carbonyl carbon (will be position 1)
|
||
ester_oxygen_idx: Index of ester oxygen in ring (for ordering direction)
|
||
|
||
Returns:
|
||
Tuple of (numbering_dict, ordered_atoms_list)
|
||
numbering_dict: Maps atom index to position (1-N)
|
||
ordered_atoms_list: Ordered list of atom indices
|
||
"""
|
||
ordered_atoms = order_ring_atoms_from_start(
|
||
mol, ring_atoms, carbonyl_carbon_idx, ester_oxygen_idx
|
||
)
|
||
numbering = {}
|
||
for i, atom_idx in enumerate(ordered_atoms, start=1):
|
||
numbering[atom_idx] = i
|
||
|
||
return numbering, ordered_atoms
|
||
|
||
|
||
def get_macrolactone_numbering(
|
||
mol: Union[Chem.Mol, str],
|
||
ring_size: int = 16,
|
||
allowed_ring_atom_types: Optional[List[str]] = None
|
||
) -> Tuple[Optional[List[int]], Optional[Dict[int, int]], Optional[List[int]], Optional[int], Optional[int], Tuple[bool, str]]:
|
||
"""
|
||
Get ring numbering for a macrolactone molecule with optional atom type filtering.
|
||
|
||
This function performs the complete numbering workflow:
|
||
1. Find ring of specified size
|
||
2. Find ester group
|
||
3. Find carbonyl carbon and ester oxygen
|
||
4. Validate ring atom composition (if filtering requested)
|
||
5. Create numbering mapping
|
||
|
||
Args:
|
||
mol: RDKit molecule object or SMILES string
|
||
ring_size: Size of the macrolactone ring (12-20, default 16)
|
||
allowed_ring_atom_types: Allowed atom types for ring atoms (excluding ester oxygen)
|
||
- None: No restriction (default behavior)
|
||
- ["C"]: Only carbon atoms allowed (except ester oxygen)
|
||
- ["C", "N"]: Carbon and nitrogen atoms allowed (except ester oxygen)
|
||
- Can use element symbols or atomic number strings: ["6", "7"]
|
||
|
||
Returns:
|
||
Tuple of (ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, validation_reason))
|
||
Returns (None, None, None, None, None, (False, reason)) if numbering fails
|
||
"""
|
||
# Convert SMILES to molecule if needed
|
||
if isinstance(mol, str):
|
||
mol = Chem.MolFromSmiles(mol)
|
||
if mol is None:
|
||
return None, None, None, None, None, (False, "无法解析SMILES字符串")
|
||
|
||
# 标准化原子类型
|
||
try:
|
||
allowed_atom_numbers = normalize_atom_types(allowed_ring_atom_types)
|
||
except ValueError as e:
|
||
return None, None, None, None, None, (False, f"原子类型参数错误: {str(e)}")
|
||
|
||
# Find ring of specified size
|
||
ring_atoms = get_ring_atoms_by_size(mol, ring_size)
|
||
if ring_atoms is None:
|
||
return None, None, None, None, None, (False, f"未找到{ring_size}元环")
|
||
|
||
# Find ester group
|
||
ester_atoms, pattern = find_ester_smarts(mol, ring_atoms)
|
||
if ester_atoms is None:
|
||
return None, None, None, None, None, (False, "未在环中找到酯键")
|
||
|
||
# Find carbonyl carbon and ester oxygen
|
||
carbonyl_carbon = get_carbonyl_carbon_in_ring(mol, ester_atoms, ring_atoms)
|
||
ester_oxygen = get_ester_oxygen_in_ring(mol, ester_atoms, ring_atoms)
|
||
|
||
if carbonyl_carbon is None or ester_oxygen is None:
|
||
return None, None, None, None, None, (False, "无法识别羰基碳或酯氧原子")
|
||
|
||
# 验证环原子组成(如果指定了允许的原子类型)
|
||
if allowed_atom_numbers is not None:
|
||
is_valid, validation_reason = validate_ring_atom_composition(
|
||
mol, ring_atoms, carbonyl_carbon, ester_oxygen, allowed_atom_numbers
|
||
)
|
||
if not is_valid:
|
||
return None, None, None, None, None, (False, validation_reason)
|
||
else:
|
||
validation_reason = "不限制原子类型"
|
||
|
||
# Create numbering
|
||
ring_numbering, ordered_atoms = create_ring_numbering(
|
||
mol, ring_atoms, carbonyl_carbon, ester_oxygen
|
||
)
|
||
|
||
return ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (True, validation_reason)
|
||
|
||
|
||
def get_macrolactone_numbering_legacy(
|
||
mol: Union[Chem.Mol, str],
|
||
ring_size: int = 16
|
||
) -> Tuple[Optional[List[int]], Optional[Dict[int, int]], Optional[List[int]], Optional[int], Optional[int]]:
|
||
"""
|
||
向后兼容版本的get_macrolactone_numbering函数
|
||
|
||
Args:
|
||
mol: RDKit molecule object or SMILES string
|
||
ring_size: Size of the macrolactone ring (12-20, default 16)
|
||
|
||
Returns:
|
||
Tuple of (ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen)
|
||
Returns (None, None, None, None, None) if numbering fails
|
||
"""
|
||
ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, _) = \
|
||
get_macrolactone_numbering(mol, ring_size, allowed_ring_atom_types=None)
|
||
|
||
if not is_valid:
|
||
return None, None, None, None, None
|
||
|
||
return ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen
|
||
|
||
|
||
def is_pure_carbon_macrolactone(mol: Union[Chem.Mol, str], ring_size: int) -> Tuple[bool, str]:
|
||
"""
|
||
便捷函数:检查是否为纯碳大环内酯(除酯键氧外只含碳原子)
|
||
|
||
Args:
|
||
mol: RDKit molecule object or SMILES string
|
||
ring_size: Size of the macrolactone ring
|
||
|
||
Returns:
|
||
(is_valid, reason): 是否为纯碳大环内酯及原因
|
||
"""
|
||
_, _, _, _, _, (is_valid, reason) = get_macrolactone_numbering(
|
||
mol, ring_size, allowed_ring_atom_types=["C"]
|
||
)
|
||
return is_valid, reason
|
||
|
||
|
||
def is_carbon_nitrogen_macrolactone(mol: Union[Chem.Mol, str], ring_size: int) -> Tuple[bool, str]:
|
||
"""
|
||
便捷函数:检查是否为碳氮大环内酯(除酯键氧外只含碳和氮原子)
|
||
|
||
Args:
|
||
mol: RDKit molecule object or SMILES string
|
||
ring_size: Size of the macrolactone ring
|
||
|
||
Returns:
|
||
(is_valid, reason): 是否为碳氮大环内酯及原因
|
||
"""
|
||
_, _, _, _, _, (is_valid, reason) = get_macrolactone_numbering(
|
||
mol, ring_size, allowed_ring_atom_types=["C", "N"]
|
||
)
|
||
return is_valid, reason
|
||
|
||
|
||
def visualize_macrolactone_with_auto_coloring(
|
||
mol: Union[Chem.Mol, str],
|
||
ring_size: Optional[int] = None,
|
||
allowed_ring_atom_types: Optional[List[str]] = None,
|
||
size: Tuple[int, int] = (800, 800),
|
||
title: str = "",
|
||
return_svg: bool = False,
|
||
show_atom_labels: bool = True
|
||
) -> Union[str, None]:
|
||
"""
|
||
可视化大环内酯,自动根据原子类型进行颜色编码
|
||
|
||
Args:
|
||
mol: RDKit分子对象或SMILES字符串
|
||
ring_size: 环大小(12-20),如果为None则自动检测
|
||
allowed_ring_atom_types: 允许的环原子类型(除酯氧外)
|
||
- None: 不限制,显示所有原子类型
|
||
- ["C"]: 只允许碳原子(默认筛选条件)
|
||
- ["C", "N"]: 允许碳和氮原子
|
||
size: 图像大小
|
||
title: 图像标题
|
||
return_svg: 是否返回SVG字符串
|
||
show_atom_labels: 是否显示原子编号标签
|
||
|
||
Returns:
|
||
SVG字符串(如果return_svg=True)或显示图像
|
||
"""
|
||
# 转换SMILES到分子对象
|
||
if isinstance(mol, str):
|
||
mol_obj = Chem.MolFromSmiles(mol)
|
||
if mol_obj is None:
|
||
if return_svg:
|
||
return None
|
||
else:
|
||
print("❌ 无法解析SMILES字符串")
|
||
return None
|
||
else:
|
||
mol_obj = mol
|
||
|
||
# 自动检测环大小(如果未指定)
|
||
if ring_size is None:
|
||
ring_sizes = []
|
||
for size in range(12, 21):
|
||
ring_atoms = get_ring_atoms_by_size(mol_obj, size)
|
||
if ring_atoms:
|
||
# 检查是否为有效大环内酯
|
||
analyzer = MacroLactoneAnalyzer()
|
||
if analyzer.is_valid_macrolactone(mol_obj, size):
|
||
ring_sizes.append(size)
|
||
|
||
if not ring_sizes:
|
||
if return_svg:
|
||
return None
|
||
else:
|
||
print("❌ 未找到12-20元大环内酯")
|
||
return None
|
||
|
||
# 使用找到的第一个环大小
|
||
ring_size = ring_sizes[0]
|
||
if len(ring_sizes) > 1:
|
||
print(f"⚠️ 找到多个环大小: {ring_sizes},使用 {ring_size}")
|
||
|
||
# 获取环编号信息
|
||
ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, validation_reason) = \
|
||
get_macrolactone_numbering(mol_obj, ring_size, allowed_ring_atom_types)
|
||
|
||
if not is_valid or not ring_atoms:
|
||
if return_svg:
|
||
return None
|
||
else:
|
||
print(f"❌ 无法获取环编号: {validation_reason}")
|
||
return None
|
||
|
||
# 创建分子副本并设置原子标签
|
||
mol_copy = Chem.Mol(mol_obj)
|
||
AllChem.Compute2DCoords(mol_copy)
|
||
|
||
if show_atom_labels:
|
||
for atom_idx in ring_atoms:
|
||
if atom_idx in ring_numbering:
|
||
atom = mol_copy.GetAtomWithIdx(atom_idx)
|
||
atom.SetProp("atomNote", str(ring_numbering[atom_idx]))
|
||
|
||
# 定义原子颜色方案
|
||
def get_atom_color(symbol: str, is_ester_oxygen: bool = False) -> Tuple[float, float, float]:
|
||
"""获取原子颜色"""
|
||
if is_ester_oxygen:
|
||
return (1.0, 0.4, 0.4) # 酯氧用深红色
|
||
elif symbol == 'C':
|
||
return (0.7, 0.8, 1.0) # 碳用蓝色
|
||
elif symbol == 'N':
|
||
return (1.0, 0.8, 1.0) # 氮用粉色
|
||
elif symbol == 'O':
|
||
return (1.0, 0.7, 0.7) # 氧用红色
|
||
elif symbol == 'S':
|
||
return (1.0, 1.0, 0.6) # 硫用黄色
|
||
elif symbol == 'P':
|
||
return (0.8, 0.6, 1.0) # 磷用紫色
|
||
else:
|
||
return (0.8, 1.0, 0.8) # 其他用绿色
|
||
|
||
# 设置原子颜色
|
||
atom_colors = {}
|
||
atom_type_stats = {}
|
||
|
||
for atom_idx in ring_atoms:
|
||
atom = mol_obj.GetAtomWithIdx(atom_idx)
|
||
symbol = atom.GetSymbol()
|
||
is_ester_oxygen = (atom_idx == ester_oxygen)
|
||
|
||
# 设置颜色
|
||
color = get_atom_color(symbol, is_ester_oxygen)
|
||
atom_colors[atom_idx] = color
|
||
|
||
# 统计原子类型(不包括酯氧)
|
||
if not is_ester_oxygen:
|
||
atom_type_stats[symbol] = atom_type_stats.get(symbol, 0) + 1
|
||
|
||
# 绘制分子
|
||
# 确保size是元组格式
|
||
if isinstance(size, int):
|
||
size = (size, size)
|
||
elif isinstance(size, (list, tuple)) and len(size) == 1:
|
||
size = (size[0], size[0])
|
||
elif not isinstance(size, (list, tuple)) or len(size) != 2:
|
||
size = (800, 800) # 默认大小
|
||
|
||
drawer = rdMolDraw2D.MolDraw2DSVG(int(size[0]), int(size[1]))
|
||
drawer.SetFontSize(12) # 设置合适的字体大小(最小为6)
|
||
|
||
# 注意:某些RDKit版本不支持DrawTitle,暂时注释掉
|
||
# if title:
|
||
# drawer.DrawTitle(title)
|
||
|
||
drawer.DrawMolecule(mol_copy,
|
||
highlightAtoms=ring_atoms,
|
||
highlightAtomColors=atom_colors)
|
||
drawer.FinishDrawing()
|
||
svg = drawer.GetDrawingText()
|
||
|
||
# 显示统计信息
|
||
if not return_svg:
|
||
display(SVG(svg))
|
||
|
||
print(f"\n📊 分子信息:")
|
||
print(f" 环大小: {ring_size} 元环")
|
||
print(f" 羰基碳位置: {ring_numbering.get(carbonyl_carbon, 'N/A')} (深红色标记)")
|
||
print(f" 酯氧位置: {ring_numbering.get(ester_oxygen, 'N/A')} (深红色标记)")
|
||
print(f" 环原子组成: {atom_type_stats}")
|
||
print(f" 筛选条件: {validation_reason}")
|
||
|
||
# 显示颜色说明
|
||
print(f"\n🎨 颜色说明:")
|
||
print(f" 深红色: 酯键氧原子 (位置2)")
|
||
print(f" 蓝色: 碳原子")
|
||
print(f" 粉色: 氮原子")
|
||
print(f" 红色: 氧原子")
|
||
print(f" 黄色: 硫原子")
|
||
print(f" 紫色: 磷原子")
|
||
print(f" 绿色: 其他原子")
|
||
|
||
return svg if return_svg else None
|
||
|
||
|
||
def batch_visualize_macrolactones(
|
||
data_file: Path,
|
||
ring_sizes: List[int] = None,
|
||
allowed_ring_atom_types: Optional[List[str]] = None,
|
||
max_examples_per_size: int = 3,
|
||
output_dir: Optional[Path] = None
|
||
) -> Dict[int, List[Dict]]:
|
||
"""
|
||
批量可视化大环内酯分子
|
||
|
||
Args:
|
||
data_file: CSV数据文件路径
|
||
ring_sizes: 要测试的环大小列表,默认为12-20
|
||
allowed_ring_atom_types: 允许的环原子类型
|
||
max_examples_per_size: 每种环大小最大示例数
|
||
output_dir: 输出目录,如果指定则保存SVG文件
|
||
|
||
Returns:
|
||
按环大小分组的可视化结果字典
|
||
"""
|
||
if ring_sizes is None:
|
||
ring_sizes = list(range(12, 21))
|
||
|
||
print(f"🔍 开始批量可视化大环内酯")
|
||
print(f" 数据文件: {data_file}")
|
||
print(f" 环大小范围: {ring_sizes}")
|
||
print(f" 筛选条件: {allowed_ring_atom_types or '无限制'}")
|
||
|
||
# 加载数据
|
||
if not data_file.exists():
|
||
print(f"❌ 数据文件不存在: {data_file}")
|
||
return {}
|
||
|
||
df = pd.read_csv(data_file)
|
||
print(f"✓ 加载数据: {len(df)} 个分子")
|
||
|
||
results = {}
|
||
|
||
for ring_size in ring_sizes:
|
||
print(f"\n🔄 处理 {ring_size} 元环...")
|
||
|
||
size_results = []
|
||
found_count = 0
|
||
|
||
for idx, row in df.iterrows():
|
||
if found_count >= max_examples_per_size:
|
||
break
|
||
|
||
smiles = row.get('smiles', '')
|
||
if not smiles:
|
||
continue
|
||
|
||
try:
|
||
# 测试分子
|
||
mol = Chem.MolFromSmiles(smiles)
|
||
if not mol:
|
||
continue
|
||
|
||
# 检查是否为指定大小的有效大环内酯
|
||
analyzer = MacroLactoneAnalyzer()
|
||
if not analyzer.is_valid_macrolactone(mol, ring_size):
|
||
continue
|
||
|
||
# 应用筛选条件(如果指定)
|
||
if allowed_ring_atom_types:
|
||
is_valid, validation_reason = get_macrolactone_numbering(
|
||
mol, ring_size, allowed_ring_atom_types
|
||
)[5]
|
||
if not is_valid:
|
||
continue
|
||
|
||
# 获取详细信息
|
||
ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, validation_reason) = \
|
||
get_macrolactone_numbering(mol, ring_size, allowed_ring_atom_types)
|
||
|
||
if is_valid:
|
||
# 分析原子组成
|
||
composition = analyze_ring_atom_composition(smiles, ring_size)
|
||
|
||
result = {
|
||
'index': idx,
|
||
'smiles': smiles,
|
||
'molecule_id': row.get('molecule_id', f'mol_{idx}'),
|
||
'ring_size': ring_size,
|
||
'composition': composition,
|
||
'validation_reason': validation_reason,
|
||
'carbonyl_carbon_pos': ring_numbering.get(carbonyl_carbon, 'N/A'),
|
||
'ester_oxygen_pos': ring_numbering.get(ester_oxygen, 'N/A')
|
||
}
|
||
|
||
size_results.append(result)
|
||
found_count += 1
|
||
|
||
print(f" ✓ 找到示例 {found_count}: 分子{idx} ({composition})")
|
||
|
||
# 保存可视化(如果指定了输出目录)
|
||
if output_dir:
|
||
output_dir.mkdir(parents=True, exist_ok=True)
|
||
filename = f"ring{ring_size}_example{found_count}_mol{idx}.svg"
|
||
output_path = output_dir / filename
|
||
|
||
svg = visualize_macrolactone_with_auto_coloring(
|
||
mol, ring_size, allowed_ring_atom_types,
|
||
return_svg=True,
|
||
title=f"{ring_size}-元环示例 {found_count}"
|
||
)
|
||
|
||
if svg:
|
||
with open(output_path, 'w', encoding='utf-8') as f:
|
||
f.write(svg)
|
||
print(f" 💾 保存到: {output_path}")
|
||
|
||
except Exception as e:
|
||
print(f" ⚠️ 处理分子 {idx} 时出错: {str(e)}")
|
||
continue
|
||
|
||
results[ring_size] = size_results
|
||
print(f" 📊 {ring_size} 元环: 找到 {len(size_results)} 个示例")
|
||
|
||
# 显示总体统计
|
||
print(f"\n📈 总体统计:")
|
||
total_found = sum(len(results[size]) for size in ring_sizes)
|
||
print(f" 总示例数: {total_found}")
|
||
|
||
for ring_size in ring_sizes:
|
||
count = len(results[ring_size])
|
||
print(f" {ring_size} 元环: {count} 个示例")
|
||
|
||
return results
|
||
|
||
|
||
def test_all_ring_sizes_with_filtering(
|
||
filtered_data_dir: Path,
|
||
allowed_ring_atom_types: Optional[List[str]] = ["C"], # 默认只允许碳原子
|
||
output_dir: Optional[Path] = None
|
||
) -> Dict[int, Dict]:
|
||
"""
|
||
测试所有环大小(12-20)的筛选效果
|
||
|
||
Args:
|
||
filtered_data_dir: 包含filtered CSV文件的目录
|
||
allowed_ring_atom_types: 允许的环原子类型
|
||
output_dir: 输出目录
|
||
|
||
Returns:
|
||
详细的测试结果
|
||
"""
|
||
print(f"🧪 开始测试所有环大小的筛选效果")
|
||
print(f" 数据目录: {filtered_data_dir}")
|
||
print(f" 筛选条件: {allowed_ring_atom_types or '无限制'}")
|
||
|
||
all_results = {}
|
||
|
||
for ring_size in range(12, 21):
|
||
print(f"\n{'='*60}")
|
||
print(f"🔍 测试 {ring_size} 元环")
|
||
print(f"{'='*60}")
|
||
|
||
# 查找对应的数据文件
|
||
data_file = filtered_data_dir / f'macrolactone_ring{ring_size}_filtered.csv'
|
||
|
||
if not data_file.exists():
|
||
print(f"⚠️ 未找到 {ring_size} 元环数据文件: {data_file}")
|
||
all_results[ring_size] = {
|
||
'data_file_exists': False,
|
||
'total_molecules': 0,
|
||
'valid_macrolactones': 0,
|
||
'filtered_molecules': 0,
|
||
'examples': []
|
||
}
|
||
continue
|
||
|
||
# 批量测试
|
||
batch_results = batch_visualize_macrolactones(
|
||
data_file,
|
||
ring_sizes=[ring_size],
|
||
allowed_ring_atom_types=allowed_ring_atom_types,
|
||
max_examples_per_size=5,
|
||
output_dir=output_dir / f'ring{ring_size}_examples' if output_dir else None
|
||
)
|
||
|
||
# 统计结果
|
||
size_results = batch_results.get(ring_size, [])
|
||
|
||
# 加载完整数据进行统计
|
||
try:
|
||
df = pd.read_csv(data_file)
|
||
total_molecules = len(df)
|
||
|
||
# 统计有效大环内酯数量
|
||
valid_count = 0
|
||
filtered_count = 0
|
||
|
||
for _, row in df.iterrows():
|
||
smiles = row.get('smiles', '')
|
||
if not smiles:
|
||
continue
|
||
|
||
try:
|
||
mol = Chem.MolFromSmiles(smiles)
|
||
if mol:
|
||
analyzer = MacroLactoneAnalyzer()
|
||
if analyzer.is_valid_macrolactone(mol, ring_size):
|
||
valid_count += 1
|
||
|
||
if allowed_ring_atom_types:
|
||
is_valid, _ = get_macrolactone_numbering(
|
||
mol, ring_size, allowed_ring_atom_types
|
||
)[5]
|
||
if is_valid:
|
||
filtered_count += 1
|
||
else:
|
||
filtered_count += 1
|
||
except:
|
||
continue
|
||
|
||
all_results[ring_size] = {
|
||
'data_file_exists': True,
|
||
'total_molecules': total_molecules,
|
||
'valid_macrolactones': valid_count,
|
||
'filtered_molecules': filtered_count,
|
||
'filter_rate': filtered_count / valid_count * 100 if valid_count > 0 else 0,
|
||
'examples': size_results
|
||
}
|
||
|
||
print(f"\n📊 {ring_size} 元环统计:")
|
||
print(f" 总分子数: {total_molecules}")
|
||
print(f" 有效大环内酯: {valid_count}")
|
||
print(f" 通过筛选: {filtered_count}")
|
||
print(f" 筛选通过率: {filtered_count/valid_count*100:.1f}%" if valid_count > 0 else " 筛选通过率: 0%")
|
||
|
||
except Exception as e:
|
||
print(f"❌ 统计 {ring_size} 元环时出错: {str(e)}")
|
||
all_results[ring_size] = {
|
||
'data_file_exists': True,
|
||
'error': str(e),
|
||
'examples': size_results
|
||
}
|
||
|
||
# 显示总体统计
|
||
print(f"\n{'='*60}")
|
||
print(f"📈 所有环大小测试总结")
|
||
print(f"{'='*60}")
|
||
|
||
total_molecules = sum(result.get('total_molecules', 0) for result in all_results.values())
|
||
total_valid = sum(result.get('valid_macrolactones', 0) for result in all_results.values())
|
||
total_filtered = sum(result.get('filtered_molecules', 0) for result in all_results.values())
|
||
|
||
print(f"总分子数: {total_molecules}")
|
||
print(f"总有效大环内酯: {total_valid}")
|
||
print(f"总通过筛选: {total_filtered}")
|
||
print(f"总体筛选通过率: {total_filtered/total_valid*100:.1f}%" if total_valid > 0 else "总体筛选通过率: 0%")
|
||
|
||
return all_results
|
||
|
||
|
||
def analyze_ring_atom_composition(mol: Union[Chem.Mol, str], ring_size: int) -> Dict[str, int]:
|
||
# 首先获取环编号信息
|
||
ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, _) = \
|
||
get_macrolactone_numbering(mol, ring_size)
|
||
|
||
if not is_valid or not ring_atoms:
|
||
return {}
|
||
|
||
# 转换SMILES到分子对象(如果需要)
|
||
if isinstance(mol, str):
|
||
mol = Chem.MolFromSmiles(mol)
|
||
if mol is None:
|
||
return {}
|
||
|
||
# 统计原子类型
|
||
composition = {}
|
||
|
||
for atom_idx in ring_atoms:
|
||
# 跳过酯键氧原子
|
||
if atom_idx == ester_oxygen:
|
||
continue
|
||
|
||
atom = mol.GetAtomWithIdx(atom_idx)
|
||
symbol = atom.GetSymbol()
|
||
composition[symbol] = composition.get(symbol, 0) + 1
|
||
|
||
return composition
|
||
|
||
|
||
def draw_mol_with_ring_numbering(
|
||
mol: Union[Chem.Mol, str],
|
||
ring_numbering: Optional[Dict[int, int]] = None,
|
||
ring_atoms: Optional[List[int]] = None,
|
||
size: Tuple[int, int] = (800, 800),
|
||
title: str = "",
|
||
ring_size: int = 16,
|
||
return_svg: bool = False
|
||
) -> Optional[str]:
|
||
"""
|
||
Draw molecule with ring numbering displayed.
|
||
|
||
This function can work in two modes:
|
||
1. If ring_numbering is provided, use it directly
|
||
2. If ring_numbering is None, automatically compute it for macrolactones
|
||
|
||
Args:
|
||
mol: RDKit molecule object or SMILES string
|
||
ring_numbering: Optional pre-computed numbering dictionary
|
||
ring_atoms: Optional pre-computed ring atoms list
|
||
size: Image size (width, height)
|
||
title: Optional title for the image
|
||
ring_size: Ring size for auto-numbering (default 16)
|
||
return_svg: If True, return SVG string instead of displaying
|
||
|
||
Returns:
|
||
SVG string if return_svg=True, None otherwise (displays in notebook)
|
||
"""
|
||
# Convert SMILES to molecule if needed
|
||
if isinstance(mol, str):
|
||
mol = Chem.MolFromSmiles(mol)
|
||
if mol is None:
|
||
print("Error: Could not parse SMILES")
|
||
return None
|
||
|
||
# Auto-compute numbering if not provided
|
||
if ring_numbering is None:
|
||
ring_atoms, ring_numbering, _, _, _ = get_macrolactone_numbering(mol, ring_size)
|
||
if ring_numbering is None:
|
||
print("Error: Could not compute ring numbering")
|
||
return None
|
||
|
||
# Get ring atoms if not provided
|
||
if ring_atoms is None:
|
||
ring_atoms = list(ring_numbering.keys())
|
||
|
||
# Create drawer
|
||
drawer = rdMolDraw2D.MolDraw2DSVG(size[0], size[1])
|
||
drawer.SetFontSize(6) # Minimum font size is 6
|
||
|
||
draw_options = drawer.drawOptions()
|
||
draw_options.addAtomIndices = False
|
||
|
||
# Highlight ring atoms
|
||
highlight_atoms = list(ring_atoms)
|
||
atom_colors = {}
|
||
for atom_idx in ring_atoms:
|
||
atom_colors[atom_idx] = (0.8, 0.9, 1.0) # Light blue
|
||
|
||
# Create copy and set atom notes (ring numbering)
|
||
mol_copy = Chem.Mol(mol)
|
||
for atom_idx in ring_atoms:
|
||
if atom_idx in ring_numbering:
|
||
atom = mol_copy.GetAtomWithIdx(atom_idx)
|
||
atom.SetProp("atomNote", str(ring_numbering[atom_idx]))
|
||
|
||
# Draw molecule
|
||
drawer.DrawMolecule(
|
||
mol_copy,
|
||
highlightAtoms=highlight_atoms,
|
||
highlightAtomColors=atom_colors
|
||
)
|
||
|
||
drawer.FinishDrawing()
|
||
svg = drawer.GetDrawingText()
|
||
|
||
if return_svg:
|
||
return svg
|
||
else:
|
||
display(SVG(svg))
|
||
return None
|
||
|
||
|
||
def visualize_molecule_with_numbering(
|
||
mol: Union[Chem.Mol, str],
|
||
ring_size: int = 16,
|
||
size: Tuple[int, int] = (800, 800),
|
||
title: str = ""
|
||
) -> Tuple[Optional[Dict[int, int]], Optional[List[int]]]:
|
||
"""
|
||
Convenience function to visualize a molecule with automatic ring numbering.
|
||
|
||
This is the main function to use in Jupyter notebooks for quick visualization.
|
||
|
||
Args:
|
||
mol: RDKit molecule object or SMILES string
|
||
ring_size: Ring size for macrolactones (default 16)
|
||
size: Image size (width, height)
|
||
title: Optional title
|
||
|
||
Returns:
|
||
Tuple of (ring_numbering_dict, ring_atoms_list)
|
||
"""
|
||
# Get numbering
|
||
ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen = \
|
||
get_macrolactone_numbering(mol, ring_size)
|
||
|
||
if ring_numbering is None:
|
||
print("Error: Could not compute ring numbering")
|
||
return None, None
|
||
|
||
# Display
|
||
print(f"Ring size: {ring_size}")
|
||
print(f"Carbonyl C position: {ring_numbering.get(carbonyl_carbon, 'N/A')}")
|
||
print(f"Ester O position: {ring_numbering.get(ester_oxygen, 'N/A')}")
|
||
print(f"Numbering range: 1-{len(ring_numbering)}")
|
||
|
||
draw_mol_with_ring_numbering(
|
||
mol, ring_numbering, ring_atoms, size, title, ring_size
|
||
)
|
||
|
||
return ring_numbering, ring_atoms
|
||
|
||
|
||
def identify_side_chains(mol: Chem.Mol, ring_atoms: List[int]) -> List[Tuple[int, int]]:
|
||
"""
|
||
Identify side chains attached to the ring.
|
||
|
||
Args:
|
||
mol: RDKit molecule object
|
||
ring_atoms: List of atom indices in the ring
|
||
|
||
Returns:
|
||
List of tuples (ring_atom_idx, side_chain_first_atom_idx)
|
||
"""
|
||
side_chains = []
|
||
ring_atom_set = set(ring_atoms)
|
||
|
||
for ring_atom_idx in ring_atoms:
|
||
atom = mol.GetAtomWithIdx(ring_atom_idx)
|
||
for neighbor in atom.GetNeighbors():
|
||
neighbor_idx = neighbor.GetIdx()
|
||
# If neighbor is not in the ring, it's a side chain
|
||
if neighbor_idx not in ring_atom_set:
|
||
side_chains.append((ring_atom_idx, neighbor_idx))
|
||
|
||
return side_chains
|
||
|
||
|
||
def extract_side_chain_fragment(
|
||
mol: Chem.Mol,
|
||
ring_atom_idx: int,
|
||
side_chain_start_idx: int,
|
||
ring_atoms: List[int]
|
||
) -> Optional[str]:
|
||
"""
|
||
Extract a side chain fragment as a SMILES string with dummy atom (*) at attachment point.
|
||
|
||
Args:
|
||
mol: RDKit molecule object
|
||
ring_atom_idx: Index of the ring atom where side chain is attached
|
||
side_chain_start_idx: Index of the first atom in the side chain
|
||
ring_atoms: List of all ring atom indices
|
||
|
||
Returns:
|
||
SMILES string of the fragment (contains dummy atom *), or None if extraction failed
|
||
"""
|
||
ring_atom_set = set(ring_atoms)
|
||
visited = set()
|
||
queue = [side_chain_start_idx]
|
||
side_chain_atoms = []
|
||
|
||
# Use BFS to collect all atoms in the side chain
|
||
while queue:
|
||
current_idx = queue.pop(0)
|
||
if current_idx in visited:
|
||
continue
|
||
|
||
visited.add(current_idx)
|
||
side_chain_atoms.append(current_idx)
|
||
|
||
atom = mol.GetAtomWithIdx(current_idx)
|
||
for neighbor in atom.GetNeighbors():
|
||
neighbor_idx = neighbor.GetIdx()
|
||
# Only continue into non-ring atoms
|
||
if neighbor_idx not in ring_atom_set and neighbor_idx not in visited:
|
||
queue.append(neighbor_idx)
|
||
|
||
if not side_chain_atoms:
|
||
return None
|
||
|
||
# Get the bond type to the ring
|
||
bond_to_ring = mol.GetBondBetweenAtoms(ring_atom_idx, side_chain_start_idx)
|
||
if bond_to_ring is None:
|
||
return None
|
||
bond_type = bond_to_ring.GetBondType()
|
||
|
||
# Create a new molecule with only the side chain atoms
|
||
fragment_mol = Chem.RWMol()
|
||
old_to_new = {}
|
||
|
||
# Add atoms
|
||
for old_idx in side_chain_atoms:
|
||
atom = mol.GetAtomWithIdx(old_idx)
|
||
new_atom = Chem.Atom(atom.GetAtomicNum())
|
||
new_atom.SetFormalCharge(atom.GetFormalCharge())
|
||
new_atom.SetIsAromatic(atom.GetIsAromatic())
|
||
new_idx = fragment_mol.AddAtom(new_atom)
|
||
old_to_new[old_idx] = new_idx
|
||
|
||
# Add bonds (within side chain)
|
||
for old_idx in side_chain_atoms:
|
||
atom = mol.GetAtomWithIdx(old_idx)
|
||
for neighbor in atom.GetNeighbors():
|
||
neighbor_idx = neighbor.GetIdx()
|
||
if neighbor_idx in old_to_new and old_idx < neighbor_idx:
|
||
bond = mol.GetBondBetweenAtoms(old_idx, neighbor_idx)
|
||
fragment_mol.AddBond(
|
||
old_to_new[old_idx],
|
||
old_to_new[neighbor_idx],
|
||
bond.GetBondType()
|
||
)
|
||
|
||
# Add dummy atom (atomic number 0, displays as * in SMILES)
|
||
# Dummy atom connects to the atom that was originally connected to the ring
|
||
attachment_point = old_to_new[side_chain_start_idx]
|
||
dummy_atom_idx = fragment_mol.AddAtom(Chem.Atom(0))
|
||
|
||
# Add bond between dummy atom and attachment point, keeping original bond type
|
||
fragment_mol.AddBond(attachment_point, dummy_atom_idx, bond_type)
|
||
|
||
# Convert to molecule and get SMILES
|
||
try:
|
||
fragment_mol = fragment_mol.GetMol()
|
||
Chem.SanitizeMol(fragment_mol)
|
||
fragment_smiles = Chem.MolToSmiles(fragment_mol)
|
||
return fragment_smiles
|
||
except Exception as e:
|
||
return None
|
||
|
||
|
||
def cleave_side_chain_at_position(
|
||
mol: Chem.Mol,
|
||
ring_atoms: List[int],
|
||
ring_numbering: Dict[int, int],
|
||
position: int
|
||
) -> List[str]:
|
||
"""
|
||
Cleave side chains at a specified ring position.
|
||
|
||
Args:
|
||
mol: RDKit molecule object
|
||
ring_atoms: List of ring atom indices
|
||
ring_numbering: Ring numbering dictionary mapping atom index to position
|
||
position: Position number to cleave (1-N, where N is ring size)
|
||
|
||
Returns:
|
||
List of SMILES strings, one for each side chain (contains dummy atom *,
|
||
compatible with molzip for reconstruction)
|
||
"""
|
||
# Find the ring atom index for this position
|
||
ring_atom_idx = None
|
||
for atom_idx, pos in ring_numbering.items():
|
||
if pos == position:
|
||
ring_atom_idx = atom_idx
|
||
break
|
||
|
||
if ring_atom_idx is None:
|
||
return []
|
||
|
||
# Find all side chains attached to this ring atom
|
||
side_chains = identify_side_chains(mol, ring_atoms)
|
||
fragments = []
|
||
|
||
for ring_atom, side_start in side_chains:
|
||
if ring_atom == ring_atom_idx:
|
||
fragment_smiles = extract_side_chain_fragment(mol, ring_atom, side_start, ring_atoms)
|
||
if fragment_smiles:
|
||
fragments.append(fragment_smiles)
|
||
|
||
return fragments
|
||
|