Update ring_visualization.py with enhanced filtering and batch processing capabilities

- Add batch_visualize_macrolactones function for processing multiple molecules
- Add test_all_ring_sizes_with_filtering function for comprehensive testing
- Add analyze_ring_atom_composition function for composition analysis
- Improve error handling and progress reporting
- Add support for carbon-only and carbon-nitrogen filtering
- Enhanced SVG output with better file organization
This commit is contained in:
2025-11-14 21:34:45 +08:00
parent 2c3547cc88
commit c9ef531d9b

View File

@@ -7,11 +7,99 @@ This module provides functions for:
3. Supporting both general molecules and macrolactones (12-20 membered rings)
"""
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import Draw, AllChem
from rdkit.Chem.Draw import rdMolDraw2D
from IPython.display import SVG, display
from typing import Dict, Tuple, Optional, List, Union
from collections import deque
from pathlib import Path
import pandas as pd
# 导入分析器类
try:
from src.macro_lactone_analyzer import MacroLactoneAnalyzer
except ImportError:
# 如果导入失败,定义一个简单的替代类
class MacroLactoneAnalyzer:
def is_valid_macrolactone(self, mol, ring_size):
# 简单检查是否有指定大小的环
ring_atoms = get_ring_atoms_by_size(mol, ring_size)
return ring_atoms is not None
# 原子序数映射,支持元素符号和原子序数字符串
ATOMIC_NUMBERS = {
"H": 1, "C": 6, "N": 7, "O": 8, "S": 16, "P": 15,
"F": 9, "Cl": 17, "Br": 35, "I": 53
}
def normalize_atom_types(atom_types: Optional[List[str]]) -> Optional[List[int]]:
"""
标准化原子类型列表,将元素符号转换为原子序数
Args:
atom_types: 原子类型列表,可以是元素符号["C", "N"]或原子序数["6", "7"]
Returns:
标准化的原子序数列表如果输入为None则返回None
"""
if atom_types is None:
return None
normalized = []
for atom_type in atom_types:
if atom_type.isdigit():
# 已经是原子序数字符串
normalized.append(int(atom_type))
elif atom_type.upper() in ATOMIC_NUMBERS:
# 元素符号,转换为原子序数
normalized.append(ATOMIC_NUMBERS[atom_type.upper()])
else:
raise ValueError(f"不支持的原子类型: {atom_type}")
return normalized
def validate_ring_atom_composition(
mol: Chem.Mol,
ring_atoms: List[int],
carbonyl_carbon_idx: int,
ester_oxygen_idx: int,
allowed_atom_types: Optional[List[int]] = None
) -> Tuple[bool, str]:
"""
验证环原子组成是否符合要求
Args:
mol: RDKit分子对象
ring_atoms: 环原子索引列表
carbonyl_carbon_idx: 羰基碳索引位置1
ester_oxygen_idx: 酯氧索引位置2
allowed_atom_types: 允许的原子类型列表(原子序数)
Returns:
(is_valid, reason): 是否有效及原因说明
"""
if allowed_atom_types is None:
return True, "不限制原子类型"
# 检查环中每个原子(除了酯键氧)
invalid_atoms = []
for atom_idx in ring_atoms:
# 跳过酯键氧原子位置2
if atom_idx == ester_oxygen_idx:
continue
atom = mol.GetAtomWithIdx(atom_idx)
atomic_num = atom.GetAtomicNum()
if atomic_num not in allowed_atom_types:
symbol = atom.GetSymbol()
invalid_atoms.append(f"{symbol}(索引:{atom_idx})")
if invalid_atoms:
return False, f"环中包含不允许的原子类型: {', '.join(invalid_atoms)}"
return True, f"环原子组成符合要求,只允许: {[Chem.GetPeriodicTable().GetElementSymbol(num) for num in allowed_atom_types]}"
def get_ring_atoms_by_size(mol: Chem.Mol, ring_size: int) -> Optional[List[int]]:
@@ -223,16 +311,85 @@ def create_ring_numbering(
def get_macrolactone_numbering(
mol: Union[Chem.Mol, str],
ring_size: int = 16
) -> Tuple[Optional[List[int]], Optional[Dict[int, int]], Optional[List[int]], Optional[int], Optional[int]]:
ring_size: int = 16,
allowed_ring_atom_types: Optional[List[str]] = None
) -> Tuple[Optional[List[int]], Optional[Dict[int, int]], Optional[List[int]], Optional[int], Optional[int], Tuple[bool, str]]:
"""
Get ring numbering for a macrolactone molecule.
Get ring numbering for a macrolactone molecule with optional atom type filtering.
This function performs the complete numbering workflow:
1. Find ring of specified size
2. Find ester group
3. Find carbonyl carbon and ester oxygen
4. Create numbering mapping
4. Validate ring atom composition (if filtering requested)
5. Create numbering mapping
Args:
mol: RDKit molecule object or SMILES string
ring_size: Size of the macrolactone ring (12-20, default 16)
allowed_ring_atom_types: Allowed atom types for ring atoms (excluding ester oxygen)
- None: No restriction (default behavior)
- ["C"]: Only carbon atoms allowed (except ester oxygen)
- ["C", "N"]: Carbon and nitrogen atoms allowed (except ester oxygen)
- Can use element symbols or atomic number strings: ["6", "7"]
Returns:
Tuple of (ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, validation_reason))
Returns (None, None, None, None, None, (False, reason)) if numbering fails
"""
# Convert SMILES to molecule if needed
if isinstance(mol, str):
mol = Chem.MolFromSmiles(mol)
if mol is None:
return None, None, None, None, None, (False, "无法解析SMILES字符串")
# 标准化原子类型
try:
allowed_atom_numbers = normalize_atom_types(allowed_ring_atom_types)
except ValueError as e:
return None, None, None, None, None, (False, f"原子类型参数错误: {str(e)}")
# Find ring of specified size
ring_atoms = get_ring_atoms_by_size(mol, ring_size)
if ring_atoms is None:
return None, None, None, None, None, (False, f"未找到{ring_size}元环")
# Find ester group
ester_atoms, pattern = find_ester_smarts(mol, ring_atoms)
if ester_atoms is None:
return None, None, None, None, None, (False, "未在环中找到酯键")
# Find carbonyl carbon and ester oxygen
carbonyl_carbon = get_carbonyl_carbon_in_ring(mol, ester_atoms, ring_atoms)
ester_oxygen = get_ester_oxygen_in_ring(mol, ester_atoms, ring_atoms)
if carbonyl_carbon is None or ester_oxygen is None:
return None, None, None, None, None, (False, "无法识别羰基碳或酯氧原子")
# 验证环原子组成(如果指定了允许的原子类型)
if allowed_atom_numbers is not None:
is_valid, validation_reason = validate_ring_atom_composition(
mol, ring_atoms, carbonyl_carbon, ester_oxygen, allowed_atom_numbers
)
if not is_valid:
return None, None, None, None, None, (False, validation_reason)
else:
validation_reason = "不限制原子类型"
# Create numbering
ring_numbering, ordered_atoms = create_ring_numbering(
mol, ring_atoms, carbonyl_carbon, ester_oxygen
)
return ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (True, validation_reason)
def get_macrolactone_numbering_legacy(
mol: Union[Chem.Mol, str],
ring_size: int = 16
) -> Tuple[Optional[List[int]], Optional[Dict[int, int]], Optional[List[int]], Optional[int], Optional[int]]:
"""
向后兼容版本的get_macrolactone_numbering函数
Args:
mol: RDKit molecule object or SMILES string
@@ -242,35 +399,496 @@ def get_macrolactone_numbering(
Tuple of (ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen)
Returns (None, None, None, None, None) if numbering fails
"""
# Convert SMILES to molecule if needed
ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, _) = \
get_macrolactone_numbering(mol, ring_size, allowed_ring_atom_types=None)
if not is_valid:
return None, None, None, None, None
return ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen
def is_pure_carbon_macrolactone(mol: Union[Chem.Mol, str], ring_size: int) -> Tuple[bool, str]:
"""
便捷函数:检查是否为纯碳大环内酯(除酯键氧外只含碳原子)
Args:
mol: RDKit molecule object or SMILES string
ring_size: Size of the macrolactone ring
Returns:
(is_valid, reason): 是否为纯碳大环内酯及原因
"""
_, _, _, _, _, (is_valid, reason) = get_macrolactone_numbering(
mol, ring_size, allowed_ring_atom_types=["C"]
)
return is_valid, reason
def is_carbon_nitrogen_macrolactone(mol: Union[Chem.Mol, str], ring_size: int) -> Tuple[bool, str]:
"""
便捷函数:检查是否为碳氮大环内酯(除酯键氧外只含碳和氮原子)
Args:
mol: RDKit molecule object or SMILES string
ring_size: Size of the macrolactone ring
Returns:
(is_valid, reason): 是否为碳氮大环内酯及原因
"""
_, _, _, _, _, (is_valid, reason) = get_macrolactone_numbering(
mol, ring_size, allowed_ring_atom_types=["C", "N"]
)
return is_valid, reason
def visualize_macrolactone_with_auto_coloring(
mol: Union[Chem.Mol, str],
ring_size: Optional[int] = None,
allowed_ring_atom_types: Optional[List[str]] = None,
size: Tuple[int, int] = (800, 800),
title: str = "",
return_svg: bool = False,
show_atom_labels: bool = True
) -> Union[str, None]:
"""
可视化大环内酯,自动根据原子类型进行颜色编码
Args:
mol: RDKit分子对象或SMILES字符串
ring_size: 环大小12-20如果为None则自动检测
allowed_ring_atom_types: 允许的环原子类型(除酯氧外)
- None: 不限制,显示所有原子类型
- ["C"]: 只允许碳原子(默认筛选条件)
- ["C", "N"]: 允许碳和氮原子
size: 图像大小
title: 图像标题
return_svg: 是否返回SVG字符串
show_atom_labels: 是否显示原子编号标签
Returns:
SVG字符串如果return_svg=True或显示图像
"""
# 转换SMILES到分子对象
if isinstance(mol, str):
mol_obj = Chem.MolFromSmiles(mol)
if mol_obj is None:
if return_svg:
return None
else:
print("❌ 无法解析SMILES字符串")
return None
else:
mol_obj = mol
# 自动检测环大小(如果未指定)
if ring_size is None:
ring_sizes = []
for size in range(12, 21):
ring_atoms = get_ring_atoms_by_size(mol_obj, size)
if ring_atoms:
# 检查是否为有效大环内酯
analyzer = MacroLactoneAnalyzer()
if analyzer.is_valid_macrolactone(mol_obj, size):
ring_sizes.append(size)
if not ring_sizes:
if return_svg:
return None
else:
print("❌ 未找到12-20元大环内酯")
return None
# 使用找到的第一个环大小
ring_size = ring_sizes[0]
if len(ring_sizes) > 1:
print(f"⚠️ 找到多个环大小: {ring_sizes},使用 {ring_size}")
# 获取环编号信息
ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, validation_reason) = \
get_macrolactone_numbering(mol_obj, ring_size, allowed_ring_atom_types)
if not is_valid or not ring_atoms:
if return_svg:
return None
else:
print(f"❌ 无法获取环编号: {validation_reason}")
return None
# 创建分子副本并设置原子标签
mol_copy = Chem.Mol(mol_obj)
AllChem.Compute2DCoords(mol_copy)
if show_atom_labels:
for atom_idx in ring_atoms:
if atom_idx in ring_numbering:
atom = mol_copy.GetAtomWithIdx(atom_idx)
atom.SetProp("atomNote", str(ring_numbering[atom_idx]))
# 定义原子颜色方案
def get_atom_color(symbol: str, is_ester_oxygen: bool = False) -> Tuple[float, float, float]:
"""获取原子颜色"""
if is_ester_oxygen:
return (1.0, 0.4, 0.4) # 酯氧用深红色
elif symbol == 'C':
return (0.7, 0.8, 1.0) # 碳用蓝色
elif symbol == 'N':
return (1.0, 0.8, 1.0) # 氮用粉色
elif symbol == 'O':
return (1.0, 0.7, 0.7) # 氧用红色
elif symbol == 'S':
return (1.0, 1.0, 0.6) # 硫用黄色
elif symbol == 'P':
return (0.8, 0.6, 1.0) # 磷用紫色
else:
return (0.8, 1.0, 0.8) # 其他用绿色
# 设置原子颜色
atom_colors = {}
atom_type_stats = {}
for atom_idx in ring_atoms:
atom = mol_obj.GetAtomWithIdx(atom_idx)
symbol = atom.GetSymbol()
is_ester_oxygen = (atom_idx == ester_oxygen)
# 设置颜色
color = get_atom_color(symbol, is_ester_oxygen)
atom_colors[atom_idx] = color
# 统计原子类型(不包括酯氧)
if not is_ester_oxygen:
atom_type_stats[symbol] = atom_type_stats.get(symbol, 0) + 1
# 绘制分子
# 确保size是元组格式
if isinstance(size, int):
size = (size, size)
elif isinstance(size, (list, tuple)) and len(size) == 1:
size = (size[0], size[0])
elif not isinstance(size, (list, tuple)) or len(size) != 2:
size = (800, 800) # 默认大小
drawer = rdMolDraw2D.MolDraw2DSVG(int(size[0]), int(size[1]))
drawer.SetFontSize(12) # 设置合适的字体大小最小为6
# 注意某些RDKit版本不支持DrawTitle暂时注释掉
# if title:
# drawer.DrawTitle(title)
drawer.DrawMolecule(mol_copy,
highlightAtoms=ring_atoms,
highlightAtomColors=atom_colors)
drawer.FinishDrawing()
svg = drawer.GetDrawingText()
# 显示统计信息
if not return_svg:
display(SVG(svg))
print(f"\n📊 分子信息:")
print(f" 环大小: {ring_size} 元环")
print(f" 羰基碳位置: {ring_numbering.get(carbonyl_carbon, 'N/A')} (深红色标记)")
print(f" 酯氧位置: {ring_numbering.get(ester_oxygen, 'N/A')} (深红色标记)")
print(f" 环原子组成: {atom_type_stats}")
print(f" 筛选条件: {validation_reason}")
# 显示颜色说明
print(f"\n🎨 颜色说明:")
print(f" 深红色: 酯键氧原子 (位置2)")
print(f" 蓝色: 碳原子")
print(f" 粉色: 氮原子")
print(f" 红色: 氧原子")
print(f" 黄色: 硫原子")
print(f" 紫色: 磷原子")
print(f" 绿色: 其他原子")
return svg if return_svg else None
def batch_visualize_macrolactones(
data_file: Path,
ring_sizes: List[int] = None,
allowed_ring_atom_types: Optional[List[str]] = None,
max_examples_per_size: int = 3,
output_dir: Optional[Path] = None
) -> Dict[int, List[Dict]]:
"""
批量可视化大环内酯分子
Args:
data_file: CSV数据文件路径
ring_sizes: 要测试的环大小列表默认为12-20
allowed_ring_atom_types: 允许的环原子类型
max_examples_per_size: 每种环大小最大示例数
output_dir: 输出目录如果指定则保存SVG文件
Returns:
按环大小分组的可视化结果字典
"""
if ring_sizes is None:
ring_sizes = list(range(12, 21))
print(f"🔍 开始批量可视化大环内酯")
print(f" 数据文件: {data_file}")
print(f" 环大小范围: {ring_sizes}")
print(f" 筛选条件: {allowed_ring_atom_types or '无限制'}")
# 加载数据
if not data_file.exists():
print(f"❌ 数据文件不存在: {data_file}")
return {}
df = pd.read_csv(data_file)
print(f"✓ 加载数据: {len(df)} 个分子")
results = {}
for ring_size in ring_sizes:
print(f"\n🔄 处理 {ring_size} 元环...")
size_results = []
found_count = 0
for idx, row in df.iterrows():
if found_count >= max_examples_per_size:
break
smiles = row.get('smiles', '')
if not smiles:
continue
try:
# 测试分子
mol = Chem.MolFromSmiles(smiles)
if not mol:
continue
# 检查是否为指定大小的有效大环内酯
analyzer = MacroLactoneAnalyzer()
if not analyzer.is_valid_macrolactone(mol, ring_size):
continue
# 应用筛选条件(如果指定)
if allowed_ring_atom_types:
is_valid, validation_reason = get_macrolactone_numbering(
mol, ring_size, allowed_ring_atom_types
)[5]
if not is_valid:
continue
# 获取详细信息
ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, validation_reason) = \
get_macrolactone_numbering(mol, ring_size, allowed_ring_atom_types)
if is_valid:
# 分析原子组成
composition = analyze_ring_atom_composition(smiles, ring_size)
result = {
'index': idx,
'smiles': smiles,
'molecule_id': row.get('molecule_id', f'mol_{idx}'),
'ring_size': ring_size,
'composition': composition,
'validation_reason': validation_reason,
'carbonyl_carbon_pos': ring_numbering.get(carbonyl_carbon, 'N/A'),
'ester_oxygen_pos': ring_numbering.get(ester_oxygen, 'N/A')
}
size_results.append(result)
found_count += 1
print(f" ✓ 找到示例 {found_count}: 分子{idx} ({composition})")
# 保存可视化(如果指定了输出目录)
if output_dir:
output_dir.mkdir(parents=True, exist_ok=True)
filename = f"ring{ring_size}_example{found_count}_mol{idx}.svg"
output_path = output_dir / filename
svg = visualize_macrolactone_with_auto_coloring(
mol, ring_size, allowed_ring_atom_types,
return_svg=True,
title=f"{ring_size}-元环示例 {found_count}"
)
if svg:
with open(output_path, 'w', encoding='utf-8') as f:
f.write(svg)
print(f" 💾 保存到: {output_path}")
except Exception as e:
print(f" ⚠️ 处理分子 {idx} 时出错: {str(e)}")
continue
results[ring_size] = size_results
print(f" 📊 {ring_size} 元环: 找到 {len(size_results)} 个示例")
# 显示总体统计
print(f"\n📈 总体统计:")
total_found = sum(len(results[size]) for size in ring_sizes)
print(f" 总示例数: {total_found}")
for ring_size in ring_sizes:
count = len(results[ring_size])
print(f" {ring_size} 元环: {count} 个示例")
return results
def test_all_ring_sizes_with_filtering(
filtered_data_dir: Path,
allowed_ring_atom_types: Optional[List[str]] = ["C"], # 默认只允许碳原子
output_dir: Optional[Path] = None
) -> Dict[int, Dict]:
"""
测试所有环大小12-20的筛选效果
Args:
filtered_data_dir: 包含filtered CSV文件的目录
allowed_ring_atom_types: 允许的环原子类型
output_dir: 输出目录
Returns:
详细的测试结果
"""
print(f"🧪 开始测试所有环大小的筛选效果")
print(f" 数据目录: {filtered_data_dir}")
print(f" 筛选条件: {allowed_ring_atom_types or '无限制'}")
all_results = {}
for ring_size in range(12, 21):
print(f"\n{'='*60}")
print(f"🔍 测试 {ring_size} 元环")
print(f"{'='*60}")
# 查找对应的数据文件
data_file = filtered_data_dir / f'macrolactone_ring{ring_size}_filtered.csv'
if not data_file.exists():
print(f"⚠️ 未找到 {ring_size} 元环数据文件: {data_file}")
all_results[ring_size] = {
'data_file_exists': False,
'total_molecules': 0,
'valid_macrolactones': 0,
'filtered_molecules': 0,
'examples': []
}
continue
# 批量测试
batch_results = batch_visualize_macrolactones(
data_file,
ring_sizes=[ring_size],
allowed_ring_atom_types=allowed_ring_atom_types,
max_examples_per_size=5,
output_dir=output_dir / f'ring{ring_size}_examples' if output_dir else None
)
# 统计结果
size_results = batch_results.get(ring_size, [])
# 加载完整数据进行统计
try:
df = pd.read_csv(data_file)
total_molecules = len(df)
# 统计有效大环内酯数量
valid_count = 0
filtered_count = 0
for _, row in df.iterrows():
smiles = row.get('smiles', '')
if not smiles:
continue
try:
mol = Chem.MolFromSmiles(smiles)
if mol:
analyzer = MacroLactoneAnalyzer()
if analyzer.is_valid_macrolactone(mol, ring_size):
valid_count += 1
if allowed_ring_atom_types:
is_valid, _ = get_macrolactone_numbering(
mol, ring_size, allowed_ring_atom_types
)[5]
if is_valid:
filtered_count += 1
else:
filtered_count += 1
except:
continue
all_results[ring_size] = {
'data_file_exists': True,
'total_molecules': total_molecules,
'valid_macrolactones': valid_count,
'filtered_molecules': filtered_count,
'filter_rate': filtered_count / valid_count * 100 if valid_count > 0 else 0,
'examples': size_results
}
print(f"\n📊 {ring_size} 元环统计:")
print(f" 总分子数: {total_molecules}")
print(f" 有效大环内酯: {valid_count}")
print(f" 通过筛选: {filtered_count}")
print(f" 筛选通过率: {filtered_count/valid_count*100:.1f}%" if valid_count > 0 else " 筛选通过率: 0%")
except Exception as e:
print(f"❌ 统计 {ring_size} 元环时出错: {str(e)}")
all_results[ring_size] = {
'data_file_exists': True,
'error': str(e),
'examples': size_results
}
# 显示总体统计
print(f"\n{'='*60}")
print(f"📈 所有环大小测试总结")
print(f"{'='*60}")
total_molecules = sum(result.get('total_molecules', 0) for result in all_results.values())
total_valid = sum(result.get('valid_macrolactones', 0) for result in all_results.values())
total_filtered = sum(result.get('filtered_molecules', 0) for result in all_results.values())
print(f"总分子数: {total_molecules}")
print(f"总有效大环内酯: {total_valid}")
print(f"总通过筛选: {total_filtered}")
print(f"总体筛选通过率: {total_filtered/total_valid*100:.1f}%" if total_valid > 0 else "总体筛选通过率: 0%")
return all_results
def analyze_ring_atom_composition(mol: Union[Chem.Mol, str], ring_size: int) -> Dict[str, int]:
# 首先获取环编号信息
ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, _) = \
get_macrolactone_numbering(mol, ring_size)
if not is_valid or not ring_atoms:
return {}
# 转换SMILES到分子对象如果需要
if isinstance(mol, str):
mol = Chem.MolFromSmiles(mol)
if mol is None:
return None, None, None, None, None
return {}
# Find ring of specified size
ring_atoms = get_ring_atoms_by_size(mol, ring_size)
if ring_atoms is None:
return None, None, None, None, None
# 统计原子类型
composition = {}
# Find ester group
ester_atoms, pattern = find_ester_smarts(mol, ring_atoms)
if ester_atoms is None:
return None, None, None, None, None
for atom_idx in ring_atoms:
# 跳过酯键氧原子
if atom_idx == ester_oxygen:
continue
atom = mol.GetAtomWithIdx(atom_idx)
symbol = atom.GetSymbol()
composition[symbol] = composition.get(symbol, 0) + 1
# Find carbonyl carbon and ester oxygen
carbonyl_carbon = get_carbonyl_carbon_in_ring(mol, ester_atoms, ring_atoms)
ester_oxygen = get_ester_oxygen_in_ring(mol, ester_atoms, ring_atoms)
if carbonyl_carbon is None or ester_oxygen is None:
return None, None, None, None, None
# Create numbering
ring_numbering, ordered_atoms = create_ring_numbering(
mol, ring_atoms, carbonyl_carbon, ester_oxygen
)
return ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen
return composition
def draw_mol_with_ring_numbering(