495 lines
17 KiB
Python
495 lines
17 KiB
Python
#!/usr/bin/env python
|
||
# -*- encoding: utf-8 -*-
|
||
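"""
@file        :macro_lactone_analyzer.py
@Description :Macrolactone analyzer - ring-size detection, validation and classification.

Main features:
    1. Detect the sizes of all 12- to 20-membered rings in a molecule.
    2. Validate whether a ring is a macrolactone ring (contains an ester bond).
    3. Support single-molecule and batch analysis.
    4. Classify molecules into single-ring and bridged-ring groups.
    5. Support dynamically built SMARTS pattern matching.

@Date        :2025-11-09 (updated: 2025-11-09)
@Author      :based on the design by lyzeng
@version     :2.0
"""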
|
||
|
||
from dataclasses import dataclass, field
|
||
from typing import List, Dict, Optional, Union, Tuple, Set
|
||
from pathlib import Path
|
||
import pandas as pd
|
||
from rdkit import Chem
|
||
from rdkit.Chem import Descriptors, Crippen, QED
|
||
|
||
# Project-local helper functions
|
||
from .ring_visualization import get_ring_atoms_by_size
|
||
|
||
|
||
@dataclass
class MacroLactoneAnalyzer:
    """Identify, validate and classify macrolactone (large-ring lactone) molecules.

    Attributes:
        smiles_list: SMILES strings queued for batch analysis; extend it with
            ``add_smiles``.
        ester_patterns: SMARTS patterns used to recognise an ester bond. When
            left empty, ``__post_init__`` installs the built-in defaults. The
            instance methods (``get_single_ring_info``, ``analyze_list``,
            ``classify_molecules``) fall back to these patterns whenever a
            call does not pass its own ``ester_patterns``.
    """

    smiles_list: List[str] = field(default_factory=list)
    ester_patterns: List[str] = field(default_factory=list)

    def __post_init__(self):
        """Install the default ester SMARTS patterns when none were supplied."""
        if not self.ester_patterns:
            self.ester_patterns = self._default_ester_patterns()

    @staticmethod
    def _default_ester_patterns() -> List[str]:
        """Return a fresh list holding the built-in ester SMARTS patterns."""
        return [
            "[C](=O)[O]",  # generic ester bond
            "C(=O)O",  # simple ester bond
        ]

    @staticmethod
    def detect_ring_sizes(mol: Chem.Mol) -> List[int]:
        """Return the distinct sizes of all 12- to 20-membered rings in ``mol``.

        Args:
            mol: RDKit molecule; ``None`` is accepted.

        Returns:
            Sorted (ascending) list of unique ring sizes within 12..20, or an
            empty list when ``mol`` is ``None`` or has no such ring.
        """
        if mol is None:
            return []

        # A set removes duplicates when several rings share the same size.
        sizes = {len(ring) for ring in mol.GetRingInfo().AtomRings()}
        return sorted(size for size in sizes if 12 <= size <= 20)

    @staticmethod
    def has_ester_on_ring(mol: Chem.Mol, ring_atoms: List[int], ester_patterns: Optional[List[str]] = None) -> bool:
        """Check whether an ester bond lies on the given ring.

        Args:
            mol: RDKit molecule; ``None`` is accepted.
            ring_atoms: Atom indices that make up the ring.
            ester_patterns: Optional SMARTS patterns; the built-in defaults
                are used when this is ``None`` or empty. Each pattern is
                expected to list the carbonyl carbon first and the ester
                oxygen third, because match positions 0 and 2 are the ones
                tested against the ring.

        Returns:
            ``True`` when some pattern match has both its first and third
            atom inside ``ring_atoms``; ``False`` otherwise (including for a
            ``None`` molecule or an empty ring).
        """
        if mol is None or not ring_atoms:
            return False

        ring_atom_set = set(ring_atoms)
        patterns = ester_patterns or MacroLactoneAnalyzer._default_ester_patterns()

        for pattern_str in patterns:
            pattern = Chem.MolFromSmarts(pattern_str)
            if pattern is None:
                # Unparseable SMARTS: skip it rather than fail the whole check.
                continue

            for match in mol.GetSubstructMatches(pattern):
                if len(match) < 3:
                    continue
                carbonyl_c, ester_o = match[0], match[2]
                if carbonyl_c in ring_atom_set and ester_o in ring_atom_set:
                    return True

        return False

    @staticmethod
    def is_valid_macrolactone(mol: Chem.Mol, size: int, ester_patterns: Optional[List[str]] = None) -> bool:
        """Check whether ``mol`` holds a macrolactone ring of the given size.

        Args:
            mol: RDKit molecule; ``None`` is accepted.
            size: Ring size to look for.
            ester_patterns: Optional ester SMARTS patterns.

        Returns:
            ``True`` when a ring of ``size`` atoms is found and carries an
            ester bond, ``False`` otherwise.
        """
        if mol is None:
            return False

        # NOTE(review): get_ring_atoms_by_size is a project helper not visible
        # here; presumably it returns the atom indices of a ring of that size
        # and something falsy when there is none - confirm.
        ring_atoms = get_ring_atoms_by_size(mol, size)
        if not ring_atoms:
            return False

        return MacroLactoneAnalyzer.has_ester_on_ring(mol, ring_atoms, ester_patterns)

    @staticmethod
    def analyze_smiles(smiles: str,
                       ring_range: range = range(12, 21),
                       ester_patterns: Optional[List[str]] = None) -> Optional[Dict[str, Union[int, List[int], bool]]]:
        """Run macrolactone detection on a single SMILES string.

        Args:
            smiles: SMILES representation of the molecule.
            ring_range: Ring sizes to validate (default 12 to 20). Detection
                itself is limited to 12..20 by ``detect_ring_sizes``.
            ester_patterns: Optional ester SMARTS patterns.

        Returns:
            ``None`` when no ring in ``ring_range`` is a valid macrolactone
            ring; otherwise a dict with:
                - 'ring_sizes': all detected ring sizes (12..20)
                - 'valid_sizes': ring sizes that are valid macrolactone rings
                - 'is_macrolactone': always ``True`` in a returned dict
                - 'has_ester': whether any detected ring carries an ester bond
                - 'is_bridge': ``True`` when more than one size is valid

        Raises:
            TypeError: If ``smiles`` is not a string.
            ValueError: If RDKit cannot parse ``smiles``.
        """
        if not isinstance(smiles, str):
            raise TypeError("输入必须为SMILES字符串。")

        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f"无法解析SMILES: {smiles}")

        ring_sizes = MacroLactoneAnalyzer.detect_ring_sizes(mol)

        # Iterate ring_range (not ring_sizes) so the order of valid_sizes
        # follows the caller-supplied range.
        valid_sizes = [
            size for size in ring_range
            if size in ring_sizes
            and MacroLactoneAnalyzer.is_valid_macrolactone(mol, size, ester_patterns)
        ]

        if not valid_sizes:
            return None

        return {
            'ring_sizes': ring_sizes,
            'valid_sizes': valid_sizes,
            'is_macrolactone': True,
            'has_ester': any(
                MacroLactoneAnalyzer.has_ester_on_ring(
                    mol, get_ring_atoms_by_size(mol, size), ester_patterns
                ) for size in ring_sizes
            ),
            'is_bridge': len(valid_sizes) > 1
        }

    @staticmethod
    def dynamic_smarts_match(smiles: str, ring_size: int) -> Optional[Tuple[int, ...]]:
        """Match a macrolactone motif with a SMARTS built for ``ring_size``.

        The query is ``[r{ring_size}]([#8][#6](=[#8]))``: an atom satisfying
        the ``r<ring_size>`` ring-size primitive, bonded to an oxygen that is
        bonded to a carbon carrying a double-bonded oxygen.

        Args:
            smiles: SMILES string.
            ring_size: Ring size inserted into the SMARTS query.

        Returns:
            The atom indices of the first match (a tuple, as produced by
            ``GetSubstructMatches``), or ``None`` when the SMILES or the
            SMARTS cannot be parsed or nothing matches.
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None

        smarts = f'[r{ring_size}]([#8][#6](=[#8]))'
        query = Chem.MolFromSmarts(smarts)
        if query is None:
            return None

        matches = mol.GetSubstructMatches(query)
        return matches[0] if matches else None

    def get_single_ring_info(self,
                             smiles: str,
                             ring_range: range = range(12, 21),
                             ester_patterns: Optional[List[str]] = None) -> Optional[Dict[str, Union[int, List[int], bool]]]:
        """Return detailed macrolactone information for a single SMILES string.

        Args:
            smiles: SMILES representation of the molecule.
            ring_range: Ring sizes to validate (default 12 to 20).
            ester_patterns: Optional ester SMARTS patterns; the instance's
                ``ester_patterns`` are used when omitted.

        Returns:
            The result dict of ``analyze_smiles``, or ``None`` when the
            molecule is not a macrolactone.

        Raises:
            TypeError: If ``smiles`` is not a string.
            ValueError: If ``smiles`` cannot be parsed.
        """
        return self.analyze_smiles(smiles, ring_range, ester_patterns or self.ester_patterns)

    def analyze_list(self,
                     smiles_list: Optional[List[str]] = None,
                     ring_range: range = range(12, 21),
                     ester_patterns: Optional[List[str]] = None) -> Dict[str, Union[int, List[str]]]:
        """Compute summary statistics over a list of SMILES strings.

        Args:
            smiles_list: SMILES strings to analyse; the instance's
                ``smiles_list`` is used when this is ``None``.
            ring_range: Ring sizes to validate.
            ester_patterns: Optional ester SMARTS patterns; the instance's
                ``ester_patterns`` are used when omitted.

        Returns:
            A dict that always carries the same keys, even for empty input:
                - 'total': number of input molecules
                - 'macrolactones': number of macrolactone molecules
                - 'bridge_rings': number of bridged-ring molecules
                - 'ring_size_stats': per-size count of single-ring molecules
                - 'valid_molecules': SMILES of single-ring macrolactones
                - 'bridge_molecules': SMILES of bridged-ring molecules
                - 'failed_molecules': ``(smiles, error message)`` tuples for
                  inputs whose analysis raised
        """
        target_list = smiles_list if smiles_list is not None else self.smiles_list
        if not target_list:
            target_list = []
        patterns = ester_patterns or self.ester_patterns

        stats = {
            'total': len(target_list),
            'macrolactones': 0,
            'bridge_rings': 0,
            'ring_size_stats': {size: 0 for size in ring_range},
            'valid_molecules': [],
            'bridge_molecules': [],
            'failed_molecules': []
        }

        for smiles in target_list:
            try:
                result = self.analyze_smiles(smiles, ring_range, patterns)
            except Exception as e:
                # Per-molecule boundary: record the failure and keep going so
                # one bad entry does not abort the whole batch.
                stats['failed_molecules'].append((smiles, str(e)))
                continue

            if not result:
                continue

            stats['macrolactones'] += 1
            if result['is_bridge']:
                stats['bridge_rings'] += 1
                stats['bridge_molecules'].append(smiles)
            else:
                size = result['valid_sizes'][0]
                stats['ring_size_stats'][size] += 1
                stats['valid_molecules'].append(smiles)

        return stats

    def classify_molecules(self,
                           df: pd.DataFrame,
                           smiles_column: str = 'smiles',
                           id_column: Optional[str] = None,
                           ring_range: range = range(12, 21),
                           ester_patterns: Optional[List[str]] = None) -> Tuple[Dict[int, pd.DataFrame], pd.DataFrame]:
        """Split the molecules of a DataFrame into groups by macrolactone ring size.

        Progress and a final summary are printed to stdout. Every row counts
        towards the "processed" total, including rows whose SMILES cannot be
        parsed (reported as having no valid ring) and rows whose processing
        raised (reported as failed).

        Args:
            df: DataFrame holding the SMILES strings.
            smiles_column: Name of the SMILES column.
            id_column: Optional name of an ID column, used in warnings only.
            ring_range: Ring sizes to classify into; detected sizes outside
                this range are ignored.
            ester_patterns: Optional ester SMARTS patterns; the instance's
                ``ester_patterns`` are used when omitted.

        Returns:
            Tuple of ``(ring_size_to_df, ambiguous_df)``:
                - ring_size_to_df: maps every size in ``ring_range`` to a
                  DataFrame of the rows with exactly that one valid size
                  (extra columns 'detected_ring_sizes' and 'ring_size');
                  empty DataFrame when no row matched.
                - ambiguous_df: rows with more than one valid ring size
                  (extra column 'detected_ring_sizes').
        """
        patterns = ester_patterns or self.ester_patterns
        print(f"开始对 {len(df)} 个分子进行环数识别和分类...")

        rows_by_size = {size: [] for size in ring_range}
        ambiguous_rows = []

        stats = {
            'processed': 0,
            'single_ring': {size: 0 for size in ring_range},
            'ambiguous': 0,
            'no_valid_ring': 0,
            'failed': 0
        }

        for idx, row in df.iterrows():
            smiles = row[smiles_column]
            mol_id = row[id_column] if id_column else f"row_{idx}"

            try:
                mol = Chem.MolFromSmiles(smiles)
                if mol is None:
                    # Unparseable SMILES: treated as "no valid ring".
                    valid_sizes = []
                else:
                    # Restrict to ring_range so every size has a bucket in
                    # rows_by_size / stats['single_ring'].
                    valid_sizes = [
                        size for size in self.detect_ring_sizes(mol)
                        if size in ring_range
                        and self.is_valid_macrolactone(mol, size, patterns)
                    ]

                if len(valid_sizes) == 1:
                    # Exactly one valid ring size.
                    size = valid_sizes[0]
                    row_copy = row.copy()
                    row_copy['detected_ring_sizes'] = valid_sizes
                    row_copy['ring_size'] = size
                    rows_by_size[size].append(row_copy)
                    stats['single_ring'][size] += 1
                elif len(valid_sizes) > 1:
                    # Several candidate ring sizes (bridged-ring case).
                    row_copy = row.copy()
                    row_copy['detected_ring_sizes'] = valid_sizes
                    ambiguous_rows.append(row_copy)
                    stats['ambiguous'] += 1
                else:
                    stats['no_valid_ring'] += 1
            except Exception as e:
                # Per-row boundary: report and continue with the next row.
                stats['failed'] += 1
                print(f"警告:分子 {mol_id} 处理失败: {e}")

            stats['processed'] += 1

            # Progress report every 100 rows.
            if stats['processed'] % 100 == 0:
                print(f"已处理 {stats['processed']}/{len(df)} 个分子...")

        # Turn the collected rows into DataFrames.
        ring_size_to_df = {}
        for size in ring_range:
            rows = rows_by_size[size]
            ring_size_to_df[size] = pd.DataFrame(rows) if rows else pd.DataFrame()

        ambiguous_df = pd.DataFrame(ambiguous_rows) if ambiguous_rows else pd.DataFrame()

        # Print the summary.
        print("\n" + "="*60)
        print("环数识别和分类结果:")
        print("="*60)
        print(f"总处理分子数: {stats['processed']}")
        print(f"无有效环分子: {stats['no_valid_ring']}")
        print(f"桥环分子(多个环数): {stats['ambiguous']}")
        if stats['failed'] > 0:
            print(f"处理失败分子: {stats['failed']}")
        print("\n各环大小分子数:")
        for size in ring_range:
            count = stats['single_ring'][size]
            if count > 0:
                print(f" {size}元环: {count} 个分子")
        print("="*60)

        return ring_size_to_df, ambiguous_df

    def add_smiles(self, smiles: Union[str, List[str]]):
        """Append one or more SMILES strings to this analyzer's ``smiles_list``.

        A list is validated in full before anything is appended, so a bad
        element leaves ``smiles_list`` unchanged.

        Args:
            smiles: A single SMILES string or a list of SMILES strings.

        Raises:
            TypeError: If ``smiles`` is neither a string nor a list, or if a
                list element is not a string.
        """
        if isinstance(smiles, str):
            self.smiles_list.append(smiles)
        elif isinstance(smiles, list):
            if not all(isinstance(s, str) for s in smiles):
                raise TypeError("列表中的每个元素必须为SMILES字符串。")
            self.smiles_list.extend(smiles)
        else:
            raise TypeError("输入必须为SMILES字符串或字符串列表。")

    def calculate_molecular_properties(self, smiles: str) -> Optional[Dict[str, float]]:
        """Compute basic RDKit descriptors for a molecule.

        Args:
            smiles: SMILES string.

        Returns:
            Dict with 'mol_weight', 'logP', 'qed', 'tpsa' and 'num_atoms', or
            ``None`` when the SMILES cannot be parsed or a descriptor call
            raises (a warning is printed in the latter case).
        """
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                return None

            return {
                'mol_weight': Descriptors.MolWt(mol),
                'logP': Crippen.MolLogP(mol),
                'qed': QED.qed(mol),
                'tpsa': Descriptors.TPSA(mol),
                'num_atoms': mol.GetNumAtoms()
            }
        except Exception as e:
            # Best-effort helper: report the problem and signal failure with None.
            print(f"警告:计算分子性质失败: {e}")
            return None
|
||
|
||
|
||
# Example usage and smoke tests
|
||
if __name__ == "__main__":
    # Demo driver: exercises single-molecule analysis, batch statistics and
    # DataFrame classification, printing the outcome of each step.
    separator = "=" * 60

    analyzer = MacroLactoneAnalyzer()

    test_smiles = [
        "O=C1CCCCCCCC(=O)OCC/C=C/C=C/1",  # 16-membered macrolactone
        "CC(=O)OC1=CC=CC=C1C(=O)O",  # not a macrolactone
    ]

    # --- Single-molecule analysis ---
    print(separator)
    print("单分子分析测试")
    print(separator)
    for smiles in test_smiles:
        result = analyzer.get_single_ring_info(smiles)
        print(f"\nSMILES: {smiles[:50]}...")
        if not result:
            print(" ✗ 非大环内酯或无有效环")
            continue
        print(" ✓ 有效大环内酯")
        print(f" 环大小: {result['valid_sizes']}")
        print(f" 是否桥环: {result['is_bridge']}")

    # --- Batch analysis ---
    print("\n" + separator)
    print("批量分析测试")
    print(separator)
    analyzer.add_smiles(test_smiles)
    stats = analyzer.analyze_list()
    print("\n统计结果:")
    print(f" 总分子数: {stats['total']}")
    print(f" 大环内酯: {stats['macrolactones']}")
    print(f" 桥环: {stats['bridge_rings']}")
    print(f" 环大小统计: {stats['ring_size_stats']}")

    # --- DataFrame classification ---
    print("\n" + separator)
    print("DataFrame分类测试")
    print(separator)
    demo_smiles = test_smiles + ["O=C1CCCCCCCC(=O)OCC/C=C/C=C/1"]
    test_df = pd.DataFrame({
        'ID': ['mol1', 'mol2', 'mol3'],
        'smiles': demo_smiles,
    })

    ring_size_dfs, ambiguous_df = analyzer.classify_molecules(
        test_df, smiles_column='smiles', id_column='ID'
    )
    print("\n分类结果:")
    for size, df_size in ring_size_dfs.items():
        if df_size.empty:
            continue
        print(f" {size}元环: {len(df_size)} 个分子")
    print(f"桥环分子: {len(ambiguous_df)} 个")
|