Update ring_visualization.py with enhanced filtering and batch processing capabilities

- Add batch_visualize_macrolactones function for processing multiple molecules
- Add test_all_ring_sizes_with_filtering function for comprehensive testing
- Add analyze_ring_atom_composition function for composition analysis
- Improve error handling and progress reporting
- Add support for carbon-only and carbon-nitrogen filtering
- Enhance SVG output with better file organization
2025-11-14 21:34:45 +08:00
parent 2c3547cc88
commit c9ef531d9b
1 changed files with 646 additions and 28 deletions
--- a/src/ring_visualization.py
+++ b/src/ring_visualization.py
@@ -7,11 +7,99 @@ This module provides functions for:
 3. Supporting both general molecules and macrolactones (12-20 membered rings)
 """
 from rdkit import Chem
-from rdkit.Chem import Draw
+from rdkit.Chem import Draw, AllChem
 from rdkit.Chem.Draw import rdMolDraw2D
 from IPython.display import SVG, display
 from typing import Dict, Tuple, Optional, List, Union
 from collections import deque
+from pathlib import Path
+import pandas as pd
+
+# Import the analyzer class
+try:
+    from src.macro_lactone_analyzer import MacroLactoneAnalyzer
+except ImportError:
+    # If the import fails, define a minimal stand-in class
+    class MacroLactoneAnalyzer:
+        def is_valid_macrolactone(self, mol, ring_size):
+            # Simplified check: only tests that a ring of the given size exists
+            ring_atoms = get_ring_atoms_by_size(mol, ring_size)
+            return ring_atoms is not None
+
+# Element symbol -> atomic number mapping consulted by normalize_atom_types
+ATOMIC_NUMBERS = {
+    "H": 1, "C": 6, "N": 7, "O": 8, "S": 16, "P": 15, 
+    "F": 9, "Cl": 17, "Br": 35, "I": 53
+}
+
+def normalize_atom_types(atom_types: Optional[List[str]]) -> Optional[List[int]]:
+    """
+    Normalize atom types to atomic numbers (raises ValueError for unknown entries).
+
+    Args:
+        atom_types: Element symbols in any letter case (["C", "cl"]) or atomic-number strings (["6", "7"])
+
+    Returns:
+        List of atomic numbers, or None when the input is None
+    """
+    if atom_types is None:
+        return None
+
+    normalized = []
+    for atom_type in atom_types:
+        if atom_type.isdigit():
+            # Already an atomic-number string
+            normalized.append(int(atom_type))
+        elif atom_type.capitalize() in ATOMIC_NUMBERS:
+            # capitalize(), not upper(): two-letter keys are stored as "Cl"/"Br"
+            normalized.append(ATOMIC_NUMBERS[atom_type.capitalize()])
+        else:
+            raise ValueError(f"不支持的原子类型: {atom_type}")
+
+    return normalized
+
+def validate_ring_atom_composition(
+    mol: Chem.Mol, 
+    ring_atoms: List[int], 
+    carbonyl_carbon_idx: int, 
+    ester_oxygen_idx: int,
+    allowed_atom_types: Optional[List[int]] = None
+) -> Tuple[bool, str]:
+    """
+    Check that every ring atom other than the ester oxygen has an allowed element.
+
+    Args:
+        mol: RDKit molecule object
+        ring_atoms: Atom indices forming the ring
+        carbonyl_carbon_idx: Index of the carbonyl carbon (position 1); not consulted here
+        ester_oxygen_idx: Index of the ester oxygen (position 2); exempt from the check
+        allowed_atom_types: Permitted atomic numbers, or None for no restriction
+
+    Returns:
+        (is_valid, reason): the verdict plus a human-readable explanation
+    """
+    # No whitelist supplied: any composition is acceptable
+    if allowed_atom_types is None:
+        return True, "不限制原子类型"
+
+    # Describe every offending atom, in ring_atoms order
+    offenders = []
+    for idx in ring_atoms:
+        # The ester oxygen (position 2) is exempt from the check
+        if idx != ester_oxygen_idx:
+            ring_atom = mol.GetAtomWithIdx(idx)
+            if ring_atom.GetAtomicNum() not in allowed_atom_types:
+                element = ring_atom.GetSymbol()
+                offenders.append(f"{element}(索引:{idx})")
+
+    # Success path: report the permitted elements by symbol
+    if not offenders:
+        allowed_symbols = [
+            Chem.GetPeriodicTable().GetElementSymbol(num) for num in allowed_atom_types
+        ]
+        return True, f"环原子组成符合要求，只允许: {allowed_symbols}"
+
+    return False, f"环中包含不允许的原子类型: {', '.join(offenders)}"


 def get_ring_atoms_by_size(mol: Chem.Mol, ring_size: int) -> Optional[List[int]]:
@@ -223,16 +311,85 @@ def create_ring_numbering(

 def get_macrolactone_numbering(
    mol: Union[Chem.Mol, str], 
-    ring_size: int = 16
-) -> Tuple[Optional[List[int]], Optional[Dict[int, int]], Optional[List[int]], Optional[int], Optional[int]]:
+    ring_size: int = 16,
+    allowed_ring_atom_types: Optional[List[str]] = None
+) -> Tuple[Optional[List[int]], Optional[Dict[int, int]], Optional[List[int]], Optional[int], Optional[int], Tuple[bool, str]]:
    """
-    Get ring numbering for a macrolactone molecule.
+    Get ring numbering for a macrolactone molecule with optional atom type filtering.
    
    This function performs the complete numbering workflow:
    1. Find ring of specified size
    2. Find ester group
    3. Find carbonyl carbon and ester oxygen
-    4. Create numbering mapping
+    4. Validate ring atom composition (if filtering requested)
+    5. Create numbering mapping
+    
+    Args:
+        mol: RDKit molecule object or SMILES string
+        ring_size: Size of the macrolactone ring (12-20, default 16)
+        allowed_ring_atom_types: Allowed atom types for ring atoms (excluding ester oxygen)
+            - None: No restriction (default behavior)
+            - ["C"]: Only carbon atoms allowed (except ester oxygen)
+            - ["C", "N"]: Carbon and nitrogen atoms allowed (except ester oxygen)
+            - Can use element symbols or atomic number strings: ["6", "7"]
+    
+    Returns:
+        Tuple of (ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, validation_reason))
+        Returns (None, None, None, None, None, (False, reason)) if numbering fails
+    """
+    # Convert SMILES to molecule if needed
+    if isinstance(mol, str):
+        mol = Chem.MolFromSmiles(mol)
+        if mol is None:
+            return None, None, None, None, None, (False, "无法解析SMILES字符串")
+    
+    # Normalize the allowed atom types to atomic numbers
+    try:
+        allowed_atom_numbers = normalize_atom_types(allowed_ring_atom_types)
+    except ValueError as e:
+        return None, None, None, None, None, (False, f"原子类型参数错误: {str(e)}")
+    
+    # Find ring of specified size
+    ring_atoms = get_ring_atoms_by_size(mol, ring_size)
+    if ring_atoms is None:
+        return None, None, None, None, None, (False, f"未找到{ring_size}元环")
+    
+    # Find ester group
+    ester_atoms, pattern = find_ester_smarts(mol, ring_atoms)
+    if ester_atoms is None:
+        return None, None, None, None, None, (False, "未在环中找到酯键")
+    
+    # Find carbonyl carbon and ester oxygen
+    carbonyl_carbon = get_carbonyl_carbon_in_ring(mol, ester_atoms, ring_atoms)
+    ester_oxygen = get_ester_oxygen_in_ring(mol, ester_atoms, ring_atoms)
+    
+    if carbonyl_carbon is None or ester_oxygen is None:
+        return None, None, None, None, None, (False, "无法识别羰基碳或酯氧原子")
+    
+    # Validate ring atom composition (only when allowed atom types were given)
+    if allowed_atom_numbers is not None:
+        is_valid, validation_reason = validate_ring_atom_composition(
+            mol, ring_atoms, carbonyl_carbon, ester_oxygen, allowed_atom_numbers
+        )
+        if not is_valid:
+            return None, None, None, None, None, (False, validation_reason)
+    else:
+        validation_reason = "不限制原子类型"
+    
+    # Create numbering
+    ring_numbering, ordered_atoms = create_ring_numbering(
+        mol, ring_atoms, carbonyl_carbon, ester_oxygen
+    )
+    
+    return ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (True, validation_reason)
+
+
+def get_macrolactone_numbering_legacy(
+    mol: Union[Chem.Mol, str], 
+    ring_size: int = 16
+) -> Tuple[Optional[List[int]], Optional[Dict[int, int]], Optional[List[int]], Optional[int], Optional[int]]:
+    """
+    Backward-compatible wrapper around get_macrolactone_numbering (returns the 5-tuple only)
    
    Args:
        mol: RDKit molecule object or SMILES string
@@ -242,35 +399,496 @@ def get_macrolactone_numbering(
        Tuple of (ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen)
        Returns (None, None, None, None, None) if numbering fails
    """
-    # Convert SMILES to molecule if needed
+    ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, _) = \
+        get_macrolactone_numbering(mol, ring_size, allowed_ring_atom_types=None)
+    
+    if not is_valid:
+        return None, None, None, None, None
+    
+    return ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen
+
+
+def is_pure_carbon_macrolactone(mol: Union[Chem.Mol, str], ring_size: int) -> Tuple[bool, str]:
+    """
+    Convenience check: does the ring contain only carbon apart from the ester oxygen?
+
+    Args:
+        mol: RDKit molecule object or SMILES string
+        ring_size: Size of the macrolactone ring
+
+    Returns:
+        (is_valid, reason): the verdict and explanation from get_macrolactone_numbering
+    """
+    carbon_only = ["C"]
+    outcome = get_macrolactone_numbering(mol, ring_size, allowed_ring_atom_types=carbon_only)
+    is_valid, reason = outcome[5]
+    return is_valid, reason
+
+
+def is_carbon_nitrogen_macrolactone(mol: Union[Chem.Mol, str], ring_size: int) -> Tuple[bool, str]:
+    """
+    Convenience check: does the ring contain only carbon and nitrogen apart from the ester oxygen?
+
+    Args:
+        mol: RDKit molecule object or SMILES string
+        ring_size: Size of the macrolactone ring
+
+    Returns:
+        (is_valid, reason): the verdict and explanation from get_macrolactone_numbering
+    """
+    permitted = ["C", "N"]
+    verdict = get_macrolactone_numbering(mol, ring_size, allowed_ring_atom_types=permitted)[5]
+    is_valid, reason = verdict
+    return is_valid, reason
+
+
+def visualize_macrolactone_with_auto_coloring(
+    mol: Union[Chem.Mol, str],
+    ring_size: Optional[int] = None,
+    allowed_ring_atom_types: Optional[List[str]] = None,
+    size: Tuple[int, int] = (800, 800),
+    title: str = "",
+    return_svg: bool = False,
+    show_atom_labels: bool = True
+) -> Union[str, None]:
+    """
+    Draw a macrolactone with its ring atoms highlighted in per-element colours.
+
+    Args:
+        mol: RDKit molecule object or SMILES string
+        ring_size: Ring size (12-20); auto-detected when None
+        allowed_ring_atom_types: Allowed ring atom types (ester oxygen excluded)
+            - None: no restriction, every atom type is shown
+            - ["C"]: carbon only
+            - ["C", "N"]: carbon and nitrogen
+        size: Image size as (width, height); a bare int is treated as a square
+        title: Image title (currently not drawn, see the note in the body)
+        return_svg: Return the SVG text instead of displaying it
+        show_atom_labels: Annotate ring atoms with their ring position
+
+    Returns:
+        The SVG string when return_svg is True (None on failure); otherwise None
+    """
+    # Convert SMILES to a molecule object
+    if isinstance(mol, str):
+        mol_obj = Chem.MolFromSmiles(mol)
+        if mol_obj is None:
+            if return_svg:
+                return None
+            else:
+                print("❌ 无法解析SMILES字符串")
+                return None
+    else:
+        mol_obj = mol
+
+    # Auto-detect the ring size when it was not given
+    if ring_size is None:
+        ring_sizes = []
+        for candidate_size in range(12, 21):  # must not rebind the `size` parameter
+            ring_atoms = get_ring_atoms_by_size(mol_obj, candidate_size)
+            if ring_atoms:
+                # Keep only sizes that form a valid macrolactone
+                analyzer = MacroLactoneAnalyzer()
+                if analyzer.is_valid_macrolactone(mol_obj, candidate_size):
+                    ring_sizes.append(candidate_size)
+
+        if not ring_sizes:
+            if return_svg:
+                return None
+            else:
+                print("❌ 未找到12-20元大环内酯")
+                return None
+
+        # Use the first (smallest) ring size that was found
+        ring_size = ring_sizes[0]
+        if len(ring_sizes) > 1:
+            print(f"⚠️  找到多个环大小: {ring_sizes}，使用 {ring_size}")
+
+    # Fetch the ring numbering
+    ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, validation_reason) = \
+        get_macrolactone_numbering(mol_obj, ring_size, allowed_ring_atom_types)
+
+    if not is_valid or not ring_atoms:
+        if return_svg:
+            return None
+        else:
+            print(f"❌ 无法获取环编号: {validation_reason}")
+            return None
+
+    # Label a copy so the caller's molecule is left untouched
+    mol_copy = Chem.Mol(mol_obj)
+    AllChem.Compute2DCoords(mol_copy)
+
+    if show_atom_labels:
+        for atom_idx in ring_atoms:
+            if atom_idx in ring_numbering:
+                atom = mol_copy.GetAtomWithIdx(atom_idx)
+                atom.SetProp("atomNote", str(ring_numbering[atom_idx]))
+
+    # Colour scheme for ring atoms
+    def get_atom_color(symbol: str, is_ester_oxygen: bool = False) -> Tuple[float, float, float]:
+        """Return the RGB highlight colour for an element symbol."""
+        if is_ester_oxygen:
+            return (1.0, 0.4, 0.4)  # ester oxygen: deep red
+        elif symbol == 'C':
+            return (0.7, 0.8, 1.0)  # carbon: blue
+        elif symbol == 'N':
+            return (1.0, 0.8, 1.0)  # nitrogen: pink
+        elif symbol == 'O':
+            return (1.0, 0.7, 0.7)  # oxygen: red
+        elif symbol == 'S':
+            return (1.0, 1.0, 0.6)  # sulfur: yellow
+        elif symbol == 'P':
+            return (0.8, 0.6, 1.0)  # phosphorus: purple
+        else:
+            return (0.8, 1.0, 0.8)  # anything else: green
+
+    # Assign a colour to every ring atom and tally the composition
+    atom_colors = {}
+    atom_type_stats = {}
+
+    for atom_idx in ring_atoms:
+        atom = mol_obj.GetAtomWithIdx(atom_idx)
+        symbol = atom.GetSymbol()
+        is_ester_oxygen = (atom_idx == ester_oxygen)
+
+        # Highlight colour
+        color = get_atom_color(symbol, is_ester_oxygen)
+        atom_colors[atom_idx] = color
+
+        # Composition statistics exclude the ester oxygen
+        if not is_ester_oxygen:
+            atom_type_stats[symbol] = atom_type_stats.get(symbol, 0) + 1
+
+    # Draw the molecule
+    # Coerce `size` into a (width, height) pair
+    if isinstance(size, int):
+        size = (size, size)
+    elif isinstance(size, (list, tuple)) and len(size) == 1:
+        size = (size[0], size[0])
+    elif not isinstance(size, (list, tuple)) or len(size) != 2:
+        size = (800, 800)  # fall back to the default size
+
+    drawer = rdMolDraw2D.MolDraw2DSVG(int(size[0]), int(size[1]))
+    drawer.SetFontSize(12)  # readable label size (original note: the minimum is 6)
+
+    # NOTE: some RDKit versions do not support DrawTitle, so `title` is
+    # currently not rendered. The disabled call was:
+    #     drawer.DrawTitle(title)
+
+    drawer.DrawMolecule(mol_copy, 
+                       highlightAtoms=ring_atoms,
+                       highlightAtomColors=atom_colors)
+    drawer.FinishDrawing()
+    svg = drawer.GetDrawingText()
+
+    # Interactive mode: show the image plus a textual summary
+    if not return_svg:
+        display(SVG(svg))
+
+        print(f"\n📊 分子信息:")
+        print(f"   环大小: {ring_size} 元环")
+        print(f"   羰基碳位置: {ring_numbering.get(carbonyl_carbon, 'N/A')} (深红色标记)")
+        print(f"   酯氧位置: {ring_numbering.get(ester_oxygen, 'N/A')} (深红色标记)")
+        print(f"   环原子组成: {atom_type_stats}")
+        print(f"   筛选条件: {validation_reason}")
+
+        # Colour legend
+        print(f"\n🎨 颜色说明:")
+        print(f"   深红色: 酯键氧原子 (位置2)")
+        print(f"   蓝色: 碳原子")
+        print(f"   粉色: 氮原子")
+        print(f"   红色: 氧原子")
+        print(f"   黄色: 硫原子")
+        print(f"   紫色: 磷原子")
+        print(f"   绿色: 其他原子")
+
+    return svg if return_svg else None
+
+
+def batch_visualize_macrolactones(
+    data_file: Path,
+    ring_sizes: Optional[List[int]] = None,
+    allowed_ring_atom_types: Optional[List[str]] = None,
+    max_examples_per_size: int = 3,
+    output_dir: Optional[Path] = None
+) -> Dict[int, List[Dict]]:
+    """
+    Collect (and optionally render) example macrolactones for each ring size.
+
+    Args:
+        data_file: Path to a CSV file; molecules are read from its 'smiles' column
+        ring_sizes: Ring sizes to look for; defaults to 12-20
+        allowed_ring_atom_types: Allowed ring atom types, passed to get_macrolactone_numbering
+        max_examples_per_size: Maximum number of examples kept per ring size
+        output_dir: When given, one SVG per example is written into this directory
+
+    Returns:
+        Dict mapping ring size to its list of example records ({} if the file is missing)
+    """
+    if ring_sizes is None:
+        ring_sizes = list(range(12, 21))
+
+    print(f"🔍 开始批量可视化大环内酯")
+    print(f"   数据文件: {data_file}")
+    print(f"   环大小范围: {ring_sizes}")
+    print(f"   筛选条件: {allowed_ring_atom_types or '无限制'}")
+
+    # Load the data
+    if not data_file.exists():
+        print(f"❌ 数据文件不存在: {data_file}")
+        return {}
+
+    df = pd.read_csv(data_file)
+    print(f"✓ 加载数据: {len(df)} 个分子")
+
+    results = {}
+
+    for ring_size in ring_sizes:
+        print(f"\n🔄 处理 {ring_size} 元环...")
+
+        size_results = []
+        found_count = 0
+
+        for idx, row in df.iterrows():
+            if found_count >= max_examples_per_size:
+                break
+
+            smiles = row.get('smiles', '')
+            if not smiles:
+                continue
+
+            try:
+                # Parse the molecule
+                mol = Chem.MolFromSmiles(smiles)
+                if not mol:
+                    continue
+
+                # Must be a valid macrolactone of this ring size
+                analyzer = MacroLactoneAnalyzer()
+                if not analyzer.is_valid_macrolactone(mol, ring_size):
+                    continue
+
+                # Number the ring once. The call applies the atom-type filter
+                # itself (is_valid is False when the filter rejects the ring),
+                # so a separate pre-check would only repeat the same work.
+                (
+                    ring_atoms,
+                    ring_numbering,
+                    ordered_atoms,
+                    carbonyl_carbon,
+                    ester_oxygen,
+                    (is_valid, validation_reason),
+                ) = get_macrolactone_numbering(mol, ring_size, allowed_ring_atom_types)
+
+                if is_valid:
+                    # Element composition of the ring (reuses the parsed molecule)
+                    composition = analyze_ring_atom_composition(mol, ring_size)
+
+                    result = {
+                        'index': idx,
+                        'smiles': smiles,
+                        'molecule_id': row.get('molecule_id', f'mol_{idx}'),
+                        'ring_size': ring_size,
+                        'composition': composition,
+                        'validation_reason': validation_reason,
+                        'carbonyl_carbon_pos': ring_numbering.get(carbonyl_carbon, 'N/A'),
+                        'ester_oxygen_pos': ring_numbering.get(ester_oxygen, 'N/A')
+                    }
+
+                    size_results.append(result)
+                    found_count += 1
+
+                    print(f"   ✓ 找到示例 {found_count}: 分子{idx} ({composition})")
+
+                    # Save the rendering when an output directory was given
+                    if output_dir:
+                        output_dir.mkdir(parents=True, exist_ok=True)
+                        filename = f"ring{ring_size}_example{found_count}_mol{idx}.svg"
+                        output_path = output_dir / filename
+
+                        svg = visualize_macrolactone_with_auto_coloring(
+                            mol, ring_size, allowed_ring_atom_types, 
+                            return_svg=True, 
+                            title=f"{ring_size}-元环示例 {found_count}"
+                        )
+
+                        if svg:
+                            with open(output_path, 'w', encoding='utf-8') as f:
+                                f.write(svg)
+                            print(f"     💾 保存到: {output_path}")
+
+            except Exception as e:
+                print(f"   ⚠️  处理分子 {idx} 时出错: {str(e)}")
+                continue
+
+        results[ring_size] = size_results
+        print(f"   📊 {ring_size} 元环: 找到 {len(size_results)} 个示例")
+
+    # Overall summary
+    print(f"\n📈 总体统计:")
+    total_found = sum(len(results[size]) for size in ring_sizes)
+    print(f"   总示例数: {total_found}")
+
+    for ring_size in ring_sizes:
+        count = len(results[ring_size])
+        print(f"   {ring_size} 元环: {count} 个示例")
+
+    return results
+
+
+def test_all_ring_sizes_with_filtering(
+    filtered_data_dir: Path,
+    allowed_ring_atom_types: Optional[List[str]] = ["C"],  # default: carbon only (never mutated here)
+    output_dir: Optional[Path] = None
+) -> Dict[int, Dict]:
+    """
+    Measure how many macrolactones of each ring size (12-20) pass the atom-type filter.
+
+    Args:
+        filtered_data_dir: Directory holding macrolactone_ring{N}_filtered.csv files
+        allowed_ring_atom_types: Allowed ring atom types; None disables the filter
+        output_dir: When given, example SVGs go to output_dir/ring{N}_examples
+
+    Returns:
+        Dict mapping ring size to its statistics and example records
+    """
+    print(f"🧪 开始测试所有环大小的筛选效果")
+    print(f"   数据目录: {filtered_data_dir}")
+    print(f"   筛选条件: {allowed_ring_atom_types or '无限制'}")
+
+    all_results = {}
+
+    for ring_size in range(12, 21):
+        print(f"\n{'='*60}")
+        print(f"🔍 测试 {ring_size} 元环")
+        print(f"{'='*60}")
+
+        # Locate the data file for this ring size
+        data_file = filtered_data_dir / f'macrolactone_ring{ring_size}_filtered.csv'
+
+        if not data_file.exists():
+            print(f"⚠️  未找到 {ring_size} 元环数据文件: {data_file}")
+            all_results[ring_size] = {
+                'data_file_exists': False,
+                'total_molecules': 0,
+                'valid_macrolactones': 0,
+                'filtered_molecules': 0,
+                'examples': []
+            }
+            continue
+
+        # Collect (and optionally render) examples
+        batch_results = batch_visualize_macrolactones(
+            data_file, 
+            ring_sizes=[ring_size],
+            allowed_ring_atom_types=allowed_ring_atom_types,
+            max_examples_per_size=5,
+            output_dir=output_dir / f'ring{ring_size}_examples' if output_dir else None
+        )
+
+        # Examples found for this ring size
+        size_results = batch_results.get(ring_size, [])
+
+        # Re-read the full file to compute the statistics
+        try:
+            df = pd.read_csv(data_file)
+            total_molecules = len(df)
+
+            # Count valid macrolactones and those passing the filter
+            valid_count = 0
+            filtered_count = 0
+
+            for _, row in df.iterrows():
+                smiles = row.get('smiles', '')
+                if not smiles:
+                    continue
+
+                try:
+                    mol = Chem.MolFromSmiles(smiles)
+                    if mol:
+                        analyzer = MacroLactoneAnalyzer()
+                        if analyzer.is_valid_macrolactone(mol, ring_size):
+                            valid_count += 1
+
+                            if allowed_ring_atom_types:
+                                is_valid, _ = get_macrolactone_numbering(
+                                    mol, ring_size, allowed_ring_atom_types
+                                )[5]
+                                if is_valid:
+                                    filtered_count += 1
+                            else:
+                                filtered_count += 1
+                except Exception:  # skip bad rows, but let KeyboardInterrupt/SystemExit through
+                    continue
+
+            all_results[ring_size] = {
+                'data_file_exists': True,
+                'total_molecules': total_molecules,
+                'valid_macrolactones': valid_count,
+                'filtered_molecules': filtered_count,
+                'filter_rate': filtered_count / valid_count * 100 if valid_count > 0 else 0,
+                'examples': size_results
+            }
+
+            print(f"\n📊 {ring_size} 元环统计:")
+            print(f"   总分子数: {total_molecules}")
+            print(f"   有效大环内酯: {valid_count}")
+            print(f"   通过筛选: {filtered_count}")
+            print(f"   筛选通过率: {filtered_count/valid_count*100:.1f}%" if valid_count > 0 else "   筛选通过率: 0%")
+
+        except Exception as e:
+            print(f"❌ 统计 {ring_size} 元环时出错: {str(e)}")
+            all_results[ring_size] = {
+                'data_file_exists': True,
+                'error': str(e),
+                'examples': size_results
+            }
+
+    # Overall summary across all ring sizes
+    print(f"\n{'='*60}")
+    print(f"📈 所有环大小测试总结")
+    print(f"{'='*60}")
+
+    total_molecules = sum(result.get('total_molecules', 0) for result in all_results.values())
+    total_valid = sum(result.get('valid_macrolactones', 0) for result in all_results.values())
+    total_filtered = sum(result.get('filtered_molecules', 0) for result in all_results.values())
+
+    print(f"总分子数: {total_molecules}")
+    print(f"总有效大环内酯: {total_valid}")
+    print(f"总通过筛选: {total_filtered}")
+    print(f"总体筛选通过率: {total_filtered/total_valid*100:.1f}%" if total_valid > 0 else "总体筛选通过率: 0%")
+
+    return all_results
+
+
+def analyze_ring_atom_composition(mol: Union[Chem.Mol, str], ring_size: int) -> Dict[str, int]:
+    """Count ring atoms per element symbol, skipping the ester oxygen; {} if numbering fails."""
+    ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, _) = \
+        get_macrolactone_numbering(mol, ring_size)
+    
+    if not is_valid or not ring_atoms:
+        return {}
+    
+    # Convert SMILES to a molecule object (if needed)
    if isinstance(mol, str):
        mol = Chem.MolFromSmiles(mol)
        if mol is None:
-            return None, None, None, None, None
+            return {}
    
-    # Find ring of specified size
-    ring_atoms = get_ring_atoms_by_size(mol, ring_size)
-    if ring_atoms is None:
-        return None, None, None, None, None
+    # Tally atoms by element symbol
+    composition = {}
    
-    # Find ester group
-    ester_atoms, pattern = find_ester_smarts(mol, ring_atoms)
-    if ester_atoms is None:
-        return None, None, None, None, None
+    for atom_idx in ring_atoms:
+        # Skip the ester oxygen
+        if atom_idx == ester_oxygen:
+            continue
+            
+        atom = mol.GetAtomWithIdx(atom_idx)
+        symbol = atom.GetSymbol()
+        composition[symbol] = composition.get(symbol, 0) + 1
    
-    # Find carbonyl carbon and ester oxygen
-    carbonyl_carbon = get_carbonyl_carbon_in_ring(mol, ester_atoms, ring_atoms)
-    ester_oxygen = get_ester_oxygen_in_ring(mol, ester_atoms, ring_atoms)
-    
-    if carbonyl_carbon is None or ester_oxygen is None:
-        return None, None, None, None, None
-    
-    # Create numbering
-    ring_numbering, ordered_atoms = create_ring_numbering(
-        mol, ring_atoms, carbonyl_carbon, ester_oxygen
-    )
-    
-    return ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen
+    return composition


 def draw_mol_with_ring_numbering(