# search_macro/src/ring_visualization.py

"""
Ring numbering and visualization utilities for macrolactones.

This module provides functions for:
1. Assigning ring numbering starting from C=O carbonyl carbon
2. Visualizing molecules with ring numbering in Jupyter notebooks
3. Supporting both general molecules and macrolactones (12-20 membered rings)
"""
from rdkit import Chem
from rdkit.Chem import Draw, AllChem
from rdkit.Chem.Draw import rdMolDraw2D
from IPython.display import SVG, display
from typing import Dict, Tuple, Optional, List, Union
from collections import deque
from pathlib import Path
import pandas as pd

# Import the analyzer class
try:
    from src.macro_lactone_analyzer import MacroLactoneAnalyzer
except ImportError:
    # If the import fails, define a minimal stand-in class
    class MacroLactoneAnalyzer:
        """Fallback used only when the project analyzer cannot be imported."""

        def is_valid_macrolactone(self, mol, ring_size):
            """Return True when `mol` contains a ring with exactly `ring_size` atoms."""
            # Simplified check: only tests whether a ring of the requested size
            # exists; this fallback does not look for an ester group.
            ring_atoms = get_ring_atoms_by_size(mol, ring_size)
            return ring_atoms is not None

# Atomic-number lookup keyed by element symbol in canonical capitalisation
# (first letter upper-case, second letter lower-case, e.g. "Cl").
ATOMIC_NUMBERS = {
    "H": 1, "C": 6, "N": 7, "O": 8, "S": 16, "P": 15,
    "F": 9, "Cl": 17, "Br": 35, "I": 53
}


def normalize_atom_types(atom_types: Optional[List[str]]) -> Optional[List[int]]:
    """
    Normalize a list of atom types into a list of atomic numbers.

    Each entry may be an element symbol (matched case-insensitively, so
    "Cl", "cl" and "CL" are all accepted) or an atomic number given as a
    digit string such as "6". Integer entries are tolerated as well.

    Args:
        atom_types: Atom types, e.g. ["C", "N"] or ["6", "7"]; None means
            "no restriction".

    Returns:
        List of atomic numbers in input order, or None if the input is None.

    Raises:
        ValueError: If an entry is neither a digit string nor a symbol
            present in ATOMIC_NUMBERS.
    """
    if atom_types is None:
        return None

    normalized = []
    for atom_type in atom_types:
        token = str(atom_type).strip()
        if token.isdigit():
            # Already an atomic number.
            normalized.append(int(token))
            continue

        # The table keys use canonical capitalisation; upper-casing the input
        # would turn "Cl"/"Br" into "CL"/"BR" and miss them.
        symbol = token.capitalize()
        if symbol not in ATOMIC_NUMBERS:
            raise ValueError(f"不支持的原子类型: {atom_type}")
        normalized.append(ATOMIC_NUMBERS[symbol])

    return normalized

def validate_ring_atom_composition(
    mol: Chem.Mol,
    ring_atoms: List[int],
    carbonyl_carbon_idx: int,
    ester_oxygen_idx: int,
    allowed_atom_types: Optional[List[int]] = None
) -> Tuple[bool, str]:
    """
    Check that every ring atom, except the ester oxygen, has an allowed element.

    Args:
        mol: RDKit molecule object
        ring_atoms: Atom indices of the ring
        carbonyl_carbon_idx: Index of the carbonyl carbon (position 1); not
            used by the check itself
        ester_oxygen_idx: Index of the ester oxygen (position 2); this atom
            is exempt from the check
        allowed_atom_types: Allowed atomic numbers, or None for no restriction

    Returns:
        (is_valid, reason): verdict plus a human-readable explanation
    """
    if allowed_atom_types is None:
        return True, "不限制原子类型"

    # Collect a "Symbol(索引:idx)" label for each ring atom that is not allowed.
    offenders = []
    for idx in ring_atoms:
        if idx == ester_oxygen_idx:
            continue  # the ester oxygen is always permitted
        ring_atom = mol.GetAtomWithIdx(idx)
        if ring_atom.GetAtomicNum() in allowed_atom_types:
            continue
        offenders.append(f"{ring_atom.GetSymbol()}(索引:{idx})")

    if not offenders:
        periodic_table = Chem.GetPeriodicTable()
        allowed_symbols = [periodic_table.GetElementSymbol(num) for num in allowed_atom_types]
        return True, f"环原子组成符合要求，只允许: {allowed_symbols}"

    return False, f"环中包含不允许的原子类型: {', '.join(offenders)}"


def get_ring_atoms_by_size(mol: Chem.Mol, ring_size: int) -> Optional[List[int]]:
    """
    Return the atoms of the first ring that has exactly `ring_size` members.

    Args:
        mol: RDKit molecule object
        ring_size: Number of atoms the ring must contain (e.g. 16)

    Returns:
        Atom indices of the first matching ring reported by the molecule's
        ring info, or None when no ring of that size exists
    """
    same_size_rings = (
        ring for ring in mol.GetRingInfo().AtomRings() if len(ring) == ring_size
    )
    first_match = next(same_size_rings, None)
    return None if first_match is None else list(first_match)


def find_ester_smarts(mol: Chem.Mol, ring_atoms: List[int]) -> Tuple[Optional[List[int]], Optional[str]]:
    """
    Find the lactone ester group of a ring using SMARTS patterns.

    A match is accepted only when both its carbonyl carbon and its
    single-bonded oxygen are members of `ring_atoms`. Matches that merely
    touch the ring (for example an exocyclic ester attached to a ring atom)
    are skipped so that a genuine ring ester found later is not masked.

    Args:
        mol: RDKit molecule object
        ring_atoms: List of atom indices in the ring

    Returns:
        Tuple of (matched_atom_indices, pattern_string), or (None, None) if
        no ester with its C and O inside the ring is found. The index list is
        the full SMARTS match, so for the first pattern it starts with the
        ring neighbour of the carbonyl carbon.
    """
    ring_atoms_set = set(ring_atoms)

    # Tried in order; every pattern ends with the atoms C, =O, O.
    patterns = [
        "[r16][C](=O)[O]",  # Original pattern for 16-membered ring
        "[C](=O)[O]",       # General pattern
        "C(=O)O",           # Simple pattern
    ]

    for pattern_str in patterns:
        pattern = Chem.MolFromSmarts(pattern_str)
        if pattern is None:
            continue

        for match in mol.GetSubstructMatches(pattern):
            # The last three matched atoms are always (carbonyl C, carbonyl O,
            # ester O), regardless of any leading context atom in the pattern.
            carbonyl_c, _, ester_o = match[-3:]
            if carbonyl_c in ring_atoms_set and ester_o in ring_atoms_set:
                return list(match), pattern_str

    return None, None


def get_carbonyl_carbon_in_ring(mol: Chem.Mol, ester_atoms: List[int], ring_atoms: List[int]) -> Optional[int]:
    """
    Locate the ring carbon of the ester that carries a C=O double bond.

    Args:
        mol: RDKit molecule object
        ester_atoms: Atom indices of the matched ester group
        ring_atoms: Atom indices of the ring

    Returns:
        Index of the first ester atom that is a ring carbon double-bonded to
        an oxygen, or None if there is none
    """
    members = set(ring_atoms)

    for cand_idx in ester_atoms:
        cand = mol.GetAtomWithIdx(cand_idx)
        if cand.GetSymbol() != 'C' or cand_idx not in members:
            continue

        # A carbonyl carbon has at least one double bond to an oxygen neighbour.
        for nbr in cand.GetNeighbors():
            linking_bond = mol.GetBondBetweenAtoms(cand_idx, nbr.GetIdx())
            if linking_bond.GetBondType() != Chem.BondType.DOUBLE:
                continue
            if nbr.GetSymbol() == 'O':
                return cand_idx

    return None


def get_ester_oxygen_in_ring(mol: Chem.Mol, ester_atoms: List[int], ring_atoms: List[int]) -> Optional[int]:
    """
    Locate the ester oxygen that is part of the ring.

    Args:
        mol: RDKit molecule object
        ester_atoms: Atom indices of the matched ester group
        ring_atoms: Atom indices of the ring

    Returns:
        Index of the first ester atom that is an oxygen and a ring member,
        or None if there is none
    """
    members = set(ring_atoms)
    ring_oxygens = (
        idx for idx in ester_atoms
        if mol.GetAtomWithIdx(idx).GetSymbol() == 'O' and idx in members
    )
    return next(ring_oxygens, None)


def order_ring_atoms_from_start(
    mol: Chem.Mol,
    ring_atoms: List[int],
    start_atom_idx: int,
    target_atom_idx: Optional[int] = None
) -> List[int]:
    """
    Order ring atoms starting from a specific atom, optionally prioritizing path to target atom.

    The result always begins with `start_atom_idx`. When a target is given
    (and is in the ring), the atoms on a shortest in-ring path from start to
    target come next. The remaining atoms are then appended by walking
    neighbour to neighbour, beginning again at the start atom.

    Args:
        mol: RDKit molecule object
        ring_atoms: List of ring atom indices
        start_atom_idx: Starting atom index
        target_atom_idx: Optional target atom index to prioritize path towards

    Returns:
        Ordered list of atom indices; `ring_atoms` is returned unchanged if
        the start atom is not part of the ring
    """
    ring_atoms_set = set(ring_atoms)

    if start_atom_idx not in ring_atoms_set:
        return ring_atoms

    ordered = [start_atom_idx]
    remaining = ring_atoms_set - {start_atom_idx}
    current = start_atom_idx

    # Compare against None explicitly: atom index 0 is a valid target and
    # must not be treated as "no target".
    if target_atom_idx is not None and target_atom_idx in ring_atoms_set:
        # Breadth-first search restricted to ring atoms, carrying the path.
        queue = deque([(start_atom_idx, [start_atom_idx])])
        visited = {start_atom_idx}

        while queue:
            node, path = queue.popleft()
            if node == target_atom_idx:
                for atom_idx in path[1:]:  # path[0] is the start, already placed
                    if atom_idx in remaining:
                        ordered.append(atom_idx)
                        remaining.remove(atom_idx)
                break

            for neighbor in mol.GetAtomWithIdx(node).GetNeighbors():
                neighbor_idx = neighbor.GetIdx()
                if neighbor_idx in ring_atoms_set and neighbor_idx not in visited:
                    visited.add(neighbor_idx)
                    queue.append((neighbor_idx, path + [neighbor_idx]))

    # NOTE(review): `current` is still the start atom here, so the walk below
    # resumes from the start's other ring neighbour rather than continuing
    # past the target. Confirm this numbering direction is intended before
    # changing it, since it determines positions 3..N.
    while remaining:
        next_idx = next(
            (
                neighbor.GetIdx()
                for neighbor in mol.GetAtomWithIdx(current).GetNeighbors()
                if neighbor.GetIdx() in remaining
            ),
            None,
        )
        if next_idx is None:
            # Dead end: no unplaced ring neighbour, so take any leftover atom.
            next_idx = remaining.pop()
        else:
            remaining.remove(next_idx)

        ordered.append(next_idx)
        current = next_idx

    return ordered


def create_ring_numbering(
    mol: Chem.Mol,
    ring_atoms: List[int],
    carbonyl_carbon_idx: int,
    ester_oxygen_idx: int
) -> Tuple[Dict[int, int], List[int]]:
    """
    Build the position map for a ring, with the carbonyl carbon as position 1.

    Args:
        mol: RDKit molecule object
        ring_atoms: List of ring atom indices
        carbonyl_carbon_idx: Index of the C=O carbonyl carbon (numbered 1)
        ester_oxygen_idx: Index of the ring ester oxygen, passed as the
            ordering target

    Returns:
        Tuple of (numbering_dict, ordered_atoms_list)
        numbering_dict: atom index -> 1-based position
        ordered_atoms_list: atom indices in numbering order
    """
    ordered_atoms = order_ring_atoms_from_start(
        mol, ring_atoms, carbonyl_carbon_idx, ester_oxygen_idx
    )
    numbering = {
        atom_idx: position
        for position, atom_idx in enumerate(ordered_atoms, start=1)
    }
    return numbering, ordered_atoms


def get_macrolactone_numbering(
    mol: Union[Chem.Mol, str],
    ring_size: int = 16,
    allowed_ring_atom_types: Optional[List[str]] = None
) -> Tuple[Optional[List[int]], Optional[Dict[int, int]], Optional[List[int]], Optional[int], Optional[int], Tuple[bool, str]]:
    """
    Compute the ring numbering of a macrolactone, with optional atom-type filtering.

    Workflow:
    1. Parse the SMILES if a string was given
    2. Normalize the allowed atom types
    3. Find a ring of the requested size
    4. Find the ester group, its carbonyl carbon and its ring oxygen
    5. Validate the ring atom composition (only when a filter is given)
    6. Create the numbering mapping

    Args:
        mol: RDKit molecule object or SMILES string
        ring_size: Size of the macrolactone ring (12-20, default 16)
        allowed_ring_atom_types: Allowed atom types for ring atoms (excluding ester oxygen)
            - None: No restriction (default behavior)
            - ["C"]: Only carbon atoms allowed (except ester oxygen)
            - ["C", "N"]: Carbon and nitrogen atoms allowed (except ester oxygen)
            - Can use element symbols or atomic number strings: ["6", "7"]

    Returns:
        Tuple of (ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, validation_reason))
        Returns (None, None, None, None, None, (False, reason)) if numbering fails
    """
    def _failure(reason):
        # Uniform shape for every failure exit.
        return None, None, None, None, None, (False, reason)

    # Accept a SMILES string in place of a molecule.
    if isinstance(mol, str):
        mol = Chem.MolFromSmiles(mol)
        if mol is None:
            return _failure("无法解析SMILES字符串")

    # Normalize the atom-type filter.
    try:
        allowed_atom_numbers = normalize_atom_types(allowed_ring_atom_types)
    except ValueError as e:
        return _failure(f"原子类型参数错误: {str(e)}")

    ring_atoms = get_ring_atoms_by_size(mol, ring_size)
    if ring_atoms is None:
        return _failure(f"未找到{ring_size}元环")

    ester_atoms, _ = find_ester_smarts(mol, ring_atoms)
    if ester_atoms is None:
        return _failure("未在环中找到酯键")

    carbonyl_carbon = get_carbonyl_carbon_in_ring(mol, ester_atoms, ring_atoms)
    ester_oxygen = get_ester_oxygen_in_ring(mol, ester_atoms, ring_atoms)
    if carbonyl_carbon is None or ester_oxygen is None:
        return _failure("无法识别羰基碳或酯氧原子")

    # Validate the ring composition only when a filter was requested.
    if allowed_atom_numbers is None:
        validation_reason = "不限制原子类型"
    else:
        is_valid, validation_reason = validate_ring_atom_composition(
            mol, ring_atoms, carbonyl_carbon, ester_oxygen, allowed_atom_numbers
        )
        if not is_valid:
            return _failure(validation_reason)

    ring_numbering, ordered_atoms = create_ring_numbering(
        mol, ring_atoms, carbonyl_carbon, ester_oxygen
    )

    return ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (True, validation_reason)


def get_macrolactone_numbering_legacy(
    mol: Union[Chem.Mol, str],
    ring_size: int = 16
) -> Tuple[Optional[List[int]], Optional[Dict[int, int]], Optional[List[int]], Optional[int], Optional[int]]:
    """
    Backward-compatible variant of get_macrolactone_numbering.

    Runs the numbering without any atom-type filter and drops the trailing
    validation tuple from the result.

    Args:
        mol: RDKit molecule object or SMILES string
        ring_size: Size of the macrolactone ring (12-20, default 16)

    Returns:
        Tuple of (ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen)
        Returns (None, None, None, None, None) if numbering fails
    """
    outcome = get_macrolactone_numbering(mol, ring_size, allowed_ring_atom_types=None)
    is_valid, _ = outcome[5]

    if is_valid:
        ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen = outcome[:5]
        return ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen

    return None, None, None, None, None


def is_pure_carbon_macrolactone(mol: Union[Chem.Mol, str], ring_size: int) -> Tuple[bool, str]:
    """
    Convenience check: is this a macrolactone whose ring holds only carbon
    atoms apart from the ester oxygen?

    Args:
        mol: RDKit molecule object or SMILES string
        ring_size: Size of the macrolactone ring

    Returns:
        (is_valid, reason): verdict and the explanation produced by
        get_macrolactone_numbering
    """
    outcome = get_macrolactone_numbering(
        mol, ring_size, allowed_ring_atom_types=["C"]
    )
    is_valid, reason = outcome[5]
    return is_valid, reason


def is_carbon_nitrogen_macrolactone(mol: Union[Chem.Mol, str], ring_size: int) -> Tuple[bool, str]:
    """
    Convenience check: is this a macrolactone whose ring holds only carbon
    and nitrogen atoms apart from the ester oxygen?

    Args:
        mol: RDKit molecule object or SMILES string
        ring_size: Size of the macrolactone ring

    Returns:
        (is_valid, reason): verdict and the explanation produced by
        get_macrolactone_numbering
    """
    outcome = get_macrolactone_numbering(
        mol, ring_size, allowed_ring_atom_types=["C", "N"]
    )
    is_valid, reason = outcome[5]
    return is_valid, reason


def visualize_macrolactone_with_auto_coloring(
    mol: Union[Chem.Mol, str],
    ring_size: Optional[int] = None,
    allowed_ring_atom_types: Optional[List[str]] = None,
    size: Tuple[int, int] = (800, 800),
    title: str = "",
    return_svg: bool = False,
    show_atom_labels: bool = True
) -> Union[str, None]:
    """
    Draw a macrolactone with its ring atoms highlighted and colour-coded by element.

    Args:
        mol: RDKit molecule object or SMILES string
        ring_size: Ring size (12-20); when None the sizes 12..20 are probed
            and the first valid one is used
        allowed_ring_atom_types: Allowed ring atom types (ester oxygen excluded)
            - None: no restriction
            - ["C"]: carbon only
            - ["C", "N"]: carbon and nitrogen
        size: Image size as (width, height); a bare int or a one-element
            sequence is expanded to a square, anything else falls back to
            (800, 800)
        title: Accepted for API compatibility; currently not rendered
        return_svg: If True, return the SVG text instead of displaying it
        show_atom_labels: If True, annotate ring atoms with their ring position

    Returns:
        The SVG string when return_svg is True; otherwise None (the image and
        a textual summary are displayed/printed). None is also returned on
        any failure.
    """
    # Accept a SMILES string in place of a molecule.
    if isinstance(mol, str):
        mol_obj = Chem.MolFromSmiles(mol)
        if mol_obj is None:
            if not return_svg:
                print("❌ 无法解析SMILES字符串")
            return None
    else:
        mol_obj = mol

    # Auto-detect the ring size when it was not given.
    if ring_size is None:
        ring_sizes = []
        # The loop variable must not be called `size`: that would overwrite
        # the image-size parameter and shrink the canvas to 20x20.
        for candidate_size in range(12, 21):
            if get_ring_atoms_by_size(mol_obj, candidate_size):
                analyzer = MacroLactoneAnalyzer()
                if analyzer.is_valid_macrolactone(mol_obj, candidate_size):
                    ring_sizes.append(candidate_size)

        if not ring_sizes:
            if not return_svg:
                print("❌ 未找到12-20元大环内酯")
            return None

        # Use the first (smallest) size found.
        ring_size = ring_sizes[0]
        if len(ring_sizes) > 1:
            print(f"⚠️  找到多个环大小: {ring_sizes}，使用 {ring_size}")

    ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, validation_reason) = \
        get_macrolactone_numbering(mol_obj, ring_size, allowed_ring_atom_types)

    if not is_valid or not ring_atoms:
        if not return_svg:
            print(f"❌ 无法获取环编号: {validation_reason}")
        return None

    # Work on a copy so atom notes and 2D coordinates do not leak to the caller.
    mol_copy = Chem.Mol(mol_obj)
    AllChem.Compute2DCoords(mol_copy)

    if show_atom_labels:
        for atom_idx in ring_atoms:
            if atom_idx in ring_numbering:
                mol_copy.GetAtomWithIdx(atom_idx).SetProp(
                    "atomNote", str(ring_numbering[atom_idx])
                )

    # Highlight colours (RGB, 0-1) per element; the ester oxygen is distinct.
    ester_oxygen_color = (1.0, 0.4, 0.4)
    fallback_color = (0.8, 1.0, 0.8)
    element_colors = {
        'C': (0.7, 0.8, 1.0),
        'N': (1.0, 0.8, 1.0),
        'O': (1.0, 0.7, 0.7),
        'S': (1.0, 1.0, 0.6),
        'P': (0.8, 0.6, 1.0),
    }

    atom_colors = {}
    atom_type_stats = {}
    for atom_idx in ring_atoms:
        symbol = mol_obj.GetAtomWithIdx(atom_idx).GetSymbol()
        if atom_idx == ester_oxygen:
            atom_colors[atom_idx] = ester_oxygen_color
        else:
            atom_colors[atom_idx] = element_colors.get(symbol, fallback_color)
            # Composition statistics exclude the ester oxygen.
            atom_type_stats[symbol] = atom_type_stats.get(symbol, 0) + 1

    # Normalize `size` into a (width, height) pair.
    if isinstance(size, int):
        size = (size, size)
    elif isinstance(size, (list, tuple)) and len(size) == 1:
        size = (size[0], size[0])
    elif not isinstance(size, (list, tuple)) or len(size) != 2:
        size = (800, 800)

    drawer = rdMolDraw2D.MolDraw2DSVG(int(size[0]), int(size[1]))
    drawer.SetFontSize(12)

    drawer.DrawMolecule(mol_copy,
                       highlightAtoms=ring_atoms,
                       highlightAtomColors=atom_colors)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText()

    if return_svg:
        return svg

    # Interactive mode: show the drawing followed by a textual summary.
    display(SVG(svg))

    print(f"\n📊 分子信息:")
    print(f"   环大小: {ring_size} 元环")
    print(f"   羰基碳位置: {ring_numbering.get(carbonyl_carbon, 'N/A')} (深红色标记)")
    print(f"   酯氧位置: {ring_numbering.get(ester_oxygen, 'N/A')} (深红色标记)")
    print(f"   环原子组成: {atom_type_stats}")
    print(f"   筛选条件: {validation_reason}")

    # Colour legend.
    print(f"\n🎨 颜色说明:")
    print(f"   深红色: 酯键氧原子 (位置2)")
    print(f"   蓝色: 碳原子")
    print(f"   粉色: 氮原子")
    print(f"   红色: 氧原子")
    print(f"   黄色: 硫原子")
    print(f"   紫色: 磷原子")
    print(f"   绿色: 其他原子")

    return None


def batch_visualize_macrolactones(
    data_file: Path,
    ring_sizes: Optional[List[int]] = None,
    allowed_ring_atom_types: Optional[List[str]] = None,
    max_examples_per_size: int = 3,
    output_dir: Optional[Path] = None
) -> Dict[int, List[Dict]]:
    """
    Collect (and optionally render) example macrolactones for each ring size.

    Rows of the CSV are scanned in order; for every requested ring size the
    first `max_examples_per_size` molecules that are valid macrolactones and
    pass the atom-type filter are recorded.

    Args:
        data_file: Path to a CSV file; SMILES are read from the 'smiles'
            column and an optional 'molecule_id' column is used for naming
        ring_sizes: Ring sizes to process; defaults to 12..20
        allowed_ring_atom_types: Allowed ring atom types (None = no filter)
        max_examples_per_size: Maximum number of examples per ring size
        output_dir: If given, an SVG is written there for every example

    Returns:
        Dict mapping ring size to a list of result dicts; an empty dict when
        the data file does not exist
    """
    if ring_sizes is None:
        ring_sizes = list(range(12, 21))

    print(f"🔍 开始批量可视化大环内酯")
    print(f"   数据文件: {data_file}")
    print(f"   环大小范围: {ring_sizes}")
    print(f"   筛选条件: {allowed_ring_atom_types or '无限制'}")

    if not data_file.exists():
        print(f"❌ 数据文件不存在: {data_file}")
        return {}

    df = pd.read_csv(data_file)
    print(f"✓ 加载数据: {len(df)} 个分子")

    # One analyzer is enough for the whole run.
    analyzer = MacroLactoneAnalyzer()
    results = {}

    for ring_size in ring_sizes:
        print(f"\n🔄 处理 {ring_size} 元环...")

        size_results = []
        found_count = 0

        for idx, row in df.iterrows():
            if found_count >= max_examples_per_size:
                break

            smiles = row.get('smiles', '')
            # Missing CSV cells arrive as NaN (a truthy float), so check the
            # type as well as emptiness.
            if not isinstance(smiles, str) or not smiles:
                continue

            # Best-effort per molecule: report the failure and move on.
            try:
                mol = Chem.MolFromSmiles(smiles)
                if not mol:
                    continue

                if not analyzer.is_valid_macrolactone(mol, ring_size):
                    continue

                # A single call both applies the atom-type filter and yields
                # the numbering details.
                ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, (is_valid, validation_reason) = \
                    get_macrolactone_numbering(mol, ring_size, allowed_ring_atom_types)
                if not is_valid:
                    continue

                composition = analyze_ring_atom_composition(mol, ring_size)

                size_results.append({
                    'index': idx,
                    'smiles': smiles,
                    'molecule_id': row.get('molecule_id', f'mol_{idx}'),
                    'ring_size': ring_size,
                    'composition': composition,
                    'validation_reason': validation_reason,
                    'carbonyl_carbon_pos': ring_numbering.get(carbonyl_carbon, 'N/A'),
                    'ester_oxygen_pos': ring_numbering.get(ester_oxygen, 'N/A')
                })
                found_count += 1

                print(f"   ✓ 找到示例 {found_count}: 分子{idx} ({composition})")

                # Save the drawing when an output directory was requested.
                if output_dir:
                    output_dir.mkdir(parents=True, exist_ok=True)
                    filename = f"ring{ring_size}_example{found_count}_mol{idx}.svg"
                    output_path = output_dir / filename

                    svg = visualize_macrolactone_with_auto_coloring(
                        mol, ring_size, allowed_ring_atom_types,
                        return_svg=True,
                        title=f"{ring_size}-元环示例 {found_count}"
                    )

                    if svg:
                        with open(output_path, 'w', encoding='utf-8') as f:
                            f.write(svg)
                        print(f"     💾 保存到: {output_path}")

            except Exception as e:
                print(f"   ⚠️  处理分子 {idx} 时出错: {str(e)}")
                continue

        results[ring_size] = size_results
        print(f"   📊 {ring_size} 元环: 找到 {len(size_results)} 个示例")

    # Overall summary.
    print(f"\n📈 总体统计:")
    total_found = sum(len(results[size]) for size in ring_sizes)
    print(f"   总示例数: {total_found}")

    for ring_size in ring_sizes:
        count = len(results[ring_size])
        print(f"   {ring_size} 元环: {count} 个示例")

    return results


def test_all_ring_sizes_with_filtering(
    filtered_data_dir: Path,
    allowed_ring_atom_types: Optional[List[str]] = ["C"],  # default: carbon only; never mutated here
    output_dir: Optional[Path] = None
) -> Dict[int, Dict]:
    """
    Evaluate the atom-type filter on every ring size from 12 to 20.

    For each ring size the file `macrolactone_ring{N}_filtered.csv` inside
    `filtered_data_dir` is used to collect up to five examples (via
    batch_visualize_macrolactones) and to count how many molecules are valid
    macrolactones and how many of those pass the filter.

    Args:
        filtered_data_dir: Directory containing the filtered CSV files
        allowed_ring_atom_types: Allowed ring atom types (None = no filter)
        output_dir: Optional directory; example SVGs go to a
            `ring{N}_examples` sub-directory

    Returns:
        Dict mapping ring size to a statistics dict. Missing files yield an
        entry with 'data_file_exists' False; a failure while computing the
        statistics yields an entry with an 'error' message.
    """
    print(f"🧪 开始测试所有环大小的筛选效果")
    print(f"   数据目录: {filtered_data_dir}")
    print(f"   筛选条件: {allowed_ring_atom_types or '无限制'}")

    all_results = {}

    for ring_size in range(12, 21):
        print(f"\n{'='*60}")
        print(f"🔍 测试 {ring_size} 元环")
        print(f"{'='*60}")

        data_file = filtered_data_dir / f'macrolactone_ring{ring_size}_filtered.csv'

        if not data_file.exists():
            print(f"⚠️  未找到 {ring_size} 元环数据文件: {data_file}")
            all_results[ring_size] = {
                'data_file_exists': False,
                'total_molecules': 0,
                'valid_macrolactones': 0,
                'filtered_molecules': 0,
                'examples': []
            }
            continue

        # Collect a handful of examples (and drawings, if requested).
        batch_results = batch_visualize_macrolactones(
            data_file,
            ring_sizes=[ring_size],
            allowed_ring_atom_types=allowed_ring_atom_types,
            max_examples_per_size=5,
            output_dir=output_dir / f'ring{ring_size}_examples' if output_dir else None
        )
        size_results = batch_results.get(ring_size, [])

        # Full pass over the data for the statistics.
        try:
            df = pd.read_csv(data_file)
            total_molecules = len(df)

            analyzer = MacroLactoneAnalyzer()
            valid_count = 0
            filtered_count = 0

            for _, row in df.iterrows():
                smiles = row.get('smiles', '')
                # Missing CSV cells arrive as NaN (a truthy float).
                if not isinstance(smiles, str) or not smiles:
                    continue

                # Best-effort per molecule; `Exception` (not a bare except)
                # so that Ctrl-C and interpreter exit still propagate.
                try:
                    mol = Chem.MolFromSmiles(smiles)
                    if not mol:
                        continue
                    if not analyzer.is_valid_macrolactone(mol, ring_size):
                        continue
                    valid_count += 1

                    if allowed_ring_atom_types:
                        is_valid, _ = get_macrolactone_numbering(
                            mol, ring_size, allowed_ring_atom_types
                        )[5]
                        if is_valid:
                            filtered_count += 1
                    else:
                        filtered_count += 1
                except Exception:
                    continue

            filter_rate = filtered_count / valid_count * 100 if valid_count > 0 else 0

            all_results[ring_size] = {
                'data_file_exists': True,
                'total_molecules': total_molecules,
                'valid_macrolactones': valid_count,
                'filtered_molecules': filtered_count,
                'filter_rate': filter_rate,
                'examples': size_results
            }

            print(f"\n📊 {ring_size} 元环统计:")
            print(f"   总分子数: {total_molecules}")
            print(f"   有效大环内酯: {valid_count}")
            print(f"   通过筛选: {filtered_count}")
            print(f"   筛选通过率: {filter_rate:.1f}%" if valid_count > 0 else "   筛选通过率: 0%")

        except Exception as e:
            print(f"❌ 统计 {ring_size} 元环时出错: {str(e)}")
            all_results[ring_size] = {
                'data_file_exists': True,
                'error': str(e),
                'examples': size_results
            }

    # Overall summary.
    print(f"\n{'='*60}")
    print(f"📈 所有环大小测试总结")
    print(f"{'='*60}")

    total_molecules = sum(result.get('total_molecules', 0) for result in all_results.values())
    total_valid = sum(result.get('valid_macrolactones', 0) for result in all_results.values())
    total_filtered = sum(result.get('filtered_molecules', 0) for result in all_results.values())

    print(f"总分子数: {total_molecules}")
    print(f"总有效大环内酯: {total_valid}")
    print(f"总通过筛选: {total_filtered}")
    print(f"总体筛选通过率: {total_filtered/total_valid*100:.1f}%" if total_valid > 0 else "总体筛选通过率: 0%")

    return all_results


# The name starts with "test_", so pytest would try to collect this utility
# (and fail on its parameters) in any test module that imports it.
test_all_ring_sizes_with_filtering.__test__ = False


def analyze_ring_atom_composition(mol: Union[Chem.Mol, str], ring_size: int) -> Dict[str, int]:
    """
    Count the element symbols of the macrolactone ring atoms.

    The ester oxygen is excluded from the count. No atom-type filter is
    applied when locating the ring.

    Args:
        mol: RDKit molecule object or SMILES string
        ring_size: Size of the macrolactone ring

    Returns:
        Dict mapping element symbol to the number of ring atoms of that
        element; an empty dict if the SMILES cannot be parsed or the ring
        numbering fails
    """
    # Parse once up front so the numbering step and the counting loop below
    # work on the same molecule object.
    if isinstance(mol, str):
        mol = Chem.MolFromSmiles(mol)
        if mol is None:
            return {}

    ring_atoms, _, _, _, ester_oxygen, (is_valid, _) = \
        get_macrolactone_numbering(mol, ring_size)

    if not is_valid or not ring_atoms:
        return {}

    composition = {}
    for atom_idx in ring_atoms:
        if atom_idx == ester_oxygen:
            continue  # the ester oxygen is not part of the composition

        symbol = mol.GetAtomWithIdx(atom_idx).GetSymbol()
        composition[symbol] = composition.get(symbol, 0) + 1

    return composition


def draw_mol_with_ring_numbering(
    mol: Union[Chem.Mol, str],
    ring_numbering: Optional[Dict[int, int]] = None,
    ring_atoms: Optional[List[int]] = None,
    size: Tuple[int, int] = (800, 800),
    title: str = "",
    ring_size: int = 16,
    return_svg: bool = False
) -> Optional[str]:
    """
    Draw molecule with ring numbering displayed.

    This function can work in two modes:
    1. If ring_numbering is provided, use it directly
    2. If ring_numbering is None, automatically compute it for macrolactones
       (in this mode any ring_atoms argument is replaced by the computed ring)

    Args:
        mol: RDKit molecule object or SMILES string
        ring_numbering: Optional pre-computed numbering dictionary
        ring_atoms: Optional pre-computed ring atoms list; defaults to the
            keys of ring_numbering
        size: Image size (width, height)
        title: Accepted for API compatibility; currently not rendered
        ring_size: Ring size for auto-numbering (default 16)
        return_svg: If True, return SVG string instead of displaying

    Returns:
        SVG string if return_svg=True, None otherwise (displays in notebook).
        None is also returned when the SMILES cannot be parsed or the
        numbering cannot be computed.
    """
    # Convert SMILES to molecule if needed
    if isinstance(mol, str):
        mol = Chem.MolFromSmiles(mol)
        if mol is None:
            print("Error: Could not parse SMILES")
            return None

    # Auto-compute numbering if not provided
    if ring_numbering is None:
        # get_macrolactone_numbering returns six items; only the first two
        # are needed here.
        ring_atoms, ring_numbering, *_ = get_macrolactone_numbering(mol, ring_size)
        if ring_numbering is None:
            print("Error: Could not compute ring numbering")
            return None

    # Get ring atoms if not provided
    if ring_atoms is None:
        ring_atoms = list(ring_numbering.keys())

    # Create drawer
    drawer = rdMolDraw2D.MolDraw2DSVG(size[0], size[1])
    drawer.SetFontSize(6)  # Minimum font size is 6

    draw_options = drawer.drawOptions()
    draw_options.addAtomIndices = False

    # Highlight every ring atom in light blue
    highlight_atoms = list(ring_atoms)
    atom_colors = {atom_idx: (0.8, 0.9, 1.0) for atom_idx in ring_atoms}

    # Annotate a copy so the caller's molecule keeps its properties
    mol_copy = Chem.Mol(mol)
    for atom_idx in ring_atoms:
        if atom_idx in ring_numbering:
            mol_copy.GetAtomWithIdx(atom_idx).SetProp(
                "atomNote", str(ring_numbering[atom_idx])
            )

    # Draw molecule
    drawer.DrawMolecule(
        mol_copy,
        highlightAtoms=highlight_atoms,
        highlightAtomColors=atom_colors
    )

    drawer.FinishDrawing()
    svg = drawer.GetDrawingText()

    if return_svg:
        return svg

    display(SVG(svg))
    return None


def visualize_molecule_with_numbering(
    mol: Union[Chem.Mol, str],
    ring_size: int = 16,
    size: Tuple[int, int] = (800, 800),
    title: str = ""
) -> Tuple[Optional[Dict[int, int]], Optional[List[int]]]:
    """
    Convenience function to visualize a molecule with automatic ring numbering.

    This is the main function to use in Jupyter notebooks for quick visualization.
    It prints a short summary and then displays the drawing.

    Args:
        mol: RDKit molecule object or SMILES string
        ring_size: Ring size for macrolactones (default 16)
        size: Image size (width, height)
        title: Optional title (forwarded to the drawing function)

    Returns:
        Tuple of (ring_numbering_dict, ring_atoms_list), or (None, None) if
        the numbering could not be computed
    """
    # get_macrolactone_numbering returns six items; the last one is the
    # (is_valid, reason) pair.
    ring_atoms, ring_numbering, ordered_atoms, carbonyl_carbon, ester_oxygen, _validation = \
        get_macrolactone_numbering(mol, ring_size)

    if ring_numbering is None:
        print("Error: Could not compute ring numbering")
        return None, None

    # Display
    print(f"Ring size: {ring_size}")
    print(f"Carbonyl C position: {ring_numbering.get(carbonyl_carbon, 'N/A')}")
    print(f"Ester O position: {ring_numbering.get(ester_oxygen, 'N/A')}")
    print(f"Numbering range: 1-{len(ring_numbering)}")

    draw_mol_with_ring_numbering(
        mol, ring_numbering, ring_atoms, size, title, ring_size
    )

    return ring_numbering, ring_atoms


def identify_side_chains(mol: Chem.Mol, ring_atoms: List[int]) -> List[Tuple[int, int]]:
    """
    List every bond that leaves the ring.

    Each ring atom is inspected in the order given by ``ring_atoms``; any
    neighbor whose index is not itself in ``ring_atoms`` is reported as the
    first atom of a side chain.

    Args:
        mol: RDKit molecule object
        ring_atoms: List of atom indices in the ring

    Returns:
        List of tuples (ring_atom_idx, side_chain_first_atom_idx)
    """
    in_ring = set(ring_atoms)
    attachments: List[Tuple[int, int]] = []

    for anchor_idx in ring_atoms:
        neighbor_ids = (
            nbr.GetIdx() for nbr in mol.GetAtomWithIdx(anchor_idx).GetNeighbors()
        )
        # Keep only the neighbors that lie outside the ring.
        attachments.extend(
            (anchor_idx, nbr_idx) for nbr_idx in neighbor_ids if nbr_idx not in in_ring
        )

    return attachments


def extract_side_chain_fragment(
    mol: Chem.Mol,
    ring_atom_idx: int,
    side_chain_start_idx: int,
    ring_atoms: List[int]
) -> Optional[str]:
    """
    Extract a side chain fragment as a SMILES string with dummy atom (*) at attachment point.

    The side chain consists of ``side_chain_start_idx`` plus every atom
    reachable from it without stepping onto an atom listed in ``ring_atoms``.
    Those atoms and the bonds between them are copied into a new molecule,
    and a dummy atom is bonded to the start atom using the same bond type as
    the original ring-to-side-chain bond.

    Args:
        mol: RDKit molecule object
        ring_atom_idx: Index of the ring atom where side chain is attached
        side_chain_start_idx: Index of the first atom in the side chain
        ring_atoms: List of all ring atom indices

    Returns:
        SMILES string of the fragment (contains dummy atom *), or None if
        there is no bond between ``ring_atom_idx`` and
        ``side_chain_start_idx`` or if building/sanitizing the fragment fails
    """
    ring_atom_set = set(ring_atoms)

    # Breadth-first walk over the side chain. Atoms are marked as seen when
    # they are enqueued, so each one enters the queue exactly once, and a
    # deque gives O(1) pops from the front.
    seen = {side_chain_start_idx}
    pending = deque([side_chain_start_idx])
    side_chain_atoms = []
    while pending:
        current_idx = pending.popleft()
        side_chain_atoms.append(current_idx)
        for neighbor in mol.GetAtomWithIdx(current_idx).GetNeighbors():
            neighbor_idx = neighbor.GetIdx()
            # Only continue into non-ring atoms that are not queued yet
            if neighbor_idx not in ring_atom_set and neighbor_idx not in seen:
                seen.add(neighbor_idx)
                pending.append(neighbor_idx)

    # The bond to the ring decides the bond order used for the dummy atom
    bond_to_ring = mol.GetBondBetweenAtoms(ring_atom_idx, side_chain_start_idx)
    if bond_to_ring is None:
        return None
    bond_type = bond_to_ring.GetBondType()

    # Create a new molecule with only the side chain atoms
    fragment_mol = Chem.RWMol()
    old_to_new = {}

    # Only atomic number, formal charge and the aromatic flag are copied.
    # NOTE(review): explicit H counts, chirality tags and isotopes are not
    # carried over, so they are presumably absent from the output SMILES -
    # confirm this is intended.
    for old_idx in side_chain_atoms:
        atom = mol.GetAtomWithIdx(old_idx)
        new_atom = Chem.Atom(atom.GetAtomicNum())
        new_atom.SetFormalCharge(atom.GetFormalCharge())
        new_atom.SetIsAromatic(atom.GetIsAromatic())
        old_to_new[old_idx] = fragment_mol.AddAtom(new_atom)

    # Add bonds within the side chain; the index comparison makes sure each
    # bond is added once even though it is seen from both of its ends.
    for old_idx in side_chain_atoms:
        for neighbor in mol.GetAtomWithIdx(old_idx).GetNeighbors():
            neighbor_idx = neighbor.GetIdx()
            if neighbor_idx in old_to_new and old_idx < neighbor_idx:
                bond = mol.GetBondBetweenAtoms(old_idx, neighbor_idx)
                fragment_mol.AddBond(
                    old_to_new[old_idx],
                    old_to_new[neighbor_idx],
                    bond.GetBondType()
                )

    # Add dummy atom (atomic number 0, displays as * in SMILES) on the atom
    # that was originally connected to the ring, keeping the original bond type
    attachment_point = old_to_new[side_chain_start_idx]
    dummy_atom_idx = fragment_mol.AddAtom(Chem.Atom(0))
    fragment_mol.AddBond(attachment_point, dummy_atom_idx, bond_type)

    # Convert to molecule and get SMILES. Failure here is reported as None
    # (deliberate best-effort), matching the documented return contract.
    try:
        finished_mol = fragment_mol.GetMol()
        Chem.SanitizeMol(finished_mol)
        return Chem.MolToSmiles(finished_mol)
    except Exception:
        return None


def cleave_side_chain_at_position(
    mol: Chem.Mol,
    ring_atoms: List[int],
    ring_numbering: Dict[int, int],
    position: int
) -> List[str]:
    """
    Collect the side-chain fragments hanging off one numbered ring position.

    Args:
        mol: RDKit molecule object
        ring_atoms: List of ring atom indices
        ring_numbering: Ring numbering dictionary mapping atom index to position
        position: Position number to cleave (1-N, where N is ring size)

    Returns:
        List of SMILES strings, one per side chain that could be extracted
        (each contains dummy atom *, compatible with molzip for
        reconstruction). Empty when no ring atom carries ``position``.
    """
    # First atom (in dict order) whose assigned position matches.
    target_idx = next(
        (idx for idx, assigned in ring_numbering.items() if assigned == position),
        None,
    )
    if target_idx is None:
        return []

    # Try every outgoing bond that starts at the target ring atom.
    candidates = (
        extract_side_chain_fragment(mol, anchor, first_atom, ring_atoms)
        for anchor, first_atom in identify_side_chains(mol, ring_atoms)
        if anchor == target_idx
    )
    # Failed extractions (None / empty string) are dropped.
    return [smiles for smiles in candidates if smiles]