Files
macrolactone-toolkit/docs/plans/2026-03-19-macrolactone-validation-design.md

16 KiB
Raw Blame History

MacrolactoneDB 12-20元环验证方案设计

1. 验证目标与范围

1.1 目标

  • 验证 macro_lactone_toolkit 对 MacrolactoneDB 12-20元环大环内酯的识别准确性
  • 验证环编号正确性位置1=羰基碳位置2=酯键氧)
  • 验证侧链断裂功能(仅针对标准大环内酯)
  • 使用同位素标记技术标记裂解位置,便于后续拼接

1.2 范围

  • 数据来源: /data/MacrolactoneDB/ring12_20/temp.csv (11,037分子)
  • 抽样策略: 分层随机抽样10% (~1,100分子)按环大小12-20均匀分布
  • 处理对象: 仅处理 classification="standard_macrolactone" 的分子
  • 输出: 可视化图片 + SQLite数据库 + 汇总统计

2. 数据库设计 (SQLModel)

2.1 模型定义

from typing import Optional, List
from sqlmodel import SQLModel, Field, Relationship
from datetime import datetime
from enum import Enum

class ClassificationType(str, Enum):
    STANDARD = "standard_macrolactone"
    NON_STANDARD = "non_standard_macrocycle"
    NOT_MACROLACTONE = "not_macrolactone"

class ProcessingStatus(str, Enum):
    PENDING = "pending"
    SUCCESS = "success"
    FAILED = "failed"
    SKIPPED = "skipped"

# ==================== 主分子表 ====================
class ParentMolecule(SQLModel, table=True):
    """原始分子信息表"""
    __tablename__ = "parent_molecules"

    id: Optional[int] = Field(default=None, primary_key=True)

    # 原始数据
    source_id: str = Field(index=True)  # 来自CSV的IDs字段
    molecule_name: Optional[str] = None  # molecule_pref_name
    smiles: str = Field(index=True)

    # 分类结果
    classification: ClassificationType = Field(index=True)
    ring_size: Optional[int] = Field(default=None, index=True)
    primary_reason_code: Optional[str] = None
    primary_reason_message: Optional[str] = None

    # 处理状态
    processing_status: ProcessingStatus = Field(default=ProcessingStatus.PENDING)
    error_message: Optional[str] = None

    # 元数据
    created_at: datetime = Field(default_factory=datetime.utcnow)
    processed_at: Optional[datetime] = None

    # 关系
    fragments: List["SideChainFragment"] = Relationship(back_populates="parent")
    numbering: Optional["RingNumbering"] = Relationship(back_populates="parent")

# ==================== 环编号表 ====================
class RingNumbering(SQLModel, table=True):
    """环编号详细信息"""
    __tablename__ = "ring_numberings"

    id: Optional[int] = Field(default=None, primary_key=True)
    parent_id: int = Field(foreign_key="parent_molecules.id", unique=True)

    # 环基本信息
    ring_size: int
    carbonyl_carbon_idx: int  # 位置1
    ester_oxygen_idx: int     # 位置2

    # 原子映射 (JSON存储)
    position_to_atom: str  # JSON: {"1": 5, "2": 10, ...}
    atom_to_position: str  # JSON: {"5": 1, "10": 2, ...}

    # 关系
    parent: Optional[ParentMolecule] = Relationship(back_populates="numbering")

# ==================== 侧链片段表 ====================
class SideChainFragment(SQLModel, table=True):
    """裂解产生的侧链片段"""
    __tablename__ = "side_chain_fragments"

    id: Optional[int] = Field(default=None, primary_key=True)
    parent_id: int = Field(foreign_key="parent_molecules.id", index=True)

    # 片段标识
    fragment_id: str = Field(index=True)  # {source_id}_frag_{n}

    # 裂解位置信息
    cleavage_position: int = Field(index=True)  # 环上的断裂位置
    attachment_atom_idx: int  # 母环上的连接原子索引
    attachment_atom_symbol: str  # C, O, N等

    # SMILES (关键:同位素标记用于标识裂解位置)
    fragment_smiles_labeled: str   # 带同位素标记,如 [5*]CCO
    fragment_smiles_plain: str     # 无标记,如 *CCO

    # 同位素标记值 (用于后续拼接)
    dummy_isotope: int  # 裂解位置的编号,用于重建连接关系

    # 物理化学性质
    atom_count: int
    heavy_atom_count: int
    molecular_weight: float

    # 连接信息 (用于后续拼接)
    original_bond_type: str  # SINGLE, DOUBLE, AROMATIC等

    # 关系
    parent: Optional[ParentMolecule] = Relationship(back_populates="fragments")

# ==================== 验证结果表 ====================
class ValidationResult(SQLModel, table=True):
    """人工验证结果记录"""
    __tablename__ = "validation_results"

    id: Optional[int] = Field(default=None, primary_key=True)
    parent_id: int = Field(foreign_key="parent_molecules.id")

    # 验证字段
    numbering_correct: Optional[bool] = None  # 编号是否正确
    cleavage_correct: Optional[bool] = None   # 裂解位置是否正确
    classification_correct: Optional[bool] = None  # 分类是否正确

    # 备注
    notes: Optional[str] = None
    validated_by: Optional[str] = None
    validated_at: Optional[datetime] = None

2.2 数据库初始化

from sqlmodel import create_engine, Session, SQLModel
from contextlib import contextmanager

# SQLite文件数据库
DATABASE_URL = "sqlite:///./validation_output/fragments.db"

engine = create_engine(DATABASE_URL, echo=False)

@contextmanager
def get_session():
    with Session(engine) as session:
        yield session

def init_database():
    """创建所有表"""
    SQLModel.metadata.create_all(engine)

3. 同位素标记方案 (借鉴Molassembler)

3.1 标记策略

def build_fragment_smiles_with_isotope(
    mol: Chem.Mol,
    side_chain_atoms: list[int],
    side_chain_start_idx: int,
    ring_atom_idx: int,
    cleavage_position: int,  # 使用位置编号作为同位素值
) -> str:
    """
    构建带同位素标记的片段SMILES

    关键用cleavage_position作为dummy原子的同位素值
    这样后续拼接时能精确知道片段来自哪个位置
    """
    # 创建可编辑分子
    emol = Chem.EditableMol(Chem.Mol(mol))

    # 添加dummy原子替代连接点
    dummy_atom = Chem.Atom(0)  # 原子序数0 = dummy
    dummy_atom.SetIsotope(cleavage_position)  # 【关键】用位置编号标记
    dummy_idx = emol.AddAtom(dummy_atom)

    # 获取原始键类型
    bond = mol.GetBondBetweenAtoms(ring_atom_idx, side_chain_start_idx)
    bond_type = bond.GetBondType()

    # 添加dummy原子与侧链起始原子的键
    emol.AddBond(dummy_idx, side_chain_start_idx, bond_type)

    # 只保留dummy原子和侧链原子
    atoms_to_keep = set([dummy_idx] + list(side_chain_atoms))

    # 标记要删除的原子
    for atom_idx in range(mol.GetNumAtoms()):
        if atom_idx not in atoms_to_keep:
            emol.RemoveAtom(atom_idx)

    fragment = emol.GetMol()
    Chem.SanitizeMol(fragment)

    return Chem.MolToSmiles(fragment)

3.2 标记示例

裂解位置 原始结构 标记后SMILES 说明
5 ...C5(CCO)... [5*]CCO dummy同位素=5表示位置5的侧链
12 ...C12(CCCO)... [12*]CCCO dummy同位素=12表示位置12的侧链

3.3 拼接时使用标记

# 后续拼接时,可以通过同位素值找到对应的裂解位置
def find_attachment_position(fragment_smiles: str) -> int:
    """从片段SMILES中提取裂解位置"""
    mol = Chem.MolFromSmiles(fragment_smiles)
    for atom in mol.GetAtoms():
        if atom.GetAtomicNum() == 0 and atom.GetIsotope() > 0:
            return atom.GetIsotope()  # 返回位置编号
    return 0

4. 输出目录结构

validation_output/
├── README.md                    # 目录结构说明
├── fragments.db                 # SQLite数据库
├── summary.csv                  # 主汇总表 (所有分子)
├── summary_statistics.json      # 统计信息
│
├── ring_size_12/                # 12元环
├── ring_size_13/                # 13元环
├── ring_size_14/                # 14元环
├── ring_size_15/                # 15元环
├── ring_size_16/                # 16元环
├── ring_size_17/                # 17元环
├── ring_size_18/                # 18元环
├── ring_size_19/                # 19元环
└── ring_size_20/                # 20元环
    │
    ├── molecules.csv            # 该环大小的所有分子
    │
    ├── standard/                # 标准大环内酯
    │   ├── numbered/            # 带编号的高亮环图
    │   │   ├── {source_id}_numbered.png
    │   │   └── ...
    │   │
    │   └── sidechains/          # 侧链片段图
    │       └── {source_id}/
    │           ├── {source_id}_frag_0_pos{pos}.png  # 位置信息在文件名
    │           ├── {source_id}_frag_1_pos{pos}.png
    │           └── ...
    │
    ├── non_standard/            # 非标准大环
    │   └── original/
    │       ├── {source_id}_original.png
    │       └── ...
    │
    └── rejected/                # 被拒绝的分子
        └── original/
            ├── {source_id}_original.png
            └── ...

5. CSV字段设计

5.1 summary.csv

字段名 类型 说明
id int 数据库主键
source_id str 原始IDs字段
molecule_name str 分子名称
smiles str 原始SMILES
classification str standard/non_standard/not_macrolactone
ring_size int 检测到的环大小
primary_reason_code str 分类原因代码
primary_reason_message str 分类原因描述
processing_status str pending/success/failed/skipped
error_message str 错误信息
num_sidechains int 侧链数量
cleavage_positions str JSON数组 [5, 8, 12]
numbered_image_path str 编号图相对路径
processed_at datetime 处理时间

5.2 fragments.csv (每个分子的侧链详情)

字段名 类型 说明
fragment_id str 唯一标识
source_id str 母分子ID
cleavage_position int 环上断裂位置
attachment_atom_idx int 连接原子索引
attachment_atom_symbol str 连接原子类型
fragment_smiles_labeled str 带同位素标记的SMILES
fragment_smiles_plain str 无标记SMILES
dummy_isotope int 标记值=裂解位置
atom_count int 原子数
molecular_weight float 分子量
original_bond_type str 原始键类型
image_path str 片段图路径

6. 验证脚本架构

#!/usr/bin/env python3
"""
MacrolactoneDB 12-20元环验证脚本
使用: pixi run python scripts/validate_macrolactone_db.py
"""

import pandas as pd
from sqlmodel import Session
from macro_lactone_toolkit import MacroLactoneAnalyzer, MacrolactoneFragmenter
from macro_lactone_toolkit.visualization import save_numbered_molecule_png

class MacrolactoneValidator:
    def __init__(self, sample_ratio=0.1, output_dir="validation_output"):
        self.analyzer = MacroLactoneAnalyzer()
        self.fragmenter = MacrolactoneFragmenter()
        self.sample_ratio = sample_ratio
        self.output_dir = Path(output_dir)

    def run(self, input_csv: str):
        # 1. 加载数据
        df = pd.read_csv(input_csv)

        # 2. 分层抽样
        sampled = self._stratified_sample(df)

        # 3. 初始化数据库
        init_database()

        # 4. 处理每个分子
        for _, row in sampled.iterrows():
            self._process_molecule(row)

        # 5. 生成汇总
        self._generate_summary()

    def _stratified_sample(self, df: pd.DataFrame) -> pd.DataFrame:
        """按环大小分层抽样"""
        # 先分类所有分子
        df['classification'] = df['smiles'].apply(
            lambda s: self.analyzer.classify_macrocycle(s).classification
        )
        df['ring_size'] = df['smiles'].apply(
            lambda s: self.analyzer.classify_macrocycle(s).ring_size
        )

        # 按ring_size分层每层抽10%
        sampled = df.groupby('ring_size').apply(
            lambda x: x.sample(frac=self.sample_ratio, random_state=42)
        ).reset_index(drop=True)

        return sampled

    def _process_molecule(self, row: pd.Series):
        """处理单个分子"""
        source_id = row['IDs']
        smiles = row['smiles']
        classification = row['classification']
        ring_size = row['ring_size']

        # 保存到ParentMolecule表
        parent = ParentMolecule(
            source_id=source_id,
            molecule_name=row.get('molecule_pref_name'),
            smiles=smiles,
            classification=classification,
            ring_size=ring_size,
        )

        if classification != ClassificationType.STANDARD:
            parent.processing_status = ProcessingStatus.SKIPPED
            self._save_image(smiles, ring_size, source_id, classification)
            return

        # 处理标准大环内酯
        try:
            # 环编号
            numbering = self.fragmenter.number_molecule(smiles)
            self._save_numbering_to_db(parent.id, numbering)

            # 生成编号图
            self._save_numbered_image(smiles, ring_size, source_id)

            # 侧链断裂
            result = self.fragmenter.fragment_molecule(smiles, parent_id=source_id)
            self._save_fragments_to_db(parent.id, result)
            self._save_fragment_images(result, source_id)

            parent.processing_status = ProcessingStatus.SUCCESS
            parent.num_sidechains = len(result.fragments)
            parent.cleavage_positions = json.dumps([f.cleavage_position for f in result.fragments])

        except Exception as e:
            parent.processing_status = ProcessingStatus.FAILED
            parent.error_message = str(e)

        parent.processed_at = datetime.utcnow()

        with get_session() as session:
            session.add(parent)
            session.commit()

7. 执行命令

# 进入项目目录
cd /Users/lingyuzeng/project/macro-lactone-sidechain-profiler/macro_split

# 激活pixi环境
pixi shell

# 运行验证脚本
python scripts/validate_macrolactone_db.py \
  --input data/MacrolactoneDB/ring12_20/temp.csv \
  --output validation_output \
  --sample-ratio 0.1

# 查看数据库
sqlite3 validation_output/fragments.db ".tables"
sqlite3 validation_output/fragments.db "SELECT * FROM parent_molecules LIMIT 5;"

8. 人工检查清单

8.1 编号正确性检查

  • 位置1是否为内酯羰基碳C=O
  • 位置2是否为酯键氧-O-
  • 编号是否沿统一方向连续
  • 桥环/非标准环是否被正确跳过

8.2 裂解正确性检查

  • 位置1和2是否有侧链应该没有是内酯本身
  • 位置3-N的侧链是否正确识别
  • dummy原子是否正确标记裂解位置
  • 键型是否保持(单键/双键)

8.3 分类准确性检查

  • 标准大环内酯是否被正确识别
  • 非标准大环是否被正确分类并跳过
  • 非大环内酯是否被拒绝

9. 后续拼接接口设计

def get_fragment_by_position(db_path: str, source_id: str, position: int) -> Optional[SideChainFragment]:
    """通过位置获取片段,用于后续拼接"""
    engine = create_engine(f"sqlite:///{db_path}")
    with Session(engine) as session:
        statement = select(SideChainFragment).where(
            SideChainFragment.parent_id == source_id,
            SideChainFragment.cleavage_position == position
        )
        return session.exec(statement).first()

def get_all_positions_for_parent(db_path: str, source_id: str) -> List[int]:
    """获取某分子的所有裂解位置"""
    engine = create_engine(f"sqlite:///{db_path}")
    with Session(engine) as session:
        statement = select(SideChainFragment.cleavage_position).where(
            SideChainFragment.parent_id == source_id
        )
        return [r[0] for r in session.exec(statement).all()]