feat(validation): add SQLModel database models and fix relationships

2026-03-19 10:27:20 +08:00
parent d0cdf50ed5
commit d18133ce16
6 changed files with 2091 additions and 13 deletions
--- a/docs/plans/2026-03-19-macrolactone-validation-design.md
+++ b/docs/plans/2026-03-19-macrolactone-validation-design.md
@@ -0,0 +1,475 @@
+# MacrolactoneDB 12-20元环验证方案设计
+
+## 1. 验证目标与范围
+
+### 1.1 目标
+- 验证 `macro_lactone_toolkit` 对 MacrolactoneDB 12-20元环大环内酯的识别准确性
+- 验证环编号正确性（位置1=羰基碳，位置2=酯键氧）
+- 验证侧链断裂功能（仅针对标准大环内酯）
+- 使用同位素标记技术标记裂解位置，便于后续拼接
+
+### 1.2 范围
+- **数据来源**: `/data/MacrolactoneDB/ring12_20/temp.csv` (11,037分子)
+- **抽样策略**: 分层随机抽样10% (~1,100分子)，按环大小12-20均匀分布
+- **处理对象**: 仅处理 `classification="standard_macrolactone"` 的分子
+- **输出**: 可视化图片 + SQLite数据库 + 汇总统计
+
+## 2. 数据库设计 (SQLModel)
+
+### 2.1 模型定义
+
+```python
+from typing import Optional, List
+from sqlmodel import SQLModel, Field, Relationship
+from datetime import datetime
+from enum import Enum
+
+class ClassificationType(str, Enum):
+    STANDARD = "standard_macrolactone"
+    NON_STANDARD = "non_standard_macrocycle"
+    NOT_MACROLACTONE = "not_macrolactone"
+
+class ProcessingStatus(str, Enum):
+    PENDING = "pending"
+    SUCCESS = "success"
+    FAILED = "failed"
+    SKIPPED = "skipped"
+
+# ==================== 主分子表 ====================
+class ParentMolecule(SQLModel, table=True):
+    """原始分子信息表"""
+    __tablename__ = "parent_molecules"
+
+    id: Optional[int] = Field(default=None, primary_key=True)
+
+    # 原始数据
+    source_id: str = Field(index=True)  # 来自CSV的IDs字段
+    molecule_name: Optional[str] = None  # molecule_pref_name
+    smiles: str = Field(index=True)
+
+    # 分类结果
+    classification: ClassificationType = Field(index=True)
+    ring_size: Optional[int] = Field(default=None, index=True)
+    primary_reason_code: Optional[str] = None
+    primary_reason_message: Optional[str] = None
+
+    # 处理状态
+    processing_status: ProcessingStatus = Field(default=ProcessingStatus.PENDING)
+    error_message: Optional[str] = None
+
+    # 元数据
+    created_at: datetime = Field(default_factory=datetime.utcnow)
+    processed_at: Optional[datetime] = None
+
+    # 关系
+    fragments: List["SideChainFragment"] = Relationship(back_populates="parent")
+    numbering: Optional["RingNumbering"] = Relationship(back_populates="parent")
+
+# ==================== 环编号表 ====================
+class RingNumbering(SQLModel, table=True):
+    """环编号详细信息"""
+    __tablename__ = "ring_numberings"
+
+    id: Optional[int] = Field(default=None, primary_key=True)
+    parent_id: int = Field(foreign_key="parent_molecules.id", unique=True)
+
+    # 环基本信息
+    ring_size: int
+    carbonyl_carbon_idx: int  # 位置1
+    ester_oxygen_idx: int     # 位置2
+
+    # 原子映射 (JSON存储)
+    position_to_atom: str  # JSON: {"1": 5, "2": 10, ...}
+    atom_to_position: str  # JSON: {"5": 1, "10": 2, ...}
+
+    # 关系
+    parent: Optional[ParentMolecule] = Relationship(back_populates="numbering")
+
+# ==================== 侧链片段表 ====================
+class SideChainFragment(SQLModel, table=True):
+    """裂解产生的侧链片段"""
+    __tablename__ = "side_chain_fragments"
+
+    id: Optional[int] = Field(default=None, primary_key=True)
+    parent_id: int = Field(foreign_key="parent_molecules.id", index=True)
+
+    # 片段标识
+    fragment_id: str = Field(index=True)  # {source_id}_frag_{n}
+
+    # 裂解位置信息
+    cleavage_position: int = Field(index=True)  # 环上的断裂位置
+    attachment_atom_idx: int  # 母环上的连接原子索引
+    attachment_atom_symbol: str  # C, O, N等
+
+    # SMILES (关键：同位素标记用于标识裂解位置)
+    fragment_smiles_labeled: str   # 带同位素标记，如 [5*]CCO
+    fragment_smiles_plain: str     # 无标记，如 *CCO
+
+    # 同位素标记值 (用于后续拼接)
+    dummy_isotope: int  # 裂解位置的编号，用于重建连接关系
+
+    # 物理化学性质
+    atom_count: int
+    heavy_atom_count: int
+    molecular_weight: float
+
+    # 连接信息 (用于后续拼接)
+    original_bond_type: str  # SINGLE, DOUBLE, AROMATIC等
+
+    # 关系
+    parent: Optional[ParentMolecule] = Relationship(back_populates="fragments")
+
+# ==================== 验证结果表 ====================
+class ValidationResult(SQLModel, table=True):
+    """人工验证结果记录"""
+    __tablename__ = "validation_results"
+
+    id: Optional[int] = Field(default=None, primary_key=True)
+    parent_id: int = Field(foreign_key="parent_molecules.id")
+
+    # 验证字段
+    numbering_correct: Optional[bool] = None  # 编号是否正确
+    cleavage_correct: Optional[bool] = None   # 裂解位置是否正确
+    classification_correct: Optional[bool] = None  # 分类是否正确
+
+    # 备注
+    notes: Optional[str] = None
+    validated_by: Optional[str] = None
+    validated_at: Optional[datetime] = None
+```
+
+### 2.2 数据库初始化
+
+```python
+from sqlmodel import create_engine, Session, SQLModel
+from contextlib import contextmanager
+
+# SQLite文件数据库
+DATABASE_URL = "sqlite:///./validation_output/fragments.db"
+
+engine = create_engine(DATABASE_URL, echo=False)
+
+@contextmanager
+def get_session():
+    with Session(engine) as session:
+        yield session
+
+def init_database():
+    """创建所有表"""
+    SQLModel.metadata.create_all(engine)
+```
+
+## 3. 同位素标记方案 (借鉴Molassembler)
+
+### 3.1 标记策略
+
+```python
+def build_fragment_smiles_with_isotope(
+    mol: Chem.Mol,
+    side_chain_atoms: list[int],
+    side_chain_start_idx: int,
+    ring_atom_idx: int,
+    cleavage_position: int,  # 使用位置编号作为同位素值
+) -> str:
+    """
+    构建带同位素标记的片段SMILES
+
+    关键：用cleavage_position作为dummy原子的同位素值
+    这样后续拼接时能精确知道片段来自哪个位置
+    """
+    # 创建可编辑分子
+    emol = Chem.EditableMol(Chem.Mol(mol))
+
+    # 添加dummy原子替代连接点
+    dummy_atom = Chem.Atom(0)  # 原子序数0 = dummy
+    dummy_atom.SetIsotope(cleavage_position)  # 【关键】用位置编号标记
+    dummy_idx = emol.AddAtom(dummy_atom)
+
+    # 获取原始键类型
+    bond = mol.GetBondBetweenAtoms(ring_atom_idx, side_chain_start_idx)
+    bond_type = bond.GetBondType()
+
+    # 添加dummy原子与侧链起始原子的键
+    emol.AddBond(dummy_idx, side_chain_start_idx, bond_type)
+
+    # 只保留dummy原子和侧链原子
+    atoms_to_keep = set([dummy_idx] + list(side_chain_atoms))
+
+    # 标记要删除的原子
+    for atom_idx in range(mol.GetNumAtoms()):
+        if atom_idx not in atoms_to_keep:
+            emol.RemoveAtom(atom_idx)
+
+    fragment = emol.GetMol()
+    Chem.SanitizeMol(fragment)
+
+    return Chem.MolToSmiles(fragment)
+```
+
+### 3.2 标记示例
+
+| 裂解位置 | 原始结构 | 标记后SMILES | 说明 |
+|---------|---------|-------------|------|
+| 5 | `...C5(CCO)...` | `[5*]CCO` | dummy同位素=5，表示位置5的侧链 |
+| 12 | `...C12(CCCO)...` | `[12*]CCCO` | dummy同位素=12，表示位置12的侧链 |
+
+### 3.3 拼接时使用标记
+
+```python
+# 后续拼接时，可以通过同位素值找到对应的裂解位置
+def find_attachment_position(fragment_smiles: str) -> int:
+    """从片段SMILES中提取裂解位置"""
+    mol = Chem.MolFromSmiles(fragment_smiles)
+    for atom in mol.GetAtoms():
+        if atom.GetAtomicNum() == 0 and atom.GetIsotope() > 0:
+            return atom.GetIsotope()  # 返回位置编号
+    return 0
+```
+
+## 4. 输出目录结构
+
+```
+validation_output/
+├── README.md                    # 目录结构说明
+├── fragments.db                 # SQLite数据库
+├── summary.csv                  # 主汇总表 (所有分子)
+├── summary_statistics.json      # 统计信息
+│
+├── ring_size_12/                # 12元环
+├── ring_size_13/                # 13元环
+├── ring_size_14/                # 14元环
+├── ring_size_15/                # 15元环
+├── ring_size_16/                # 16元环
+├── ring_size_17/                # 17元环
+├── ring_size_18/                # 18元环
+├── ring_size_19/                # 19元环
+└── ring_size_20/                # 20元环
+    │
+    ├── molecules.csv            # 该环大小的所有分子
+    │
+    ├── standard/                # 标准大环内酯
+    │   ├── numbered/            # 带编号的高亮环图
+    │   │   ├── {source_id}_numbered.png
+    │   │   └── ...
+    │   │
+    │   └── sidechains/          # 侧链片段图
+    │       └── {source_id}/
+    │           ├── {source_id}_frag_0_pos{pos}.png  # 位置信息在文件名
+    │           ├── {source_id}_frag_1_pos{pos}.png
+    │           └── ...
+    │
+    ├── non_standard/            # 非标准大环
+    │   └── original/
+    │       ├── {source_id}_original.png
+    │       └── ...
+    │
+    └── rejected/                # 被拒绝的分子
+        └── original/
+            ├── {source_id}_original.png
+            └── ...
+```
+
+## 5. CSV字段设计
+
+### 5.1 summary.csv
+
+| 字段名 | 类型 | 说明 |
+|--------|------|------|
+| `id` | int | 数据库主键 |
+| `source_id` | str | 原始IDs字段 |
+| `molecule_name` | str | 分子名称 |
+| `smiles` | str | 原始SMILES |
+| `classification` | str | standard/non_standard/not_macrolactone |
+| `ring_size` | int | 检测到的环大小 |
+| `primary_reason_code` | str | 分类原因代码 |
+| `primary_reason_message` | str | 分类原因描述 |
+| `processing_status` | str | pending/success/failed/skipped |
+| `error_message` | str | 错误信息 |
+| `num_sidechains` | int | 侧链数量 |
+| `cleavage_positions` | str | JSON数组 [5, 8, 12] |
+| `numbered_image_path` | str | 编号图相对路径 |
+| `processed_at` | datetime | 处理时间 |
+
+### 5.2 fragments.csv (每个分子的侧链详情)
+
+| 字段名 | 类型 | 说明 |
+|--------|------|------|
+| `fragment_id` | str | 唯一标识 |
+| `source_id` | str | 母分子ID |
+| `cleavage_position` | int | 环上断裂位置 |
+| `attachment_atom_idx` | int | 连接原子索引 |
+| `attachment_atom_symbol` | str | 连接原子类型 |
+| `fragment_smiles_labeled` | str | 带同位素标记的SMILES |
+| `fragment_smiles_plain` | str | 无标记SMILES |
+| `dummy_isotope` | int | 标记值=裂解位置 |
+| `atom_count` | int | 原子数 |
+| `molecular_weight` | float | 分子量 |
+| `original_bond_type` | str | 原始键类型 |
+| `image_path` | str | 片段图路径 |
+
+## 6. 验证脚本架构
+
+```python
+#!/usr/bin/env python3
+"""
+MacrolactoneDB 12-20元环验证脚本
+使用: pixi run python scripts/validate_macrolactone_db.py
+"""
+
+import pandas as pd
+from sqlmodel import Session
+from macro_lactone_toolkit import MacroLactoneAnalyzer, MacrolactoneFragmenter
+from macro_lactone_toolkit.visualization import save_numbered_molecule_png
+
+class MacrolactoneValidator:
+    def __init__(self, sample_ratio=0.1, output_dir="validation_output"):
+        self.analyzer = MacroLactoneAnalyzer()
+        self.fragmenter = MacrolactoneFragmenter()
+        self.sample_ratio = sample_ratio
+        self.output_dir = Path(output_dir)
+
+    def run(self, input_csv: str):
+        # 1. 加载数据
+        df = pd.read_csv(input_csv)
+
+        # 2. 分层抽样
+        sampled = self._stratified_sample(df)
+
+        # 3. 初始化数据库
+        init_database()
+
+        # 4. 处理每个分子
+        for _, row in sampled.iterrows():
+            self._process_molecule(row)
+
+        # 5. 生成汇总
+        self._generate_summary()
+
+    def _stratified_sample(self, df: pd.DataFrame) -> pd.DataFrame:
+        """按环大小分层抽样"""
+        # 先分类所有分子
+        df['classification'] = df['smiles'].apply(
+            lambda s: self.analyzer.classify_macrocycle(s).classification
+        )
+        df['ring_size'] = df['smiles'].apply(
+            lambda s: self.analyzer.classify_macrocycle(s).ring_size
+        )
+
+        # 按ring_size分层，每层抽10%
+        sampled = df.groupby('ring_size').apply(
+            lambda x: x.sample(frac=self.sample_ratio, random_state=42)
+        ).reset_index(drop=True)
+
+        return sampled
+
+    def _process_molecule(self, row: pd.Series):
+        """处理单个分子"""
+        source_id = row['IDs']
+        smiles = row['smiles']
+        classification = row['classification']
+        ring_size = row['ring_size']
+
+        # 保存到ParentMolecule表
+        parent = ParentMolecule(
+            source_id=source_id,
+            molecule_name=row.get('molecule_pref_name'),
+            smiles=smiles,
+            classification=classification,
+            ring_size=ring_size,
+        )
+
+        if classification != ClassificationType.STANDARD:
+            parent.processing_status = ProcessingStatus.SKIPPED
+            self._save_image(smiles, ring_size, source_id, classification)
+            return
+
+        # 处理标准大环内酯
+        try:
+            # 环编号
+            numbering = self.fragmenter.number_molecule(smiles)
+            self._save_numbering_to_db(parent.id, numbering)
+
+            # 生成编号图
+            self._save_numbered_image(smiles, ring_size, source_id)
+
+            # 侧链断裂
+            result = self.fragmenter.fragment_molecule(smiles, parent_id=source_id)
+            self._save_fragments_to_db(parent.id, result)
+            self._save_fragment_images(result, source_id)
+
+            parent.processing_status = ProcessingStatus.SUCCESS
+            parent.num_sidechains = len(result.fragments)
+            parent.cleavage_positions = json.dumps([f.cleavage_position for f in result.fragments])
+
+        except Exception as e:
+            parent.processing_status = ProcessingStatus.FAILED
+            parent.error_message = str(e)
+
+        parent.processed_at = datetime.utcnow()
+
+        with get_session() as session:
+            session.add(parent)
+            session.commit()
+```
+
+## 7. 执行命令
+
+```bash
+# 进入项目目录
+cd /Users/lingyuzeng/project/macro-lactone-sidechain-profiler/macro_split
+
+# 激活pixi环境
+pixi shell
+
+# 运行验证脚本
+python scripts/validate_macrolactone_db.py \
+  --input data/MacrolactoneDB/ring12_20/temp.csv \
+  --output validation_output \
+  --sample-ratio 0.1
+
+# 查看数据库
+sqlite3 validation_output/fragments.db ".tables"
+sqlite3 validation_output/fragments.db "SELECT * FROM parent_molecules LIMIT 5;"
+```
+
+## 8. 人工检查清单
+
+### 8.1 编号正确性检查
+- [ ] 位置1是否为内酯羰基碳（C=O）
+- [ ] 位置2是否为酯键氧（-O-）
+- [ ] 编号是否沿统一方向连续
+- [ ] 桥环/非标准环是否被正确跳过
+
+### 8.2 裂解正确性检查
+- [ ] 位置1和2是否有侧链（应该没有，是内酯本身）
+- [ ] 位置3-N的侧链是否正确识别
+- [ ] dummy原子是否正确标记裂解位置
+- [ ] 键型是否保持（单键/双键）
+
+### 8.3 分类准确性检查
+- [ ] 标准大环内酯是否被正确识别
+- [ ] 非标准大环是否被正确分类并跳过
+- [ ] 非大环内酯是否被拒绝
+
+## 9. 后续拼接接口设计
+
+```python
+def get_fragment_by_position(db_path: str, source_id: str, position: int) -> Optional[SideChainFragment]:
+    """通过位置获取片段，用于后续拼接"""
+    engine = create_engine(f"sqlite:///{db_path}")
+    with Session(engine) as session:
+        statement = select(SideChainFragment).where(
+            SideChainFragment.parent_id == source_id,
+            SideChainFragment.cleavage_position == position
+        )
+        return session.exec(statement).first()
+
+def get_all_positions_for_parent(db_path: str, source_id: str) -> List[int]:
+    """获取某分子的所有裂解位置"""
+    engine = create_engine(f"sqlite:///{db_path}")
+    with Session(engine) as session:
+        statement = select(SideChainFragment.cleavage_position).where(
+            SideChainFragment.parent_id == source_id
+        )
+        return [r[0] for r in session.exec(statement).all()]
+```
--- a/docs/plans/2026-03-19-macrolactone-validation-implementation-plan.md
+++ b/docs/plans/2026-03-19-macrolactone-validation-implementation-plan.md