feat(validation): add SQLModel database models and fix relationships
This commit is contained in:
475
docs/plans/2026-03-19-macrolactone-validation-design.md
Normal file
475
docs/plans/2026-03-19-macrolactone-validation-design.md
Normal file
@@ -0,0 +1,475 @@
|
||||
# MacrolactoneDB 12-20元环验证方案设计
|
||||
|
||||
## 1. 验证目标与范围
|
||||
|
||||
### 1.1 目标
|
||||
- 验证 `macro_lactone_toolkit` 对 MacrolactoneDB 12-20元环大环内酯的识别准确性
|
||||
- 验证环编号正确性(位置1=羰基碳,位置2=酯键氧)
|
||||
- 验证侧链断裂功能(仅针对标准大环内酯)
|
||||
- 使用同位素标记技术标记裂解位置,便于后续拼接
|
||||
|
||||
### 1.2 范围
|
||||
- **数据来源**: `data/MacrolactoneDB/ring12_20/temp.csv` (相对项目根目录,11,037分子)
|
||||
- **抽样策略**: 按环大小(12-20)分层,每层随机抽取10%(合计约1,100个分子)
|
||||
- **处理对象**: 仅处理 `classification="standard_macrolactone"` 的分子
|
||||
- **输出**: 可视化图片 + SQLite数据库 + 汇总统计
|
||||
|
||||
## 2. 数据库设计 (SQLModel)
|
||||
|
||||
### 2.1 模型定义
|
||||
|
||||
```python
|
||||
from typing import Optional, List
|
||||
from sqlmodel import SQLModel, Field, Relationship
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
|
||||
class ClassificationType(str, Enum):
    """Classification outcome recorded for each parent molecule.

    Members are plain strings (``str`` mixin), so they compare equal to the
    raw classification values listed below.
    """

    STANDARD = "standard_macrolactone"
    NON_STANDARD = "non_standard_macrocycle"
    NOT_MACROLACTONE = "not_macrolactone"
|
||||
|
||||
class ProcessingStatus(str, Enum):
    """Processing state of a parent molecule within a validation run."""

    PENDING = "pending"
    SUCCESS = "success"
    FAILED = "failed"
    SKIPPED = "skipped"
|
||||
|
||||
# ==================== 主分子表 ====================
|
||||
class ParentMolecule(SQLModel, table=True):
    """Original molecule record: raw input, classification and processing state."""

    __tablename__ = "parent_molecules"

    id: Optional[int] = Field(default=None, primary_key=True)

    # Raw input data
    source_id: str = Field(index=True)  # value of the CSV "IDs" column
    molecule_name: Optional[str] = None  # CSV "molecule_pref_name" column
    smiles: str = Field(index=True)

    # Classification result
    classification: ClassificationType = Field(index=True)
    ring_size: Optional[int] = Field(default=None, index=True)
    primary_reason_code: Optional[str] = None
    primary_reason_message: Optional[str] = None

    # Processing state
    processing_status: ProcessingStatus = Field(default=ProcessingStatus.PENDING)
    error_message: Optional[str] = None

    # Metadata
    created_at: datetime = Field(default_factory=datetime.utcnow)
    processed_at: Optional[datetime] = None

    # Relationships
    fragments: List["SideChainFragment"] = Relationship(back_populates="parent")
    # One-to-one link: RingNumbering.parent_id is unique, so this side must be
    # a scalar. Without uselist=False SQLAlchemy treats the "one" side of a
    # foreign-key relationship as a collection, contradicting the annotation.
    numbering: Optional["RingNumbering"] = Relationship(
        back_populates="parent",
        sa_relationship_kwargs={"uselist": False},
    )
|
||||
|
||||
# ==================== 环编号表 ====================
|
||||
class RingNumbering(SQLModel, table=True):
    """Ring numbering details for one parent molecule (at most one row per parent)."""

    __tablename__ = "ring_numberings"

    id: Optional[int] = Field(default=None, primary_key=True)
    # unique=True enforces the one-to-one link to parent_molecules.
    parent_id: int = Field(foreign_key="parent_molecules.id", unique=True)

    # Basic ring information
    ring_size: int
    carbonyl_carbon_idx: int  # atom index of ring position 1
    ester_oxygen_idx: int  # atom index of ring position 2

    # Atom mappings, stored as JSON text
    position_to_atom: str  # JSON: {"1": 5, "2": 10, ...}
    atom_to_position: str  # JSON: {"5": 1, "10": 2, ...}

    # Relationships
    parent: Optional[ParentMolecule] = Relationship(back_populates="numbering")
|
||||
|
||||
# ==================== 侧链片段表 ====================
|
||||
class SideChainFragment(SQLModel, table=True):
    """Side-chain fragment produced by cleaving a parent molecule."""

    __tablename__ = "side_chain_fragments"

    id: Optional[int] = Field(default=None, primary_key=True)
    parent_id: int = Field(foreign_key="parent_molecules.id", index=True)

    # Fragment identifier
    fragment_id: str = Field(index=True)  # {source_id}_frag_{n}

    # Cleavage site information
    cleavage_position: int = Field(index=True)  # ring position where the bond was cut
    attachment_atom_idx: int  # index of the attachment atom on the parent ring
    attachment_atom_symbol: str  # C, O, N, ...

    # SMILES (the isotope label identifies the cleavage position)
    fragment_smiles_labeled: str  # with isotope label, e.g. [5*]CCO
    fragment_smiles_plain: str  # without label, e.g. *CCO

    # Isotope value on the dummy atom (used for later reassembly)
    dummy_isotope: int  # cleavage position number, used to rebuild the connection

    # Physico-chemical properties
    atom_count: int
    heavy_atom_count: int
    molecular_weight: float

    # Connection information (used for later reassembly)
    original_bond_type: str  # SINGLE, DOUBLE, AROMATIC, ...

    # Relationships
    parent: Optional[ParentMolecule] = Relationship(back_populates="fragments")
|
||||
|
||||
# ==================== 验证结果表 ====================
|
||||
class ValidationResult(SQLModel, table=True):
    """Manual validation verdicts recorded for a parent molecule."""

    __tablename__ = "validation_results"

    id: Optional[int] = Field(default=None, primary_key=True)
    parent_id: int = Field(foreign_key="parent_molecules.id")

    # Validation verdicts (None = not reviewed yet)
    numbering_correct: Optional[bool] = None  # is the ring numbering correct
    cleavage_correct: Optional[bool] = None  # are the cleavage positions correct
    classification_correct: Optional[bool] = None  # is the classification correct

    # Reviewer notes
    notes: Optional[str] = None
    validated_by: Optional[str] = None
    validated_at: Optional[datetime] = None
|
||||
```
|
||||
|
||||
### 2.2 数据库初始化
|
||||
|
||||
```python
|
||||
from sqlmodel import create_engine, Session, SQLModel
|
||||
from contextlib import contextmanager
|
||||
|
||||
# SQLite文件数据库
|
||||
DATABASE_URL = "sqlite:///./validation_output/fragments.db"
|
||||
|
||||
engine = create_engine(DATABASE_URL, echo=False)
|
||||
|
||||
@contextmanager
def get_session():
    """Yield a ``Session`` bound to the module-level ``engine``.

    The session is opened with a ``with`` block, so it is closed when the
    caller's ``with get_session()`` block exits. Committing is left to the
    caller.
    """
    with Session(engine) as session:
        yield session
|
||||
|
||||
def init_database():
    """Create all tables registered on ``SQLModel.metadata``.

    The directory that holds the SQLite file is created first: SQLite creates
    a missing database file but not a missing parent directory, so on a fresh
    checkout ``create_all`` would otherwise fail to open the database.
    """
    from pathlib import Path

    db_file = engine.url.database
    # In-memory databases have no file on disk and need no directory.
    if db_file and db_file != ":memory:":
        Path(db_file).parent.mkdir(parents=True, exist_ok=True)

    SQLModel.metadata.create_all(engine)
|
||||
```
|
||||
|
||||
## 3. 同位素标记方案 (借鉴Molassembler)
|
||||
|
||||
### 3.1 标记策略
|
||||
|
||||
```python
|
||||
def build_fragment_smiles_with_isotope(
    mol: Chem.Mol,
    side_chain_atoms: list[int],
    side_chain_start_idx: int,
    ring_atom_idx: int,
    cleavage_position: int,  # ring position number, used as the isotope value
) -> str:
    """Build the fragment SMILES with an isotope-labelled dummy atom.

    The dummy atom replaces the ring atom at the cut bond and carries
    ``cleavage_position`` as its isotope, so later reassembly can tell which
    ring position the fragment came from.

    Args:
        mol: Parent molecule; it is copied, not modified.
        side_chain_atoms: Atom indices (in ``mol``) belonging to the side chain.
        side_chain_start_idx: Side-chain atom bonded to the ring.
        ring_atom_idx: Ring atom the side chain is attached to.
        cleavage_position: Ring position number stored as the dummy isotope.

    Returns:
        SMILES of the sanitized fragment, including the labelled dummy atom.

    Raises:
        ValueError: If ``ring_atom_idx`` and ``side_chain_start_idx`` are not bonded.
    """
    bond = mol.GetBondBetweenAtoms(ring_atom_idx, side_chain_start_idx)
    if bond is None:
        raise ValueError(
            f"atoms {ring_atom_idx} and {side_chain_start_idx} are not bonded"
        )
    bond_type = bond.GetBondType()

    # Work on an editable copy of the molecule.
    emol = Chem.EditableMol(Chem.Mol(mol))

    # Add the dummy atom that stands in for the attachment point.
    dummy_atom = Chem.Atom(0)  # atomic number 0 = dummy
    dummy_atom.SetIsotope(cleavage_position)  # key step: label with the position
    dummy_idx = emol.AddAtom(dummy_atom)

    # Connect the dummy to the side-chain start atom with the original bond type.
    emol.AddBond(dummy_idx, side_chain_start_idx, bond_type)

    # Keep only the dummy atom and the side-chain atoms.
    atoms_to_keep = {dummy_idx, *side_chain_atoms}

    # Remove from the highest index down: removing an atom renumbers the atoms
    # after it, so an ascending loop would delete the wrong atoms once the
    # first removal has shifted the indices.
    for atom_idx in range(mol.GetNumAtoms() - 1, -1, -1):
        if atom_idx not in atoms_to_keep:
            emol.RemoveAtom(atom_idx)

    fragment = emol.GetMol()
    Chem.SanitizeMol(fragment)

    return Chem.MolToSmiles(fragment)
|
||||
```
|
||||
|
||||
### 3.2 标记示例
|
||||
|
||||
| 裂解位置 | 原始结构 | 标记后SMILES | 说明 |
|
||||
|---------|---------|-------------|------|
|
||||
| 5 | `...C5(CCO)...` | `[5*]CCO` | dummy同位素=5,表示位置5的侧链 |
|
||||
| 12 | `...C12(CCCO)...` | `[12*]CCCO` | dummy同位素=12,表示位置12的侧链 |
|
||||
|
||||
### 3.3 拼接时使用标记
|
||||
|
||||
```python
|
||||
# 后续拼接时,可以通过同位素值找到对应的裂解位置
|
||||
def find_attachment_position(fragment_smiles: str) -> int:
    """Extract the cleavage position from a labelled fragment SMILES.

    Args:
        fragment_smiles: Fragment SMILES whose dummy atom carries the ring
            position as its isotope, e.g. ``[5*]CCO``.

    Returns:
        The isotope of the first dummy atom with a non-zero isotope, or ``0``
        when the fragment has no labelled dummy atom.

    Raises:
        ValueError: If ``fragment_smiles`` cannot be parsed.
    """
    mol = Chem.MolFromSmiles(fragment_smiles)
    # MolFromSmiles signals a parse failure by returning None, not by raising.
    if mol is None:
        raise ValueError(f"invalid fragment SMILES: {fragment_smiles!r}")

    for atom in mol.GetAtoms():
        if atom.GetAtomicNum() == 0 and atom.GetIsotope() > 0:
            return atom.GetIsotope()  # the ring position number
    return 0
|
||||
```
|
||||
|
||||
## 4. 输出目录结构
|
||||
|
||||
```
|
||||
validation_output/
├── README.md                         # 目录结构说明
├── fragments.db                      # SQLite数据库
├── summary.csv                       # 主汇总表 (所有分子)
├── summary_statistics.json           # 统计信息
│
├── ring_size_12/                     # 12元环
├── ring_size_13/                     # 13元环
├── ring_size_14/                     # 14元环
├── ring_size_15/                     # 15元环
├── ring_size_16/                     # 16元环
├── ring_size_17/                     # 17元环
├── ring_size_18/                     # 18元环
├── ring_size_19/                     # 19元环
└── ring_size_20/                     # 20元环 (每个 ring_size_N/ 目录结构相同)
    │
    ├── molecules.csv                 # 该环大小的所有分子
    │
    ├── standard/                     # 标准大环内酯
    │   ├── numbered/                 # 带编号的高亮环图
    │   │   ├── {source_id}_numbered.png
    │   │   └── ...
    │   │
    │   └── sidechains/               # 侧链片段图
    │       └── {source_id}/
    │           ├── {source_id}_frag_0_pos{pos}.png  # 位置信息在文件名
    │           ├── {source_id}_frag_1_pos{pos}.png
    │           └── ...
    │
    ├── non_standard/                 # 非标准大环
    │   └── original/
    │       ├── {source_id}_original.png
    │       └── ...
    │
    └── rejected/                     # 被拒绝的分子
        └── original/
            ├── {source_id}_original.png
            └── ...
|
||||
```
|
||||
|
||||
## 5. CSV字段设计
|
||||
|
||||
### 5.1 summary.csv
|
||||
|
||||
| 字段名 | 类型 | 说明 |
|
||||
|--------|------|------|
|
||||
| `id` | int | 数据库主键 |
|
||||
| `source_id` | str | 原始IDs字段 |
|
||||
| `molecule_name` | str | 分子名称 |
|
||||
| `smiles` | str | 原始SMILES |
|
||||
| `classification` | str | standard_macrolactone/non_standard_macrocycle/not_macrolactone |
|
||||
| `ring_size` | int | 检测到的环大小 |
|
||||
| `primary_reason_code` | str | 分类原因代码 |
|
||||
| `primary_reason_message` | str | 分类原因描述 |
|
||||
| `processing_status` | str | pending/success/failed/skipped |
|
||||
| `error_message` | str | 错误信息 |
|
||||
| `num_sidechains` | int | 侧链数量 |
|
||||
| `cleavage_positions` | str | JSON数组 [5, 8, 12] |
|
||||
| `numbered_image_path` | str | 编号图相对路径 |
|
||||
| `processed_at` | datetime | 处理时间 |
|
||||
|
||||
### 5.2 fragments.csv (每个分子的侧链详情)
|
||||
|
||||
| 字段名 | 类型 | 说明 |
|
||||
|--------|------|------|
|
||||
| `fragment_id` | str | 唯一标识 |
|
||||
| `source_id` | str | 母分子ID |
|
||||
| `cleavage_position` | int | 环上断裂位置 |
|
||||
| `attachment_atom_idx` | int | 连接原子索引 |
|
||||
| `attachment_atom_symbol` | str | 连接原子类型 |
|
||||
| `fragment_smiles_labeled` | str | 带同位素标记的SMILES |
|
||||
| `fragment_smiles_plain` | str | 无标记SMILES |
|
||||
| `dummy_isotope` | int | 标记值=裂解位置 |
|
||||
| `atom_count` | int | 原子数 |
|
||||
| `molecular_weight` | float | 分子量 |
|
||||
| `original_bond_type` | str | 原始键类型 |
|
||||
| `image_path` | str | 片段图路径 |
|
||||
|
||||
## 6. 验证脚本架构
|
||||
|
||||
```python
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
MacrolactoneDB 12-20元环验证脚本
|
||||
使用: pixi run python scripts/validate_macrolactone_db.py
|
||||
"""
|
||||
|
||||
import json
from datetime import datetime
from pathlib import Path

import pandas as pd
from sqlmodel import Session

from macro_lactone_toolkit import MacroLactoneAnalyzer, MacrolactoneFragmenter
from macro_lactone_toolkit.visualization import save_numbered_molecule_png
|
||||
|
||||
class MacrolactoneValidator:
    """Run the sampling, numbering and fragmentation validation pipeline."""

    def __init__(self, sample_ratio=0.1, output_dir="validation_output"):
        """Store the run configuration and create the analyzer and fragmenter.

        Args:
            sample_ratio: Fraction of molecules drawn from every ring-size stratum.
            output_dir: Directory that receives images and summary files.
        """
        self.analyzer = MacroLactoneAnalyzer()
        self.fragmenter = MacrolactoneFragmenter()
        self.sample_ratio = sample_ratio
        self.output_dir = Path(output_dir)

    def run(self, input_csv: str):
        """Execute the whole validation run for ``input_csv``."""
        # 1. Load the data
        df = pd.read_csv(input_csv)

        # 2. Stratified sampling
        sampled = self._stratified_sample(df)

        # 3. Initialise the database
        init_database()

        # 4. Process every sampled molecule
        for _, row in sampled.iterrows():
            self._process_molecule(row)

        # 5. Generate the summary
        self._generate_summary()

    def _stratified_sample(self, df: pd.DataFrame) -> pd.DataFrame:
        """Classify every molecule and sample each ring-size stratum.

        Each SMILES is classified exactly once and both result columns are
        derived from that single call. The input frame is not modified; the
        returned frame carries the extra ``classification`` and ``ring_size``
        columns.
        """
        results = [self.analyzer.classify_macrocycle(s) for s in df['smiles']]
        annotated = df.assign(
            classification=[r.classification for r in results],
            ring_size=[r.ring_size for r in results],
        )

        # Molecules without a detected ring size form their own stratum (-1, a
        # value no real ring can have) so they are sampled too; grouping on
        # the raw column would silently drop every row whose ring_size is
        # missing.
        strata = annotated['ring_size'].fillna(-1).to_numpy()
        sampled = annotated.groupby(strata).sample(
            frac=self.sample_ratio, random_state=42
        )
        return sampled.reset_index(drop=True)

    def _process_molecule(self, row: pd.Series):
        """Persist one sampled molecule and run the pipeline that fits its class.

        The parent row is written first so that it has a database id for the
        numbering and fragment rows, and so that skipped molecules are
        recorded as well. The final status is written once processing ends.
        """
        source_id = row['IDs']
        smiles = row['smiles']
        # Coerce to the enum so the column receives an enum member.
        # NOTE(review): assumes the analyzer's classification values match the
        # ClassificationType values - confirm.
        classification = ClassificationType(row['classification'])
        # pandas represents a missing ring size / name as NaN; store NULL instead.
        ring_size = None if pd.isna(row['ring_size']) else int(row['ring_size'])
        molecule_name = row.get('molecule_pref_name')
        if pd.isna(molecule_name):
            molecule_name = None

        parent = ParentMolecule(
            source_id=source_id,
            molecule_name=molecule_name,
            smiles=smiles,
            classification=classification,
            ring_size=ring_size,
        )
        with get_session() as session:
            session.add(parent)
            session.commit()
            # Read the generated primary key while the session is still open.
            parent_id = parent.id

        error_message = None
        if classification != ClassificationType.STANDARD:
            status = ProcessingStatus.SKIPPED
            self._save_image(smiles, ring_size, source_id, classification)
        else:
            # Standard macrolactone: numbering, numbered image, fragmentation.
            try:
                numbering = self.fragmenter.number_molecule(smiles)
                self._save_numbering_to_db(parent_id, numbering)

                self._save_numbered_image(smiles, ring_size, source_id)

                result = self.fragmenter.fragment_molecule(smiles, parent_id=source_id)
                self._save_fragments_to_db(parent_id, result)
                self._save_fragment_images(result, source_id)

                # The side-chain count and cleavage positions are not columns
                # of ParentMolecule; they can be derived from the stored
                # fragment rows when the summary is generated.
                status = ProcessingStatus.SUCCESS
            except Exception as e:
                # Per-molecule boundary: record the failure and continue the run.
                status = ProcessingStatus.FAILED
                error_message = str(e)

        with get_session() as session:
            stored = session.get(ParentMolecule, parent_id)
            stored.processing_status = status
            stored.error_message = error_message
            stored.processed_at = datetime.utcnow()
            session.add(stored)
            session.commit()
|
||||
```
|
||||
|
||||
## 7. 执行命令
|
||||
|
||||
```bash
|
||||
# 进入项目目录
|
||||
cd /Users/lingyuzeng/project/macro-lactone-sidechain-profiler/macro_split
|
||||
|
||||
# 激活pixi环境
|
||||
pixi shell
|
||||
|
||||
# 运行验证脚本
|
||||
python scripts/validate_macrolactone_db.py \
|
||||
--input data/MacrolactoneDB/ring12_20/temp.csv \
|
||||
--output validation_output \
|
||||
--sample-ratio 0.1
|
||||
|
||||
# 查看数据库
|
||||
sqlite3 validation_output/fragments.db ".tables"
|
||||
sqlite3 validation_output/fragments.db "SELECT * FROM parent_molecules LIMIT 5;"
|
||||
```
|
||||
|
||||
## 8. 人工检查清单
|
||||
|
||||
### 8.1 编号正确性检查
|
||||
- [ ] 位置1是否为内酯羰基碳(C=O)
|
||||
- [ ] 位置2是否为酯键氧(-O-)
|
||||
- [ ] 编号是否沿统一方向连续
|
||||
- [ ] 桥环/非标准环是否被正确跳过
|
||||
|
||||
### 8.2 裂解正确性检查
|
||||
- [ ] 位置1和2是否有侧链(应该没有,是内酯本身)
|
||||
- [ ] 位置3-N的侧链是否正确识别
|
||||
- [ ] dummy原子是否正确标记裂解位置
|
||||
- [ ] 键型是否保持(单键/双键)
|
||||
|
||||
### 8.3 分类准确性检查
|
||||
- [ ] 标准大环内酯是否被正确识别
|
||||
- [ ] 非标准大环是否被正确分类并跳过
|
||||
- [ ] 非大环内酯是否被拒绝
|
||||
|
||||
## 9. 后续拼接接口设计
|
||||
|
||||
```python
|
||||
def get_fragment_by_position(db_path: str, source_id: str, position: int) -> Optional[SideChainFragment]:
    """Fetch the fragment cut at ``position`` for a molecule, for later reassembly.

    Args:
        db_path: Path to the SQLite database file.
        source_id: ``ParentMolecule.source_id`` of the molecule (the CSV id,
            not the integer primary key).
        position: Ring position where the side chain was cleaved.

    Returns:
        The first matching fragment, or ``None`` when there is none.
    """
    engine = create_engine(f"sqlite:///{db_path}")
    try:
        with Session(engine) as session:
            # SideChainFragment.parent_id is the integer foreign key, so the
            # string source_id has to be matched through the parent table.
            statement = (
                select(SideChainFragment)
                .join(ParentMolecule, SideChainFragment.parent_id == ParentMolecule.id)
                .where(
                    ParentMolecule.source_id == source_id,
                    SideChainFragment.cleavage_position == position,
                )
            )
            return session.exec(statement).first()
    finally:
        # A new engine is created on every call; release its connections.
        engine.dispose()
|
||||
|
||||
def get_all_positions_for_parent(db_path: str, source_id: str) -> List[int]:
    """Return every cleavage position stored for a molecule.

    Args:
        db_path: Path to the SQLite database file.
        source_id: ``ParentMolecule.source_id`` of the molecule (the CSV id,
            not the integer primary key).

    Returns:
        The cleavage positions of all fragments of that molecule.
    """
    engine = create_engine(f"sqlite:///{db_path}")
    try:
        with Session(engine) as session:
            # SideChainFragment.parent_id is the integer foreign key, so the
            # string source_id has to be matched through the parent table.
            statement = (
                select(SideChainFragment.cleavage_position)
                .join(ParentMolecule, SideChainFragment.parent_id == ParentMolecule.id)
                .where(ParentMolecule.source_id == source_id)
            )
            # exec() on a single-column select yields the values themselves,
            # so the rows must not be indexed.
            return list(session.exec(statement).all())
    finally:
        # A new engine is created on every call; release its connections.
        engine.dispose()
|
||||
```
|
||||
1287
docs/plans/2026-03-19-macrolactone-validation-implementation-plan.md
Normal file
1287
docs/plans/2026-03-19-macrolactone-validation-implementation-plan.md
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user