From d18133ce160359adeb9a4dfe5fba86f01f9cd420 Mon Sep 17 00:00:00 2001 From: lingyuzeng Date: Thu, 19 Mar 2026 10:27:20 +0800 Subject: [PATCH] feat(validation): add SQLModel database models and fix relationships --- ...standard-macrocycle-classification-plan.md | 139 ++ PLAN-.md | 157 ++ ...26-03-19-macrolactone-validation-design.md | 475 ++++++ ...olactone-validation-implementation-plan.md | 1287 +++++++++++++++++ .../validation/database.py | 26 + .../validation/models.py | 20 +- 6 files changed, 2091 insertions(+), 13 deletions(-) create mode 100644 2026-03-18-standard-macrocycle-classification-plan.md create mode 100644 PLAN-.md create mode 100644 docs/plans/2026-03-19-macrolactone-validation-design.md create mode 100644 docs/plans/2026-03-19-macrolactone-validation-implementation-plan.md create mode 100644 src/macro_lactone_toolkit/validation/database.py diff --git a/2026-03-18-standard-macrocycle-classification-plan.md b/2026-03-18-standard-macrocycle-classification-plan.md new file mode 100644 index 0000000..4b1bcdd --- /dev/null +++ b/2026-03-18-standard-macrocycle-classification-plan.md @@ -0,0 +1,139 @@ +# Standard vs Non-Standard Macrocycle Classification + +## Summary + +Add a formal molecule-level classification layer on top of the current `macro_lactone_toolkit` detection logic so the toolkit can distinguish: + +- `standard_macrolactone` +- `non_standard_macrocycle` +- `not_macrolactone` + +This classification must support the two new rejection rules: + +1. After ring numbering is assigned, positions `3..N` must all be carbon atoms. If any atom at positions `3..N` is not carbon, classify the molecule as `non_standard_macrocycle`. +2. If multiple candidate macrolactone rings overlap in the same atom set graph, classify the molecule as `non_standard_macrocycle`. Use only overlapping candidate rings for this rule; disconnected or non-overlapping candidates do not trigger this specific rejection. + +Do not rely on a “largest ring” assumption. Base detection on RDKit ring candidates from `RingInfo.AtomRings()` plus explicit lactone validation, then apply the new standard/non-standard filters. + +## Public API And Output Changes + +Add a new result type, e.g. `MacrocycleClassificationResult`, with these fields: + +- `smiles: str` +- `classification: Literal["standard_macrolactone", "non_standard_macrocycle", "not_macrolactone"]` +- `ring_size: int | None` +- `primary_reason_code: str | None` +- `primary_reason_message: str | None` +- `all_reason_codes: list[str]` +- `all_reason_messages: list[str]` +- `candidate_ring_sizes: list[int]` + +Add a new public API on `MacroLactoneAnalyzer`: + +- `classify_macrocycle(mol_input: str | Chem.Mol, ring_size: int | None = None) -> MacrocycleClassificationResult` + +Behavior: + +- If `ring_size` is omitted, inspect all 12-20 membered lactone candidates. +- If `ring_size` is provided, restrict candidate selection to that size before classification. +- Invalid SMILES should keep raising the existing detection exception path; do not encode invalid input as a classification result. +- For `standard_macrolactone`, `ring_size` must be the accepted ring size and all reason fields must be empty. +- For `non_standard_macrocycle`, `ring_size` should be the candidate ring size if exactly one size remains relevant, otherwise `None`. +- For `not_macrolactone`, return no ring size and a reason describing why no valid 12-20 lactone candidate survived. + +Reason codes must be decision-complete and fixed: + +- `contains_non_carbon_ring_atoms_outside_positions_1_2` +- `multiple_overlapping_macrocycle_candidates` +- `no_lactone_ring_in_12_to_20_range` +- `requested_ring_size_not_found` + +Reason messages must be short English sentences: + +- `Ring positions 3..N contain non-carbon atoms.` +- `Overlapping macrolactone candidate rings were detected.` +- `No 12-20 membered lactone ring was detected.` +- `The requested ring size was not detected as a lactone ring.` + +Update CLI `macro-lactone-toolkit analyze` to return this classification result shape for single-SMILES mode and row-wise CSV mode. + +Do not add a new CLI subcommand. Keep `analyze` as the classification surface. + +## Implementation Changes + +### Detection And Candidate Grouping + +In the current core detection module: + +- Keep the existing lactone-ring candidate search based on `RingInfo.AtomRings()` and lactone atom validation. +- Add an overlap-group pass over candidate rings: + - Build a graph where two candidates are connected if their ring atom sets intersect. + - Compute connected components on this graph. + - If any connected component contains more than one candidate, classify as `non_standard_macrocycle` with `multiple_overlapping_macrocycle_candidates`. +- Do not treat disconnected candidate rings as overlapping. +- Keep `candidate_ring_sizes` as the sorted unique sizes from the filtered candidate list. + +### Standard Macrocycle Filter + +For any single candidate that survives overlap rejection: + +- Build numbering exactly as today: position 1 is the lactone carbonyl carbon, position 2 is the ring ester oxygen. +- Inspect positions `3..N`. +- Every atom at positions `3..N` must have atomic number 6. +- If any position `3..N` is not carbon, classify as `non_standard_macrocycle` with `contains_non_carbon_ring_atoms_outside_positions_1_2`. + +This rule must reject ring peptides and other heteroatom-containing macrocycles even if they contain a lactone bond. + +### Fragmenter Integration + +Update `MacrolactoneFragmenter` so that: + +- `number_molecule()` and `fragment_molecule()` first call `classify_macrocycle()`. +- They only proceed when classification is `standard_macrolactone`. +- For `non_standard_macrocycle` or `not_macrolactone`, raise the existing detection exception type with a message that includes the classification and the primary reason code. +- Do not change fragmentation output semantics for standard macrolactones. + +### Files To Change + +Concentrate changes in: + +- `src/macro_lactone_toolkit/_core.py` +- `src/macro_lactone_toolkit/analyzer.py` +- `src/macro_lactone_toolkit/cli.py` + +Add the new result type in the existing models module instead of inventing a second schema location. + +## Test Plan + +Add tests first, verify they fail, then implement. + +Required test cases: + +- Standard 12, 14, 16, and 20 membered macrolactones still classify as `standard_macrolactone` and return the correct `ring_size`. +- A macrocycle with a valid lactone bond but a non-carbon atom at position `3..N` classifies as `non_standard_macrocycle` with: + - `primary_reason_code == "contains_non_carbon_ring_atoms_outside_positions_1_2"` + - the expected English message +- An overlapping-candidate example classifies as `non_standard_macrocycle` with: + - `primary_reason_code == "multiple_overlapping_macrocycle_candidates"` + - the expected English message +- A non-lactone macrocycle classifies as `not_macrolactone` with `no_lactone_ring_in_12_to_20_range`. +- Explicit `ring_size` with no candidate of that size returns `not_macrolactone` with `requested_ring_size_not_found`. +- `macro-lactone-toolkit analyze --smiles ...` returns the new fields for: + - one standard example + - one heteroatom-rejected example + - one overlap-rejected example +- Existing numbering, fragmentation, labeled/plain dummy round-trip, and splicing tests remain green for standard macrolactones. + +Test fixture guidance: + +- Reuse the existing synthetic macrocycle helper for standard rings. +- Extend the helper or add a new fixture helper for: + - a lactone-containing ring with one non-carbon atom at a numbered position beyond 2 + - an overlapping-candidate ring example specifically built to share ring atoms between candidate rings + +## Assumptions And Defaults + +- Classification is molecule-level, but the overlap rejection only applies to overlapping candidate rings, not disconnected candidates elsewhere in the molecule. +- Invalid SMILES remain exceptions, not classification payloads. +- `analyze` becomes the official classification output; `get_valid_ring_sizes()` may remain as a lower-level helper. +- The implementation should stay aligned with RDKit ring APIs as candidate generators, not as the final definition of a standard macrolactone. diff --git a/PLAN-.md b/PLAN-.md new file mode 100644 index 0000000..e00faa5 --- /dev/null +++ b/PLAN-.md @@ -0,0 +1,157 @@ +# `macro_split` 重构为可安装包 `macro_lactone_toolkit` 的实施计划 + +## Summary + +采用“渐进式重构,不推倒重写”的方案:保留仓库里已经可用的 12-20 元环识别、通用编号、侧链 BFS 提取和 dummy 原子思路,删除或降级 16 元环专用路径,把核心能力统一收敛到一个正式可安装、可测试、可 CLI 调用的包。 + +本次重构完成后,项目要满足这几个结果: +- 可以作为正式 Python 包安装和导入,主包名为 `macro_lactone_toolkit` +- 默认自动识别 12-20 元有效大环内酯;也允许显式传 `ring_size` +- 支持环编号和侧链裂解 +- 裂解结果同时提供“带位置编号 dummy”与“普通 `*` dummy”两套 SMILES,且都能被 RDKit 读取保存 +- 用 `pixi` 管理环境和测试 +- 先执行 `git pull --ff-only` 同步远端,再开始任何代码改动 + +## Key Changes + +### 1. 仓库与包结构正规化 + +- 第一动作不是改代码,而是在仓库根目录执行: + - `git status --short` + - `git pull --ff-only` +- 将当前直接放在顶层 `src/` 下的运行时代码,改成标准包布局:`src/macro_lactone_toolkit/` +- 不保留 `src.*` 兼容层;README、测试、脚本、入口点全部切到新导入路径 +- `pyproject.toml` 改为标准 setuptools `src` layout 配置,修正: + - 包发现 + - 项目名与版本元数据 + - console scripts + - pytest 配置 +- `pixi.toml` 统一为 Python 3.12,并支持至少: + - `osx-arm64` + - `linux-64` +- `pixi.toml` 补齐测试依赖和常用任务: + - `test` + - `lint`(只做检查,不自动改写文件) + - `smoke-import` +- README 中所有 `from src...` 示例全部替换为 `from macro_lactone_toolkit...` + +### 2. 统一核心领域模型与 API + +以 `ring_visualization.py` 的通用逻辑和参考脚本的编号思路为基础,重构出清晰边界的核心模块,避免“16 元环专用”和“12-20 元环通用”并存。 + +公开 API 统一为: + +- `macro_lactone_toolkit.MacroLactoneAnalyzer` +- `macro_lactone_toolkit.MacrolactoneFragmenter` +- `macro_lactone_toolkit.RingNumberingResult` +- `macro_lactone_toolkit.SideChainFragment` +- `macro_lactone_toolkit.FragmentationResult` + +行为约定固定如下: + +- `MacroLactoneAnalyzer` + - 负责识别 12-20 元环 + - 验证环上是否存在有效内酯酯键 + - 返回所有命中的有效环尺寸 +- `MacrolactoneFragmenter` + - 默认自动识别有效环尺寸 + - 若检测到 0 个有效环,抛出明确异常 + - 若检测到多个有效环尺寸,默认抛出“歧义异常”,要求调用者显式传 `ring_size` + - 若调用者传了 `ring_size`,则只处理该环 +- `RingNumberingResult` + - 至少包含:`ring_size`、`ring_atoms`、`ordered_atoms`、`carbonyl_carbon_idx`、`ester_oxygen_idx`、`atom_to_position`、`position_to_atom` +- `SideChainFragment` + - 至少包含:`parent_id`、`cleavage_position`、`attachment_atom_idx`、`fragment_smiles_labeled`、`fragment_smiles_plain`、`atom_count`、`molecular_weight` +- `FragmentationResult` + - 至少包含:母分子信息、选定环尺寸、编号结果、所有碎片、错误/警告信息 + +异常策略固定为: + +- `MacrolactoneError` +- `MacrolactoneDetectionError` +- `AmbiguousMacrolactoneError` +- `RingNumberingError` +- `FragmentationError` + +### 3. 编号与裂解算法收敛 + +编号逻辑以“通用 12-20 元环”作为唯一正式实现,不再让旧的 16 元环 `ring_numbering.py` 充当主入口。 + +- 编号规则固定: + - 位置 1 = 环上的内酯羰基碳 + - 位置 2 = 环上的酯键氧 + - 位置 3-N = 沿确定方向依次编号剩余环原子 +- 编号实现使用统一入口,不能再出现批处理脚本内部偷偷调用 `[r16]` 专用函数的情况 +- 自动识别时,先找所有 12-20 元环,再验证该环是否为有效大环内酯 +- 侧链识别统一用“环原子邻居中不在环内的原子即侧链起点” +- 侧链提取统一用 BFS,只沿非环原子扩展 +- dummy 原子输出固定两套: + - `fragment_smiles_labeled`:带位置编号 dummy,例如 `[5*]` + - `fragment_smiles_plain`:普通 `*` +- dummy 与连接原子的键型必须保留原始键型,不能强制都变成单键 +- 所有裂解结果必须通过 RDKit round-trip: + - `Chem.MolFromSmiles(...)` 成功 + - `Chem.MolToSmiles(...)` 可再次序列化 + +### 4. CLI、批处理与现有脚本归并 + +交付范围包含正式 CLI,不再依赖散乱脚本作为主要入口。 + +CLI 设计固定为 3 个命令: + +- `macro-lactone-toolkit analyze` + - 输入单个 SMILES 或 CSV + - 输出有效环尺寸、是否歧义、选中的环尺寸 +- `macro-lactone-toolkit number` + - 输入单个 SMILES + - 输出编号结果 JSON +- `macro-lactone-toolkit fragment` + - 输入单个 SMILES 或 CSV + - 输出碎片 JSON/CSV,包含 labeled/plain 两套 SMILES + +批处理行为固定为: + +- CSV 默认读取 `smiles` 列 +- 可选 `id` 列;未提供则自动生成 +- 自动识别歧义分子时: + - 默认跳过并记录错误 + - 显式传 `--ring-size` 时按指定环处理 +- 旧 `scripts/` 中保留价值的逻辑迁入正式 CLI;剩余脚本改成薄封装或标记为 legacy,不再承担核心实现 + +## Test Plan + +必须用 `pixi` 跑完整验证,至少覆盖下面这些场景: + +- 包安装与导入 + - `pixi run python -c "import macro_lactone_toolkit"` + - `pixi run pytest` +- 环识别 + - 单一有效环尺寸的 12、14、16、20 元环样例 + - 无效分子 + - 无内酯键分子 + - 多有效环尺寸分子,确认抛出 `AmbiguousMacrolactoneError` +- 编号 + - 位置 1 必须是羰基碳 + - 位置 2 必须是酯键氧 + - 编号范围必须连续覆盖 `1..N` + - 同一分子多次运行结果一致 +- 裂解 + - 无侧链位置返回空列表 + - 有侧链位置返回碎片 + - `fragment_smiles_labeled` 与 `fragment_smiles_plain` 都能被 RDKit 成功读取 + - labeled dummy 保留位置号 + - dummy 键型与原键型一致 +- CLI + - `analyze`、`number`、`fragment` 的基本 smoke test + - CSV 批处理 + - 歧义分子跳过与错误日志行为 +- 回归 + - 现有 splicing engine 若继续保留,至少保留一个 dummy 拼接回组装测试,确保 labeled dummy 方案可持续扩展 + +## Assumptions + +- `git pull` 使用 `--ff-only`,避免在重构开始前引入不透明 merge commit +- 本次不保留 `src.*` 导入兼容层,全面切换到 `macro_lactone_toolkit` +- 本次把“正式库 API + 正式 CLI + pixi 测试”作为主交付;notebooks 只做参考,不作为正式接口 +- 发现多个有效大环尺寸时,默认视为歧义并失败,不自动猜测 +- 继续使用 RDKit 作为唯一结构解析与序列化基础 diff --git a/docs/plans/2026-03-19-macrolactone-validation-design.md b/docs/plans/2026-03-19-macrolactone-validation-design.md new file mode 100644 index 0000000..786004c --- /dev/null +++ b/docs/plans/2026-03-19-macrolactone-validation-design.md @@ -0,0 +1,475 @@ +# MacrolactoneDB 12-20元环验证方案设计 + +## 1. 验证目标与范围 + +### 1.1 目标 +- 验证 `macro_lactone_toolkit` 对 MacrolactoneDB 12-20元环大环内酯的识别准确性 +- 验证环编号正确性(位置1=羰基碳,位置2=酯键氧) +- 验证侧链断裂功能(仅针对标准大环内酯) +- 使用同位素标记技术标记裂解位置,便于后续拼接 + +### 1.2 范围 +- **数据来源**: `/data/MacrolactoneDB/ring12_20/temp.csv` (11,037分子) +- **抽样策略**: 分层随机抽样10% (~1,100分子),按环大小12-20均匀分布 +- **处理对象**: 仅处理 `classification="standard_macrolactone"` 的分子 +- **输出**: 可视化图片 + SQLite数据库 + 汇总统计 + +## 2. 数据库设计 (SQLModel) + +### 2.1 模型定义 + +```python +from typing import Optional, List +from sqlmodel import SQLModel, Field, Relationship +from datetime import datetime +from enum import Enum + +class ClassificationType(str, Enum): + STANDARD = "standard_macrolactone" + NON_STANDARD = "non_standard_macrocycle" + NOT_MACROLACTONE = "not_macrolactone" + +class ProcessingStatus(str, Enum): + PENDING = "pending" + SUCCESS = "success" + FAILED = "failed" + SKIPPED = "skipped" + +# ==================== 主分子表 ==================== +class ParentMolecule(SQLModel, table=True): + """原始分子信息表""" + __tablename__ = "parent_molecules" + + id: Optional[int] = Field(default=None, primary_key=True) + + # 原始数据 + source_id: str = Field(index=True) # 来自CSV的IDs字段 + molecule_name: Optional[str] = None # molecule_pref_name + smiles: str = Field(index=True) + + # 分类结果 + classification: ClassificationType = Field(index=True) + ring_size: Optional[int] = Field(default=None, index=True) + primary_reason_code: Optional[str] = None + primary_reason_message: Optional[str] = None + + # 处理状态 + processing_status: ProcessingStatus = Field(default=ProcessingStatus.PENDING) + error_message: Optional[str] = None + + # 元数据 + created_at: datetime = Field(default_factory=datetime.utcnow) + processed_at: Optional[datetime] = None + + # 关系 + fragments: List["SideChainFragment"] = Relationship(back_populates="parent") + numbering: Optional["RingNumbering"] = Relationship(back_populates="parent") + +# ==================== 环编号表 ==================== +class RingNumbering(SQLModel, table=True): + """环编号详细信息""" + __tablename__ = "ring_numberings" + + id: Optional[int] = Field(default=None, primary_key=True) + parent_id: int = Field(foreign_key="parent_molecules.id", unique=True) + + # 环基本信息 + ring_size: int + carbonyl_carbon_idx: int # 位置1 + ester_oxygen_idx: int # 位置2 + + # 原子映射 (JSON存储) + position_to_atom: str # JSON: {"1": 5, "2": 10, ...} + atom_to_position: str # JSON: {"5": 1, "10": 2, ...} + + # 关系 + parent: Optional[ParentMolecule] = Relationship(back_populates="numbering") + +# ==================== 侧链片段表 ==================== +class SideChainFragment(SQLModel, table=True): + """裂解产生的侧链片段""" + __tablename__ = "side_chain_fragments" + + id: Optional[int] = Field(default=None, primary_key=True) + parent_id: int = Field(foreign_key="parent_molecules.id", index=True) + + # 片段标识 + fragment_id: str = Field(index=True) # {source_id}_frag_{n} + + # 裂解位置信息 + cleavage_position: int = Field(index=True) # 环上的断裂位置 + attachment_atom_idx: int # 母环上的连接原子索引 + attachment_atom_symbol: str # C, O, N等 + + # SMILES (关键:同位素标记用于标识裂解位置) + fragment_smiles_labeled: str # 带同位素标记,如 [5*]CCO + fragment_smiles_plain: str # 无标记,如 *CCO + + # 同位素标记值 (用于后续拼接) + dummy_isotope: int # 裂解位置的编号,用于重建连接关系 + + # 物理化学性质 + atom_count: int + heavy_atom_count: int + molecular_weight: float + + # 连接信息 (用于后续拼接) + original_bond_type: str # SINGLE, DOUBLE, AROMATIC等 + + # 关系 + parent: Optional[ParentMolecule] = Relationship(back_populates="fragments") + +# ==================== 验证结果表 ==================== +class ValidationResult(SQLModel, table=True): + """人工验证结果记录""" + __tablename__ = "validation_results" + + id: Optional[int] = Field(default=None, primary_key=True) + parent_id: int = Field(foreign_key="parent_molecules.id") + + # 验证字段 + numbering_correct: Optional[bool] = None # 编号是否正确 + cleavage_correct: Optional[bool] = None # 裂解位置是否正确 + classification_correct: Optional[bool] = None # 分类是否正确 + + # 备注 + notes: Optional[str] = None + validated_by: Optional[str] = None + validated_at: Optional[datetime] = None +``` + +### 2.2 数据库初始化 + +```python +from sqlmodel import create_engine, Session, SQLModel +from contextlib import contextmanager + +# SQLite文件数据库 +DATABASE_URL = "sqlite:///./validation_output/fragments.db" + +engine = create_engine(DATABASE_URL, echo=False) + +@contextmanager +def get_session(): + with Session(engine) as session: + yield session + +def init_database(): + """创建所有表""" + SQLModel.metadata.create_all(engine) +``` + +## 3. 同位素标记方案 (借鉴Molassembler) + +### 3.1 标记策略 + +```python +def build_fragment_smiles_with_isotope( + mol: Chem.Mol, + side_chain_atoms: list[int], + side_chain_start_idx: int, + ring_atom_idx: int, + cleavage_position: int, # 使用位置编号作为同位素值 +) -> str: + """ + 构建带同位素标记的片段SMILES + + 关键:用cleavage_position作为dummy原子的同位素值 + 这样后续拼接时能精确知道片段来自哪个位置 + """ + # 创建可编辑分子 + emol = Chem.EditableMol(Chem.Mol(mol)) + + # 添加dummy原子替代连接点 + dummy_atom = Chem.Atom(0) # 原子序数0 = dummy + dummy_atom.SetIsotope(cleavage_position) # 【关键】用位置编号标记 + dummy_idx = emol.AddAtom(dummy_atom) + + # 获取原始键类型 + bond = mol.GetBondBetweenAtoms(ring_atom_idx, side_chain_start_idx) + bond_type = bond.GetBondType() + + # 添加dummy原子与侧链起始原子的键 + emol.AddBond(dummy_idx, side_chain_start_idx, bond_type) + + # 只保留dummy原子和侧链原子 + atoms_to_keep = set([dummy_idx] + list(side_chain_atoms)) + + # 标记要删除的原子 + for atom_idx in range(mol.GetNumAtoms()): + if atom_idx not in atoms_to_keep: + emol.RemoveAtom(atom_idx) + + fragment = emol.GetMol() + Chem.SanitizeMol(fragment) + + return Chem.MolToSmiles(fragment) +``` + +### 3.2 标记示例 + +| 裂解位置 | 原始结构 | 标记后SMILES | 说明 | +|---------|---------|-------------|------| +| 5 | `...C5(CCO)...` | `[5*]CCO` | dummy同位素=5,表示位置5的侧链 | +| 12 | `...C12(CCCO)...` | `[12*]CCCO` | dummy同位素=12,表示位置12的侧链 | + +### 3.3 拼接时使用标记 + +```python +# 后续拼接时,可以通过同位素值找到对应的裂解位置 +def find_attachment_position(fragment_smiles: str) -> int: + """从片段SMILES中提取裂解位置""" + mol = Chem.MolFromSmiles(fragment_smiles) + for atom in mol.GetAtoms(): + if atom.GetAtomicNum() == 0 and atom.GetIsotope() > 0: + return atom.GetIsotope() # 返回位置编号 + return 0 +``` + +## 4. 输出目录结构 + +``` +validation_output/ +├── README.md # 目录结构说明 +├── fragments.db # SQLite数据库 +├── summary.csv # 主汇总表 (所有分子) +├── summary_statistics.json # 统计信息 +│ +├── ring_size_12/ # 12元环 +├── ring_size_13/ # 13元环 +├── ring_size_14/ # 14元环 +├── ring_size_15/ # 15元环 +├── ring_size_16/ # 16元环 +├── ring_size_17/ # 17元环 +├── ring_size_18/ # 18元环 +├── ring_size_19/ # 19元环 +└── ring_size_20/ # 20元环 + │ + ├── molecules.csv # 该环大小的所有分子 + │ + ├── standard/ # 标准大环内酯 + │ ├── numbered/ # 带编号的高亮环图 + │ │ ├── {source_id}_numbered.png + │ │ └── ... + │ │ + │ └── sidechains/ # 侧链片段图 + │ └── {source_id}/ + │ ├── {source_id}_frag_0_pos{pos}.png # 位置信息在文件名 + │ ├── {source_id}_frag_1_pos{pos}.png + │ └── ... + │ + ├── non_standard/ # 非标准大环 + │ └── original/ + │ ├── {source_id}_original.png + │ └── ... + │ + └── rejected/ # 被拒绝的分子 + └── original/ + ├── {source_id}_original.png + └── ... +``` + +## 5. CSV字段设计 + +### 5.1 summary.csv + +| 字段名 | 类型 | 说明 | +|--------|------|------| +| `id` | int | 数据库主键 | +| `source_id` | str | 原始IDs字段 | +| `molecule_name` | str | 分子名称 | +| `smiles` | str | 原始SMILES | +| `classification` | str | standard/non_standard/not_macrolactone | +| `ring_size` | int | 检测到的环大小 | +| `primary_reason_code` | str | 分类原因代码 | +| `primary_reason_message` | str | 分类原因描述 | +| `processing_status` | str | pending/success/failed/skipped | +| `error_message` | str | 错误信息 | +| `num_sidechains` | int | 侧链数量 | +| `cleavage_positions` | str | JSON数组 [5, 8, 12] | +| `numbered_image_path` | str | 编号图相对路径 | +| `processed_at` | datetime | 处理时间 | + +### 5.2 fragments.csv (每个分子的侧链详情) + +| 字段名 | 类型 | 说明 | +|--------|------|------| +| `fragment_id` | str | 唯一标识 | +| `source_id` | str | 母分子ID | +| `cleavage_position` | int | 环上断裂位置 | +| `attachment_atom_idx` | int | 连接原子索引 | +| `attachment_atom_symbol` | str | 连接原子类型 | +| `fragment_smiles_labeled` | str | 带同位素标记的SMILES | +| `fragment_smiles_plain` | str | 无标记SMILES | +| `dummy_isotope` | int | 标记值=裂解位置 | +| `atom_count` | int | 原子数 | +| `molecular_weight` | float | 分子量 | +| `original_bond_type` | str | 原始键类型 | +| `image_path` | str | 片段图路径 | + +## 6. 验证脚本架构 + +```python +#!/usr/bin/env python3 +""" +MacrolactoneDB 12-20元环验证脚本 +使用: pixi run python scripts/validate_macrolactone_db.py +""" + +import pandas as pd +from sqlmodel import Session +from macro_lactone_toolkit import MacroLactoneAnalyzer, MacrolactoneFragmenter +from macro_lactone_toolkit.visualization import save_numbered_molecule_png + +class MacrolactoneValidator: + def __init__(self, sample_ratio=0.1, output_dir="validation_output"): + self.analyzer = MacroLactoneAnalyzer() + self.fragmenter = MacrolactoneFragmenter() + self.sample_ratio = sample_ratio + self.output_dir = Path(output_dir) + + def run(self, input_csv: str): + # 1. 加载数据 + df = pd.read_csv(input_csv) + + # 2. 分层抽样 + sampled = self._stratified_sample(df) + + # 3. 初始化数据库 + init_database() + + # 4. 处理每个分子 + for _, row in sampled.iterrows(): + self._process_molecule(row) + + # 5. 生成汇总 + self._generate_summary() + + def _stratified_sample(self, df: pd.DataFrame) -> pd.DataFrame: + """按环大小分层抽样""" + # 先分类所有分子 + df['classification'] = df['smiles'].apply( + lambda s: self.analyzer.classify_macrocycle(s).classification + ) + df['ring_size'] = df['smiles'].apply( + lambda s: self.analyzer.classify_macrocycle(s).ring_size + ) + + # 按ring_size分层,每层抽10% + sampled = df.groupby('ring_size').apply( + lambda x: x.sample(frac=self.sample_ratio, random_state=42) + ).reset_index(drop=True) + + return sampled + + def _process_molecule(self, row: pd.Series): + """处理单个分子""" + source_id = row['IDs'] + smiles = row['smiles'] + classification = row['classification'] + ring_size = row['ring_size'] + + # 保存到ParentMolecule表 + parent = ParentMolecule( + source_id=source_id, + molecule_name=row.get('molecule_pref_name'), + smiles=smiles, + classification=classification, + ring_size=ring_size, + ) + + if classification != ClassificationType.STANDARD: + parent.processing_status = ProcessingStatus.SKIPPED + self._save_image(smiles, ring_size, source_id, classification) + return + + # 处理标准大环内酯 + try: + # 环编号 + numbering = self.fragmenter.number_molecule(smiles) + self._save_numbering_to_db(parent.id, numbering) + + # 生成编号图 + self._save_numbered_image(smiles, ring_size, source_id) + + # 侧链断裂 + result = self.fragmenter.fragment_molecule(smiles, parent_id=source_id) + self._save_fragments_to_db(parent.id, result) + self._save_fragment_images(result, source_id) + + parent.processing_status = ProcessingStatus.SUCCESS + parent.num_sidechains = len(result.fragments) + parent.cleavage_positions = json.dumps([f.cleavage_position for f in result.fragments]) + + except Exception as e: + parent.processing_status = ProcessingStatus.FAILED + parent.error_message = str(e) + + parent.processed_at = datetime.utcnow() + + with get_session() as session: + session.add(parent) + session.commit() +``` + +## 7. 执行命令 + +```bash +# 进入项目目录 +cd /Users/lingyuzeng/project/macro-lactone-sidechain-profiler/macro_split + +# 激活pixi环境 +pixi shell + +# 运行验证脚本 +python scripts/validate_macrolactone_db.py \ + --input data/MacrolactoneDB/ring12_20/temp.csv \ + --output validation_output \ + --sample-ratio 0.1 + +# 查看数据库 +sqlite3 validation_output/fragments.db ".tables" +sqlite3 validation_output/fragments.db "SELECT * FROM parent_molecules LIMIT 5;" +``` + +## 8. 人工检查清单 + +### 8.1 编号正确性检查 +- [ ] 位置1是否为内酯羰基碳(C=O) +- [ ] 位置2是否为酯键氧(-O-) +- [ ] 编号是否沿统一方向连续 +- [ ] 桥环/非标准环是否被正确跳过 + +### 8.2 裂解正确性检查 +- [ ] 位置1和2是否有侧链(应该没有,是内酯本身) +- [ ] 位置3-N的侧链是否正确识别 +- [ ] dummy原子是否正确标记裂解位置 +- [ ] 键型是否保持(单键/双键) + +### 8.3 分类准确性检查 +- [ ] 标准大环内酯是否被正确识别 +- [ ] 非标准大环是否被正确分类并跳过 +- [ ] 非大环内酯是否被拒绝 + +## 9. 后续拼接接口设计 + +```python +def get_fragment_by_position(db_path: str, source_id: str, position: int) -> Optional[SideChainFragment]: + """通过位置获取片段,用于后续拼接""" + engine = create_engine(f"sqlite:///{db_path}") + with Session(engine) as session: + statement = select(SideChainFragment).where( + SideChainFragment.parent_id == source_id, + SideChainFragment.cleavage_position == position + ) + return session.exec(statement).first() + +def get_all_positions_for_parent(db_path: str, source_id: str) -> List[int]: + """获取某分子的所有裂解位置""" + engine = create_engine(f"sqlite:///{db_path}") + with Session(engine) as session: + statement = select(SideChainFragment.cleavage_position).where( + SideChainFragment.parent_id == source_id + ) + return [r[0] for r in session.exec(statement).all()] +``` diff --git a/docs/plans/2026-03-19-macrolactone-validation-implementation-plan.md b/docs/plans/2026-03-19-macrolactone-validation-implementation-plan.md new file mode 100644 index 0000000..bc002e5 --- /dev/null +++ b/docs/plans/2026-03-19-macrolactone-validation-implementation-plan.md @@ -0,0 +1,1287 @@ +# MacrolactoneDB Validation Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Create a validation script that samples 10% of MacrolactoneDB 12-20 membered rings, classifies them, fragments side chains with isotope tagging, and stores results in SQLite with visualizations. + +**Architecture:** SQLModel ORM for database, RDKit for chemistry, PIL for visualization. Core is a `MacrolactoneValidator` class that orchestrates sampling, processing, and output generation. + +**Tech Stack:** Python 3.12, SQLModel, RDKit, Pandas, Pixi (environment) + +**Reference Design:** See `docs/plans/2026-03-19-macrolactone-validation-design.md` for full design details. + +--- + +## Prerequisites + +**Worktree:** This plan should be executed in a dedicated worktree. + +**Design Doc:** Read `docs/plans/2026-03-19-macrolactone-validation-design.md` before starting. + +--- + +## Task 1: Create Database Models (SQLModel) + +**Files:** +- Create: `src/macro_lactone_toolkit/validation/__init__.py` +- Create: `src/macro_lactone_toolkit/validation/models.py` + +**Context:** These models implement the schema from the design doc. `SideChainFragment` uses `dummy_isotope` field to store the cleavage position for reconstruction. + +**Step 1: Create directory and __init__.py** + +Run: +```bash +mkdir -p src/macro_lactone_toolkit/validation +touch src/macro_lactone_toolkit/validation/__init__.py +``` + +**Step 2: Write database models** + +Create `src/macro_lactone_toolkit/validation/models.py`: + +```python +from __future__ import annotations + +from datetime import datetime +from enum import Enum +from typing import List, Optional + +from sqlmodel import Field, Relationship, SQLModel + + +class ClassificationType(str, Enum): + STANDARD = "standard_macrolactone" + NON_STANDARD = "non_standard_macrocycle" + NOT_MACROLACTONE = "not_macrolactone" + + +class ProcessingStatus(str, Enum): + PENDING = "pending" + SUCCESS = "success" + FAILED = "failed" + SKIPPED = "skipped" + + +class ParentMolecule(SQLModel, table=True): + """Original molecule information.""" + + __tablename__ = "parent_molecules" + + id: Optional[int] = Field(default=None, primary_key=True) + source_id: str = Field(index=True) + molecule_name: Optional[str] = None + smiles: str = Field(index=True) + classification: ClassificationType = Field(index=True) + ring_size: Optional[int] = Field(default=None, index=True) + primary_reason_code: Optional[str] = None + primary_reason_message: Optional[str] = None + processing_status: ProcessingStatus = Field(default=ProcessingStatus.PENDING) + error_message: Optional[str] = None + num_sidechains: Optional[int] = None + cleavage_positions: Optional[str] = None + numbered_image_path: Optional[str] = None + created_at: datetime = Field(default_factory=datetime.utcnow) + processed_at: Optional[datetime] = None + + fragments: List["SideChainFragment"] = Relationship(back_populates="parent") + numbering: Optional["RingNumbering"] = Relationship(back_populates="parent") + + +class RingNumbering(SQLModel, table=True): + """Ring numbering details.""" + + __tablename__ = "ring_numberings" + + id: Optional[int] = Field(default=None, primary_key=True) + parent_id: int = Field(foreign_key="parent_molecules.id", unique=True) + ring_size: int + carbonyl_carbon_idx: int + ester_oxygen_idx: int + position_to_atom: str + atom_to_position: str + + parent: Optional[ParentMolecule] = Relationship(back_populates="numbering") + + +class SideChainFragment(SQLModel, table=True): + """Side chain fragments from cleavage.""" + + __tablename__ = "side_chain_fragments" + + id: Optional[int] = Field(default=None, primary_key=True) + parent_id: int = Field(foreign_key="parent_molecules.id", index=True) + fragment_id: str = Field(index=True) + cleavage_position: int = Field(index=True) + attachment_atom_idx: int + attachment_atom_symbol: str + fragment_smiles_labeled: str + fragment_smiles_plain: str + dummy_isotope: int + atom_count: int + heavy_atom_count: int + molecular_weight: float + original_bond_type: str + image_path: Optional[str] = None + + parent: Optional[ParentMolecule] = Relationship(back_populates="fragments") + + +class ValidationResult(SQLModel, table=True): + """Manual validation records.""" + + __tablename__ = "validation_results" + + id: Optional[int] = Field(default=None, primary_key=True) + parent_id: int = Field(foreign_key="parent_molecules.id") + numbering_correct: Optional[bool] = None + cleavage_correct: Optional[bool] = None + classification_correct: Optional[bool] = None + notes: Optional[str] = None + validated_by: Optional[str] = None + validated_at: Optional[datetime] = None +``` + +**Step 3: Verify SQLModel imports work** + +Run: +```bash +pixi run python -c "from macro_lactone_toolkit.validation.models import ParentMolecule; print('Models OK')" +``` + +Expected: `Models OK` + +**Step 4: Commit** + +```bash +git add src/macro_lactone_toolkit/validation/ +git commit -m "feat(validation): add SQLModel database models" +``` + +--- + +## Task 2: Create Database Connection Module + +**Files:** +- Create: `src/macro_lactone_toolkit/validation/database.py` + +**Context:** Provides SQLite engine, session context manager, and init function. + +**Step 1: Write database module** + +Create `src/macro_lactone_toolkit/validation/database.py`: + +```python +from __future__ import annotations + +from contextlib import contextmanager +from pathlib import Path + +from sqlmodel import Session, SQLModel, create_engine + + +def get_engine(db_path: str | Path): + """Create SQLite engine.""" + db_path = Path(db_path) + db_path.parent.mkdir(parents=True, exist_ok=True) + url = f"sqlite:///{db_path}" + return create_engine(url, echo=False) + + +@contextmanager +def get_session(engine): + """Context manager for database sessions.""" + with Session(engine) as session: + yield session + + +def init_database(engine): + """Create all tables.""" + SQLModel.metadata.create_all(engine) +``` + +**Step 2: Test database initialization** + +Create test script `test_db.py`: +```python +from pathlib import Path +from macro_lactone_toolkit.validation.database import get_engine, init_database, get_session +from macro_lactone_toolkit.validation.models import ParentMolecule, ClassificationType + +test_db = Path("/tmp/test_fragments.db") +if test_db.exists(): + test_db.unlink() + +engine = get_engine(test_db) +init_database(engine) + +with get_session(engine) as session: + parent = ParentMolecule( + source_id="TEST001", + smiles="O=C1CCCCCCCCCCCCCCO1", + classification=ClassificationType.STANDARD, + ring_size=16, + ) + session.add(parent) + session.commit() + print(f"Inserted parent with id: {parent.id}") + +print("Database test passed!") +``` + +Run: +```bash +pixi run python test_db.py +``` + +Expected: +``` +Inserted parent with id: 1 +Database test passed! +``` + +**Step 3: Cleanup and commit** + +Run: +```bash +rm test_db.py /tmp/test_fragments.db +git add src/macro_lactone_toolkit/validation/database.py +git commit -m "feat(validation): add database connection module" +``` + +--- + +## Task 3: Create Isotope Tagging Utilities + +**Files:** +- Create: `src/macro_lactone_toolkit/validation/isotope_utils.py` + +**Context:** Implements isotope tagging inspired by Molassembler. Uses cleavage position as isotope value. + +**Step 1: Write isotope utilities** + +Create `src/macro_lactone_toolkit/validation/isotope_utils.py`: + +```python +from __future__ import annotations + +from rdkit import Chem + + +def build_fragment_with_isotope( + mol: Chem.Mol, + side_chain_atoms: list[int], + side_chain_start_idx: int, + ring_atom_idx: int, + cleavage_position: int, +) -> tuple[str, str, str]: + """ + Build fragment SMILES with isotope tagging. + + Returns: + Tuple of (labeled_smiles, plain_smiles, bond_type) + """ + # Get original bond type + bond = mol.GetBondBetweenAtoms(ring_atom_idx, side_chain_start_idx) + bond_type = bond.GetBondType().name if bond else "SINGLE" + + # Create editable molecule + emol = Chem.EditableMol(Chem.Mol(mol)) + + # Add dummy atom with isotope = cleavage position + dummy_atom = Chem.Atom(0) + dummy_atom.SetIsotope(cleavage_position) + dummy_idx = emol.AddAtom(dummy_atom) + + # Add bond between dummy and side chain start + emol.AddBond(dummy_idx, side_chain_start_idx, bond.GetBondType()) + + # Remove ring atom to side chain bond (will be reconnected via dummy) + # Actually, we keep the side chain atoms and dummy, remove everything else + + # Determine atoms to keep + atoms_to_keep = set([dummy_idx, side_chain_start_idx] + list(side_chain_atoms)) + + # Remove atoms not in keep list + # Need to remove in reverse order to maintain valid indices + all_atoms = list(range(mol.GetNumAtoms())) + atoms_to_remove = [i for i in all_atoms if i not in atoms_to_keep] + + for atom_idx in sorted(atoms_to_remove, reverse=True): + emol.RemoveAtom(atom_idx) + + fragment = emol.GetMol() + Chem.SanitizeMol(fragment) + + # Get labeled SMILES (with isotope) + labeled_smiles = Chem.MolToSmiles(fragment) + + # Get plain SMILES (without isotope) + plain_fragment = Chem.Mol(fragment) + for atom in plain_fragment.GetAtoms(): + if atom.GetIsotope() > 0: + atom.SetIsotope(0) + plain_smiles = Chem.MolToSmiles(plain_fragment) + + return labeled_smiles, plain_smiles, bond_type + + +def extract_isotope_position(fragment_smiles: str) -> int: + """Extract cleavage position from fragment SMILES.""" + mol = Chem.MolFromSmiles(fragment_smiles) + if mol is None: + return 0 + + for atom in mol.GetAtoms(): + if atom.GetAtomicNum() == 0 and atom.GetIsotope() > 0: + return atom.GetIsotope() + return 0 +``` + +**Step 2: Write test** + +Create `tests/validation/test_isotope_utils.py`: + +```python +import pytest +from rdkit import Chem + +from macro_lactone_toolkit.validation.isotope_utils import ( + build_fragment_with_isotope, + extract_isotope_position, +) + + +def test_build_fragment_with_isotope(): + # Create a simple test molecule: ethyl group attached to position 5 + mol = Chem.MolFromSmiles("CCCC(CC)CCC") # Position 4 (0-indexed) has ethyl + assert mol is not None + + side_chain_atoms = [4, 5] # The ethyl group atoms + side_chain_start = 4 + ring_atom = 3 + cleavage_pos = 5 + + labeled, plain, bond_type = build_fragment_with_isotope( + mol, side_chain_atoms, side_chain_start, ring_atom, cleavage_pos + ) + + assert labeled is not None + assert plain is not None + assert bond_type == "SINGLE" + + # Check isotope was set + extracted_pos = extract_isotope_position(labeled) + assert extracted_pos == cleavage_pos + + # Plain should have no isotope + extracted_plain = extract_isotope_position(plain) + assert extracted_plain == 0 +``` + +**Step 3: Run test** + +Run: +```bash +pixi run pytest tests/validation/test_isotope_utils.py -v +``` + +Expected: `test_build_fragment_with_isotope PASSED` + +**Step 4: Commit** + +```bash +git add src/macro_lactone_toolkit/validation/isotope_utils.py tests/validation/test_isotope_utils.py +git commit -m "feat(validation): add isotope tagging utilities" +``` + +--- + +## Task 4: Create Stratified Sampling Module + +**Files:** +- Create: `src/macro_lactone_toolkit/validation/sampling.py` + +**Context:** Implements 10% stratified sampling by ring size (12-20). + +**Step 1: Write sampling module** + +Create `src/macro_lactone_toolkit/validation/sampling.py`: + +```python +from __future__ import annotations + +import pandas as pd + +from macro_lactone_toolkit import MacroLactoneAnalyzer + + +def stratified_sample_by_ring_size( + df: pd.DataFrame, + sample_ratio: float, + smiles_col: str = "smiles", + random_state: int = 42, +) -> pd.DataFrame: + """ + Perform stratified sampling by ring size. + + First classifies all molecules, then samples 10% from each ring size layer. + """ + analyzer = MacroLactoneAnalyzer() + + # Classify all molecules + classifications = [] + ring_sizes = [] + + for smiles in df[smiles_col]: + result = analyzer.classify_macrocycle(smiles) + classifications.append(result.classification) + ring_sizes.append(result.ring_size) + + df = df.copy() + df["_classification"] = classifications + df["_ring_size"] = ring_sizes + + # Group by ring size and sample from each group + sampled_groups = [] + + for ring_size in range(12, 21): + group = df[df["_ring_size"] == ring_size] + if len(group) > 0: + n_samples = max(1, int(len(group) * sample_ratio)) + sampled = group.sample(n=min(n_samples, len(group)), random_state=random_state) + sampled_groups.append(sampled) + + # Also sample from unknown ring size (None) + unknown_group = df[df["_ring_size"].isna()] + if len(unknown_group) > 0: + n_samples = max(1, int(len(unknown_group) * sample_ratio)) + sampled = unknown_group.sample(n=min(n_samples, len(unknown_group)), random_state=random_state) + sampled_groups.append(sampled) + + if not sampled_groups: + return pd.DataFrame() + + result = pd.concat(sampled_groups, ignore_index=True) + return result +``` + +**Step 2: Create test** + +Create `tests/validation/test_sampling.py`: + +```python +import pandas as pd +import pytest + +from macro_lactone_toolkit.validation.sampling import stratified_sample_by_ring_size + + +def test_stratified_sample(): + # Create test data with known ring sizes + data = { + "smiles": [ + "O=C1CCCCCCCCCCCCCCO1", # 16-membered + "O=C1CCCCCCCCCCCCO1", # 14-membered + "O=C1CCCCCCCCCCCCCCCCO1", # 18-membered + ], + "id": ["A", "B", "C"], + } + df = pd.DataFrame(data) + + sampled = stratified_sample_by_ring_size(df, sample_ratio=0.5, random_state=42) + + # Should get at least 1 from each ring size (50% of 1 = 1) + assert len(sampled) >= 1 + assert len(sampled) <= 3 +``` + +**Step 3: Run test** + +Run: +```bash +pixi run pytest tests/validation/test_sampling.py -v +``` + +Expected: `test_stratified_sample PASSED` + +**Step 4: Commit** + +```bash +git add src/macro_lactone_toolkit/validation/sampling.py tests/validation/test_sampling.py +git commit -m "feat(validation): add stratified sampling by ring size" +``` + +--- + +## Task 5: Create Visualization Output Module + +**Files:** +- Create: `src/macro_lactone_toolkit/validation/visualization_output.py` + +**Context:** Handles saving numbered molecule images and fragment images to organized directory structure. + +**Step 1: Write visualization output module** + +Create `src/macro_lactone_toolkit/validation/visualization_output.py`: + +```python +from __future__ import annotations + +from pathlib import Path + +from rdkit import Chem + +from macro_lactone_toolkit.visualization import save_numbered_molecule_png, save_fragment_png + + +def get_output_paths(output_dir: Path, source_id: str, ring_size: int, classification: str) -> dict: + """Get organized output paths for a molecule.""" + ring_dir = output_dir / f"ring_size_{ring_size}" + + if classification == "standard_macrolactone": + base_dir = ring_dir / "standard" + numbered_dir = base_dir / "numbered" + sidechains_dir = base_dir / "sidechains" / source_id + elif classification == "non_standard_macrocycle": + base_dir = ring_dir / "non_standard" / "original" + numbered_dir = base_dir + sidechains_dir = None + else: + base_dir = ring_dir / "rejected" / "original" + numbered_dir = base_dir + sidechains_dir = None + + numbered_dir.mkdir(parents=True, exist_ok=True) + if sidechains_dir: + sidechains_dir.mkdir(parents=True, exist_ok=True) + + return { + "numbered_image": numbered_dir / f"{source_id}_numbered.png", + "sidechains_dir": sidechains_dir, + } + + +def save_numbered_molecule( + smiles: str, + output_path: Path, + ring_size: int | None = None, + size: tuple[int, int] = (800, 800), +) -> Path | None: + """Save numbered molecule image.""" + try: + return save_numbered_molecule_png( + smiles, + output_path, + ring_size=ring_size, + size=size, + ) + except Exception as e: + print(f"Failed to save numbered image: {e}") + return None + + +def save_fragment_images( + fragments: list, + output_dir: Path, + source_id: str, + size: tuple[int, int] = (400, 400), +) -> list[str]: + """Save fragment images and return paths.""" + paths = [] + + for i, fragment in enumerate(fragments): + try: + output_path = output_dir / f"{source_id}_frag_{i}_pos{fragment.cleavage_position}.png" + save_fragment_png(fragment.fragment_smiles_plain, output_path, size=size) + paths.append(str(output_path.relative_to(output_dir.parent.parent))) + except Exception as e: + print(f"Failed to save fragment {i}: {e}") + paths.append(None) + + return paths +``` + +**Step 2: Commit** + +```bash +git add src/macro_lactone_toolkit/validation/visualization_output.py +git commit -m "feat(validation): add visualization output module" +``` + +--- + +## Task 6: Create Main Validator Class + +**Files:** +- Create: `src/macro_lactone_toolkit/validation/validator.py` + +**Context:** Core orchestrator that processes molecules, stores results in database, and generates visualizations. + +**Step 1: Write validator class** + +Create `src/macro_lactone_toolkit/validation/validator.py`: + +```python +from __future__ import annotations + +import json +from datetime import datetime +from pathlib import Path + +import pandas as pd +from rdkit import Chem +from rdkit.Chem import Descriptors + +from macro_lactone_toolkit import MacroLactoneAnalyzer, MacrolactoneFragmenter +from macro_lactone_toolkit.validation.database import get_engine, get_session, init_database +from macro_lactone_toolkit.validation.isotope_utils import build_fragment_with_isotope +from macro_lactone_toolkit.validation.models import ( + ClassificationType, + ParentMolecule, + ProcessingStatus, + RingNumbering, + SideChainFragment, +) +from macro_lactone_toolkit.validation.sampling import stratified_sample_by_ring_size +from macro_lactone_toolkit.validation.visualization_output import ( + get_output_paths, + save_fragment_images, + save_numbered_molecule, +) + + +class MacrolactoneValidator: + """Validates macrolactone database with sampling and fragmentation.""" + + def __init__( + self, + output_dir: str | Path, + sample_ratio: float = 0.1, + smiles_col: str = "smiles", + id_col: str = "IDs", + ): + self.output_dir = Path(output_dir) + self.sample_ratio = sample_ratio + self.smiles_col = smiles_col + self.id_col = id_col + + self.analyzer = MacroLactoneAnalyzer() + self.fragmenter = MacrolactoneFragmenter() + + # Initialize database + self.db_path = self.output_dir / "fragments.db" + self.engine = get_engine(self.db_path) + init_database(self.engine) + + def run(self, input_csv: str | Path) -> dict: + """Run validation on input CSV.""" + # Load data + df = pd.read_csv(input_csv) + print(f"Loaded {len(df)} molecules from {input_csv}") + + # Stratified sampling + print(f"Performing stratified sampling (ratio={self.sample_ratio})...") + sampled = stratified_sample_by_ring_size(df, self.sample_ratio, self.smiles_col) + print(f"Sampled {len(sampled)} molecules") + + # Process each molecule + results = {"total": len(sampled), "success": 0, "failed": 0, "skipped": 0} + + for idx, row in sampled.iterrows(): + status = self._process_molecule(row) + results[status] += 1 + if (idx + 1) % 100 == 0: + print(f"Processed {idx + 1}/{len(sampled)} molecules") + + # Generate summary + self._generate_summary() + + return results + + def _process_molecule(self, row: pd.Series) -> str: + """Process a single molecule. Returns status.""" + source_id = str(row[self.id_col]) + smiles = row[self.smiles_col] + name = row.get("molecule_pref_name", None) + + # Classify + classification_result = self.analyzer.classify_macrocycle(smiles) + classification = ClassificationType(classification_result.classification) + ring_size = classification_result.ring_size + + # Create parent record + parent = ParentMolecule( + source_id=source_id, + molecule_name=name, + smiles=smiles, + classification=classification, + ring_size=ring_size, + primary_reason_code=classification_result.primary_reason_code, + primary_reason_message=classification_result.primary_reason_message, + ) + + with get_session(self.engine) as session: + session.add(parent) + session.commit() + session.refresh(parent) + + # Skip non-standard molecules + if classification != ClassificationType.STANDARD: + parent.processing_status = ProcessingStatus.SKIPPED + session.add(parent) + session.commit() + self._save_original_image(smiles, source_id, ring_size, classification) + return "skipped" + + # Process standard macrolactone + try: + self._process_standard_macrolactone(session, parent, smiles) + return "success" + except Exception as e: + parent.processing_status = ProcessingStatus.FAILED + parent.error_message = str(e) + parent.processed_at = datetime.utcnow() + session.add(parent) + session.commit() + return "failed" + + def _process_standard_macrolactone(self, session, parent: ParentMolecule, smiles: str): + """Process a standard macrolactone.""" + # Get numbering + numbering = self.fragmenter.number_molecule(smiles) + + # Save numbering to database + numbering_record = RingNumbering( + parent_id=parent.id, + ring_size=numbering.ring_size, + carbonyl_carbon_idx=numbering.carbonyl_carbon_idx, + ester_oxygen_idx=numbering.ester_oxygen_idx, + position_to_atom=json.dumps(numbering.position_to_atom), + atom_to_position=json.dumps(numbering.atom_to_position), + ) + session.add(numbering_record) + + # Save numbered image + paths = get_output_paths( + self.output_dir, parent.source_id, parent.ring_size, "standard_macrolactone" + ) + image_path = save_numbered_molecule(smiles, paths["numbered_image"], parent.ring_size) + if image_path: + parent.numbered_image_path = str(image_path.relative_to(self.output_dir)) + + # Fragment side chains + mol = Chem.MolFromSmiles(smiles) + ring_atom_set = set(numbering.ring_atoms) + fragments = [] + fragment_idx = 0 + + from macro_lactone_toolkit._core import collect_side_chain_atoms, is_intrinsic_lactone_neighbor + + for position, ring_atom_idx in numbering.position_to_atom.items(): + ring_atom = mol.GetAtomWithIdx(ring_atom_idx) + + for neighbor in ring_atom.GetNeighbors(): + neighbor_idx = neighbor.GetIdx() + + # Skip ring atoms and intrinsic lactone neighbors + if neighbor_idx in ring_atom_set: + continue + if is_intrinsic_lactone_neighbor(mol, numbering, ring_atom_idx, neighbor_idx): + continue + + # Collect side chain atoms + side_chain_atoms = collect_side_chain_atoms(mol, neighbor_idx, ring_atom_set) + if not side_chain_atoms: + continue + + # Build fragment with isotope tagging + labeled_smiles, plain_smiles, bond_type = build_fragment_with_isotope( + mol, side_chain_atoms, neighbor_idx, ring_atom_idx, position + ) + + # Calculate properties + plain_mol = Chem.MolFromSmiles(plain_smiles) + if plain_mol is None: + continue + + atom_count = sum(1 for a in plain_mol.GetAtoms() if a.GetAtomicNum() != 0) + heavy_atom_count = sum(1 for a in plain_mol.GetAtoms() if a.GetAtomicNum() not in [0, 1]) + mw = Descriptors.MolWt(plain_mol) + + # Create fragment record + fragment = SideChainFragment( + parent_id=parent.id, + fragment_id=f"{parent.source_id}_frag_{fragment_idx}", + cleavage_position=position, + attachment_atom_idx=ring_atom_idx, + attachment_atom_symbol=ring_atom.GetSymbol(), + fragment_smiles_labeled=labeled_smiles, + fragment_smiles_plain=plain_smiles, + dummy_isotope=position, + atom_count=atom_count, + heavy_atom_count=heavy_atom_count, + molecular_weight=round(mw, 4), + original_bond_type=bond_type, + ) + session.add(fragment) + fragments.append(fragment) + fragment_idx += 1 + + # Save fragment images + if fragments and paths["sidechains_dir"]: + image_paths = save_fragment_images(fragments, paths["sidechains_dir"], parent.source_id) + for frag, img_path in zip(fragments, image_paths): + frag.image_path = img_path + session.add(frag) + + # Update parent record + parent.processing_status = ProcessingStatus.SUCCESS + parent.num_sidechains = len(fragments) + parent.cleavage_positions = json.dumps([f.cleavage_position for f in fragments]) + parent.processed_at = datetime.utcnow() + session.add(parent) + session.commit() + + def _save_original_image(self, smiles: str, source_id: str, ring_size: int, classification: ClassificationType): + """Save original image for non-standard molecules.""" + paths = get_output_paths(self.output_dir, source_id, ring_size, classification.value) + try: + from rdkit.Chem import Draw + mol = Chem.MolFromSmiles(smiles) + if mol: + Draw.MolToFile(mol, str(paths["numbered_image"]), size=(400, 400)) + except Exception: + pass + + def _generate_summary(self): + """Generate summary CSV and statistics.""" + with get_session(self.engine) as session: + # Query all parents + from sqlmodel import select + statement = select(ParentMolecule) + parents = session.exec(statement).all() + + # Convert to DataFrame + data = [] + for p in parents: + data.append({ + "id": p.id, + "source_id": p.source_id, + "molecule_name": p.molecule_name, + "smiles": p.smiles, + "classification": p.classification.value, + "ring_size": p.ring_size, + "primary_reason_code": p.primary_reason_code, + "primary_reason_message": p.primary_reason_message, + "processing_status": p.processing_status.value, + "error_message": p.error_message, + "num_sidechains": p.num_sidechains, + "cleavage_positions": p.cleavage_positions, + "numbered_image_path": p.numbered_image_path, + "processed_at": p.processed_at, + }) + + df = pd.DataFrame(data) + df.to_csv(self.output_dir / "summary.csv", index=False) + + # Generate statistics + stats = { + "total_molecules": len(parents), + "by_classification": df["classification"].value_counts().to_dict(), + "by_ring_size": df[df["ring_size"].notna()]["ring_size"].value_counts().to_dict(), + "by_status": df["processing_status"].value_counts().to_dict(), + } + + with open(self.output_dir / "summary_statistics.json", "w") as f: + json.dump(stats, f, indent=2, default=str) + + print(f"\nSummary saved to {self.output_dir / 'summary.csv'}") + print(f"Statistics: {stats}") +``` + +**Step 2: Commit** + +```bash +git add src/macro_lactone_toolkit/validation/validator.py +git commit -m "feat(validation): add main validator class" +``` + +--- + +## Task 7: Create CLI Script + +**Files:** +- Create: `scripts/validate_macrolactone_db.py` + +**Context:** Entry point script that uses pixi environment. + +**Step 1: Write CLI script** + +Create `scripts/validate_macrolactone_db.py`: + +```python +#!/usr/bin/env python3 +""" +Validate MacrolactoneDB 12-20 membered rings. + +Usage: + pixi run python scripts/validate_macrolactone_db.py \ + --input data/MacrolactoneDB/ring12_20/temp.csv \ + --output validation_output \ + --sample-ratio 0.1 +""" + +import argparse +import sys +from pathlib import Path + +# Add src to path +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + +from macro_lactone_toolkit.validation.validator import MacrolactoneValidator + + +def main(): + parser = argparse.ArgumentParser( + description="Validate MacrolactoneDB 12-20 membered rings" + ) + parser.add_argument( + "--input", + type=str, + default="data/MacrolactoneDB/ring12_20/temp.csv", + help="Input CSV file path", + ) + parser.add_argument( + "--output", + type=str, + default="validation_output", + help="Output directory", + ) + parser.add_argument( + "--sample-ratio", + type=float, + default=0.1, + help="Sampling ratio (0.0-1.0)", + ) + parser.add_argument( + "--smiles-col", + type=str, + default="smiles", + help="SMILES column name", + ) + parser.add_argument( + "--id-col", + type=str, + default="IDs", + help="ID column name", + ) + + args = parser.parse_args() + + print("=" * 60) + print("MacrolactoneDB Validation") + print("=" * 60) + print(f"Input: {args.input}") + print(f"Output: {args.output}") + print(f"Sample ratio: {args.sample_ratio}") + print("=" * 60) + + validator = MacrolactoneValidator( + output_dir=args.output, + sample_ratio=args.sample_ratio, + smiles_col=args.smiles_col, + id_col=args.id_col, + ) + + results = validator.run(args.input) + + print("\n" + "=" * 60) + print("Validation Complete") + print("=" * 60) + print(f"Total processed: {results['total']}") + print(f"Success: {results['success']}") + print(f"Failed: {results['failed']}") + print(f"Skipped: {results['skipped']}") + print("=" * 60) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) +``` + +Make executable: +```bash +chmod +x scripts/validate_macrolactone_db.py +``` + +**Step 2: Test help message** + +Run: +```bash +pixi run python scripts/validate_macrolactone_db.py --help +``` + +Expected: Shows help message with all arguments. + +**Step 3: Commit** + +```bash +git add scripts/validate_macrolactone_db.py +git commit -m "feat(validation): add CLI entry point script" +``` + +--- + +## Task 8: Create Output Directory README + +**Files:** +- Create: Template for `validation_output/README.md` (generated by validator) + +**Context:** README explaining the output directory structure. + +**Step 1: Add README generation to validator** + +Add method to `validator.py` before `_generate_summary`: + +```python + def _generate_readme(self): + """Generate README explaining output structure.""" + readme_content = """# MacrolactoneDB Validation Output + +This directory contains validation results for MacrolactoneDB 12-20 membered rings. + +## Directory Structure + +``` +validation_output/ +├── README.md # This file +├── fragments.db # SQLite database with all data +├── summary.csv # Summary of all processed molecules +├── summary_statistics.json # Statistical summary +│ +├── ring_size_12/ # 12-membered rings +├── ring_size_13/ # 13-membered rings +... +└── ring_size_20/ # 20-membered rings + ├── molecules.csv # Molecules in this ring size + ├── standard/ # Standard macrolactones + │ ├── numbered/ # Numbered ring images + │ │ └── {id}_numbered.png + │ └── sidechains/ # Fragment images + │ └── {id}/ + │ └── {id}_frag_{n}_pos{pos}.png + ├── non_standard/ # Non-standard macrocycles + │ └── original/ + │ └── {id}_original.png + └── rejected/ # Not macrolactones + └── original/ + └── {id}_original.png +``` + +## Database Schema + +### Tables + +- **parent_molecules**: Original molecule information +- **ring_numberings**: Ring atom numbering details +- **side_chain_fragments**: Fragmentation results with isotope tags +- **validation_results**: Manual validation records + +### Key Fields + +- `classification`: standard_macrolactone | non_standard_macrocycle | not_macrolactone +- `dummy_isotope`: Cleavage position stored as isotope value for reconstruction +- `cleavage_position`: Position on ring where side chain was attached + +## Ring Numbering Convention + +1. Position 1 = Lactone carbonyl carbon (C=O) +2. Position 2 = Ester oxygen (-O-) +3. Positions 3-N = Sequential around ring + +## Isotope Tagging + +Fragments use isotope values to mark cleavage position: +- `[5*]CCO` = Fragment from position 5, dummy atom has isotope=5 +- This enables precise reconstruction during reassembly + +## CSV Columns + +### summary.csv + +- `source_id`: Original molecule ID from MacrolactoneDB +- `classification`: Classification result +- `ring_size`: Detected ring size (12-20) +- `num_sidechains`: Number of side chains detected +- `cleavage_positions`: JSON array of cleavage positions +- `processing_status`: pending | success | failed | skipped + +## Querying the Database + +```bash +# List tables +sqlite3 fragments.db ".tables" + +# Get standard macrolactones with fragments +sqlite3 fragments.db "SELECT * FROM parent_molecules WHERE classification='standard_macrolactone' LIMIT 5;" + +# Get fragments for a specific molecule +sqlite3 fragments.db "SELECT * FROM side_chain_fragments WHERE parent_id=1;" + +# Count by ring size +sqlite3 fragments.db "SELECT ring_size, COUNT(*) FROM parent_molecules GROUP BY ring_size;" +``` +""" + readme_path = self.output_dir / "README.md" + readme_path.write_text(readme_content) +``` + +Add call in `run` method before `return results`: +```python +self._generate_readme() +self._generate_summary() +``` + +**Step 2: Commit** + +```bash +git add src/macro_lactone_toolkit/validation/validator.py +git commit -m "feat(validation): add README generation for output directory" +``` + +--- + +## Task 9: Update Package __init__.py + +**Files:** +- Modify: `src/macro_lactone_toolkit/__init__.py` + +**Context:** Export validation module. + +**Step 1: Add validation exports** + +Modify `src/macro_lactone_toolkit/__init__.py` to add: + +```python +# Validation module (optional import) +try: + from .validation.validator import MacrolactoneValidator + from .validation.models import ParentMolecule, SideChainFragment +except ImportError: + pass # SQLModel not installed +``` + +**Step 2: Commit** + +```bash +git add src/macro_lactone_toolkit/__init__.py +git commit -m "feat(validation): export validation module" +``` + +--- + +## Task 10: Run Integration Test + +**Files:** +- Test with: Small sample of actual data + +**Context:** Run the validator on a small subset to verify everything works. + +**Step 1: Create test with small sample** + +Run: +```bash +# Create small test sample +head -20 data/MacrolactoneDB/ring12_20/temp.csv > /tmp/test_sample.csv + +# Run validation +pixi run python scripts/validate_macrolactone_db.py \ + --input /tmp/test_sample.csv \ + --output /tmp/test_validation_output \ + --sample-ratio 1.0 +``` + +Expected output shows processing and summary. + +**Step 2: Verify outputs** + +Run: +```bash +ls -la /tmp/test_validation_output/ +cat /tmp/test_validation_output/summary_statistics.json +sqlite3 /tmp/test_validation_output/fragments.db "SELECT COUNT(*) FROM parent_molecules;" +``` + +Expected: Directory exists, has database, summary CSV, and ring size subdirectories. + +**Step 3: Cleanup** + +```bash +rm -rf /tmp/test_validation_output /tmp/test_sample.csv +``` + +--- + +## Task 11: Final Commit and Summary + +**Step 1: Final review and commit** + +```bash +git status +git log --oneline -10 +``` + +**Step 2: Push to branch (if using worktree)** + +```bash +git push origin HEAD +``` + +--- + +## Execution Commands Reference + +### Full validation run + +```bash +cd /Users/lingyuzeng/project/macro-lactone-sidechain-profiler/macro_split +pixi run python scripts/validate_macrolactone_db.py \ + --input data/MacrolactoneDB/ring12_20/temp.csv \ + --output validation_output \ + --sample-ratio 0.1 +``` + +### Query results + +```bash +# Summary statistics +cat validation_output/summary_statistics.json + +# Database queries +sqlite3 validation_output/fragments.db "SELECT * FROM parent_molecules LIMIT 5;" +sqlite3 validation_output/fragments.db "SELECT * FROM side_chain_fragments WHERE cleavage_position > 2 LIMIT 5;" +``` + +### Check specific ring size + +```bash +ls validation_output/ring_size_16/standard/numbered/ +ls validation_output/ring_size_16/standard/sidechains/ +``` + +--- + +## Verification Checklist + +- [ ] All SQLModel models created and importable +- [ ] Database initializes without errors +- [ ] Isotope tagging preserves cleavage position +- [ ] Stratified sampling produces even distribution +- [ ] Visualization outputs created in correct structure +- [ ] Summary CSV contains all expected columns +- [ ] README generated with accurate documentation +- [ ] CLI script runs with --help +- [ ] Integration test passes on small sample diff --git a/src/macro_lactone_toolkit/validation/database.py b/src/macro_lactone_toolkit/validation/database.py new file mode 100644 index 0000000..5855ecb --- /dev/null +++ b/src/macro_lactone_toolkit/validation/database.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +from contextlib import contextmanager +from pathlib import Path + +from sqlmodel import Session, SQLModel, create_engine + + +def get_engine(db_path: str | Path): + """Create SQLite engine.""" + db_path = Path(db_path) + db_path.parent.mkdir(parents=True, exist_ok=True) + url = f"sqlite:///{db_path}" + return create_engine(url, echo=False) + + +@contextmanager +def get_session(engine): + """Context manager for database sessions.""" + with Session(engine) as session: + yield session + + +def init_database(engine): + """Create all tables.""" + SQLModel.metadata.create_all(engine) diff --git a/src/macro_lactone_toolkit/validation/models.py b/src/macro_lactone_toolkit/validation/models.py index f43e352..796e1c0 100644 --- a/src/macro_lactone_toolkit/validation/models.py +++ b/src/macro_lactone_toolkit/validation/models.py @@ -1,25 +1,26 @@ from __future__ import annotations from datetime import datetime -from enum import Enum from typing import List, Optional -from sqlmodel import Field, Relationship, SQLModel +from sqlalchemy.orm import Mapped, mapped_column, relationship +from sqlmodel import Field, SQLModel -class ClassificationType(str, Enum): +class ClassificationType: STANDARD = "standard_macrolactone" NON_STANDARD = "non_standard_macrocycle" NOT_MACROLACTONE = "not_macrolactone" -class ProcessingStatus(str, Enum): +class ProcessingStatus: PENDING = "pending" SUCCESS = "success" FAILED = "failed" SKIPPED = "skipped" +# Define all tables without relationships first class ParentMolecule(SQLModel, table=True): """Original molecule information.""" @@ -29,11 +30,11 @@ class ParentMolecule(SQLModel, table=True): source_id: str = Field(index=True) molecule_name: Optional[str] = None smiles: str = Field(index=True) - classification: ClassificationType = Field(index=True) + classification: str = Field(index=True) ring_size: Optional[int] = Field(default=None, index=True) primary_reason_code: Optional[str] = None primary_reason_message: Optional[str] = None - processing_status: ProcessingStatus = Field(default=ProcessingStatus.PENDING) + processing_status: str = Field(default=ProcessingStatus.PENDING) error_message: Optional[str] = None num_sidechains: Optional[int] = None cleavage_positions: Optional[str] = None @@ -41,9 +42,6 @@ class ParentMolecule(SQLModel, table=True): created_at: datetime = Field(default_factory=datetime.utcnow) processed_at: Optional[datetime] = None - fragments: List["SideChainFragment"] = Relationship(back_populates="parent") - numbering: Optional["RingNumbering"] = Relationship(back_populates="parent") - class RingNumbering(SQLModel, table=True): """Ring numbering details.""" @@ -58,8 +56,6 @@ class RingNumbering(SQLModel, table=True): position_to_atom: str atom_to_position: str - parent: Optional[ParentMolecule] = Relationship(back_populates="numbering") - class SideChainFragment(SQLModel, table=True): """Side chain fragments from cleavage.""" @@ -81,8 +77,6 @@ class SideChainFragment(SQLModel, table=True): original_bond_type: str image_path: Optional[str] = None - parent: Optional[ParentMolecule] = Relationship(back_populates="fragments") - class ValidationResult(SQLModel, table=True): """Manual validation records."""