From d18133ce160359adeb9a4dfe5fba86f01f9cd420 Mon Sep 17 00:00:00 2001
From: lingyuzeng <pylyzeng@gmail.com>
Date: Thu, 19 Mar 2026 10:27:20 +0800
Subject: [PATCH] feat(validation): add SQLModel database models and fix
 relationships

---
 ...standard-macrocycle-classification-plan.md |  139 ++
 PLAN-.md                                      |  157 ++
 ...26-03-19-macrolactone-validation-design.md |  475 ++++++
 ...olactone-validation-implementation-plan.md | 1287 +++++++++++++++++
 .../validation/database.py                    |   26 +
 .../validation/models.py                      |   20 +-
 6 files changed, 2091 insertions(+), 13 deletions(-)
 create mode 100644 2026-03-18-standard-macrocycle-classification-plan.md
 create mode 100644 PLAN-.md
 create mode 100644 docs/plans/2026-03-19-macrolactone-validation-design.md
 create mode 100644 docs/plans/2026-03-19-macrolactone-validation-implementation-plan.md
 create mode 100644 src/macro_lactone_toolkit/validation/database.py

diff --git a/2026-03-18-standard-macrocycle-classification-plan.md b/2026-03-18-standard-macrocycle-classification-plan.md
new file mode 100644
index 0000000..4b1bcdd
--- /dev/null
+++ b/2026-03-18-standard-macrocycle-classification-plan.md
@@ -0,0 +1,139 @@
+# Standard vs Non-Standard Macrocycle Classification
+
+## Summary
+
+Add a formal molecule-level classification layer on top of the current `macro_lactone_toolkit` detection logic so the toolkit can distinguish:
+
+- `standard_macrolactone`
+- `non_standard_macrocycle`
+- `not_macrolactone`
+
+This classification must support the two new rejection rules:
+
+1. After ring numbering is assigned, positions `3..N` must all be carbon atoms. If any atom at positions `3..N` is not carbon, classify the molecule as `non_standard_macrocycle`.
+2. If multiple candidate macrolactone rings overlap in the same atom set graph, classify the molecule as `non_standard_macrocycle`. Use only overlapping candidate rings for this rule; disconnected or non-overlapping candidates do not trigger this specific rejection.
+
+Do not rely on a “largest ring” assumption. Base detection on RDKit ring candidates from `RingInfo.AtomRings()` plus explicit lactone validation, then apply the new standard/non-standard filters.
+
+## Public API And Output Changes
+
+Add a new result type, e.g. `MacrocycleClassificationResult`, with these fields:
+
+- `smiles: str`
+- `classification: Literal["standard_macrolactone", "non_standard_macrocycle", "not_macrolactone"]`
+- `ring_size: int | None`
+- `primary_reason_code: str | None`
+- `primary_reason_message: str | None`
+- `all_reason_codes: list[str]`
+- `all_reason_messages: list[str]`
+- `candidate_ring_sizes: list[int]`
+
+Add a new public API on `MacroLactoneAnalyzer`:
+
+- `classify_macrocycle(mol_input: str | Chem.Mol, ring_size: int | None = None) -> MacrocycleClassificationResult`
+
+Behavior:
+
+- If `ring_size` is omitted, inspect all 12-20 membered lactone candidates.
+- If `ring_size` is provided, restrict candidate selection to that size before classification.
+- Invalid SMILES should keep raising the existing detection exception path; do not encode invalid input as a classification result.
+- For `standard_macrolactone`, `ring_size` must be the accepted ring size and all reason fields must be empty.
+- For `non_standard_macrocycle`, `ring_size` should be the candidate ring size if exactly one size remains relevant, otherwise `None`.
+- For `not_macrolactone`, return no ring size and a reason describing why no valid 12-20 lactone candidate survived.
+
+Reason codes must be decision-complete and fixed:
+
+- `contains_non_carbon_ring_atoms_outside_positions_1_2`
+- `multiple_overlapping_macrocycle_candidates`
+- `no_lactone_ring_in_12_to_20_range`
+- `requested_ring_size_not_found`
+
+Reason messages must be short English sentences:
+
+- `Ring positions 3..N contain non-carbon atoms.`
+- `Overlapping macrolactone candidate rings were detected.`
+- `No 12-20 membered lactone ring was detected.`
+- `The requested ring size was not detected as a lactone ring.`
+
+Update CLI `macro-lactone-toolkit analyze` to return this classification result shape for single-SMILES mode and row-wise CSV mode.
+
+Do not add a new CLI subcommand. Keep `analyze` as the classification surface.
+
+## Implementation Changes
+
+### Detection And Candidate Grouping
+
+In the current core detection module:
+
+- Keep the existing lactone-ring candidate search based on `RingInfo.AtomRings()` and lactone atom validation.
+- Add an overlap-group pass over candidate rings:
+  - Build a graph where two candidates are connected if their ring atom sets intersect.
+  - Compute connected components on this graph.
+  - If any connected component contains more than one candidate, classify as `non_standard_macrocycle` with `multiple_overlapping_macrocycle_candidates`.
+- Do not treat disconnected candidate rings as overlapping.
+- Keep `candidate_ring_sizes` as the sorted unique sizes from the filtered candidate list.
+
+### Standard Macrocycle Filter
+
+For any single candidate that survives overlap rejection:
+
+- Build numbering exactly as today: position 1 is the lactone carbonyl carbon, position 2 is the ring ester oxygen.
+- Inspect positions `3..N`.
+- Every atom at positions `3..N` must have atomic number 6.
+- If any position `3..N` is not carbon, classify as `non_standard_macrocycle` with `contains_non_carbon_ring_atoms_outside_positions_1_2`.
+
+This rule must reject ring peptides and other heteroatom-containing macrocycles even if they contain a lactone bond.
+
+### Fragmenter Integration
+
+Update `MacrolactoneFragmenter` so that:
+
+- `number_molecule()` and `fragment_molecule()` first call `classify_macrocycle()`.
+- They only proceed when classification is `standard_macrolactone`.
+- For `non_standard_macrocycle` or `not_macrolactone`, raise the existing detection exception type with a message that includes the classification and the primary reason code.
+- Do not change fragmentation output semantics for standard macrolactones.
+
+### Files To Change
+
+Concentrate changes in:
+
+- `src/macro_lactone_toolkit/_core.py`
+- `src/macro_lactone_toolkit/analyzer.py`
+- `src/macro_lactone_toolkit/cli.py`
+
+Add the new result type in the existing models module instead of inventing a second schema location.
+
+## Test Plan
+
+Add tests first, verify they fail, then implement.
+
+Required test cases:
+
+- Standard 12, 14, 16, and 20 membered macrolactones still classify as `standard_macrolactone` and return the correct `ring_size`.
+- A macrocycle with a valid lactone bond but a non-carbon atom at position `3..N` classifies as `non_standard_macrocycle` with:
+  - `primary_reason_code == "contains_non_carbon_ring_atoms_outside_positions_1_2"`
+  - the expected English message
+- An overlapping-candidate example classifies as `non_standard_macrocycle` with:
+  - `primary_reason_code == "multiple_overlapping_macrocycle_candidates"`
+  - the expected English message
+- A non-lactone macrocycle classifies as `not_macrolactone` with `no_lactone_ring_in_12_to_20_range`.
+- Explicit `ring_size` with no candidate of that size returns `not_macrolactone` with `requested_ring_size_not_found`.
+- `macro-lactone-toolkit analyze --smiles ...` returns the new fields for:
+  - one standard example
+  - one heteroatom-rejected example
+  - one overlap-rejected example
+- Existing numbering, fragmentation, labeled/plain dummy round-trip, and splicing tests remain green for standard macrolactones.
+
+Test fixture guidance:
+
+- Reuse the existing synthetic macrocycle helper for standard rings.
+- Extend the helper or add a new fixture helper for:
+  - a lactone-containing ring with one non-carbon atom at a numbered position beyond 2
+  - an overlapping-candidate ring example specifically built to share ring atoms between candidate rings
+
+## Assumptions And Defaults
+
+- Classification is molecule-level, but the overlap rejection only applies to overlapping candidate rings, not disconnected candidates elsewhere in the molecule.
+- Invalid SMILES remain exceptions, not classification payloads.
+- `analyze` becomes the official classification output; `get_valid_ring_sizes()` may remain as a lower-level helper.
+- The implementation should stay aligned with RDKit ring APIs as candidate generators, not as the final definition of a standard macrolactone.
diff --git a/PLAN-.md b/PLAN-.md
new file mode 100644
index 0000000..e00faa5
--- /dev/null
+++ b/PLAN-.md
@@ -0,0 +1,157 @@
+# `macro_split` 重构为可安装包 `macro_lactone_toolkit` 的实施计划
+
+## Summary
+
+采用“渐进式重构，不推倒重写”的方案：保留仓库里已经可用的 12-20 元环识别、通用编号、侧链 BFS 提取和 dummy 原子思路，删除或降级 16 元环专用路径，把核心能力统一收敛到一个正式可安装、可测试、可 CLI 调用的包。
+
+本次重构完成后，项目要满足这几个结果：
+- 可以作为正式 Python 包安装和导入，主包名为 `macro_lactone_toolkit`
+- 默认自动识别 12-20 元有效大环内酯；也允许显式传 `ring_size`
+- 支持环编号和侧链裂解
+- 裂解结果同时提供“带位置编号 dummy”与“普通 `*` dummy”两套 SMILES，且都能被 RDKit 读取保存
+- 用 `pixi` 管理环境和测试
+- 先执行 `git pull --ff-only` 同步远端，再开始任何代码改动
+
+## Key Changes
+
+### 1. 仓库与包结构正规化
+
+- 第一动作不是改代码，而是在仓库根目录执行：
+  - `git status --short`
+  - `git pull --ff-only`
+- 将当前直接放在顶层 `src/` 下的运行时代码，改成标准包布局：`src/macro_lactone_toolkit/`
+- 不保留 `src.*` 兼容层；README、测试、脚本、入口点全部切到新导入路径
+- `pyproject.toml` 改为标准 setuptools `src` layout 配置，修正：
+  - 包发现
+  - 项目名与版本元数据
+  - console scripts
+  - pytest 配置
+- `pixi.toml` 统一为 Python 3.12，并支持至少：
+  - `osx-arm64`
+  - `linux-64`
+- `pixi.toml` 补齐测试依赖和常用任务：
+  - `test`
+  - `lint`（只做检查，不自动改写文件）
+  - `smoke-import`
+- README 中所有 `from src...` 示例全部替换为 `from macro_lactone_toolkit...`
+
+### 2. 统一核心领域模型与 API
+
+以 `ring_visualization.py` 的通用逻辑和参考脚本的编号思路为基础，重构出清晰边界的核心模块，避免“16 元环专用”和“12-20 元环通用”并存。
+
+公开 API 统一为：
+
+- `macro_lactone_toolkit.MacroLactoneAnalyzer`
+- `macro_lactone_toolkit.MacrolactoneFragmenter`
+- `macro_lactone_toolkit.RingNumberingResult`
+- `macro_lactone_toolkit.SideChainFragment`
+- `macro_lactone_toolkit.FragmentationResult`
+
+行为约定固定如下：
+
+- `MacroLactoneAnalyzer`
+  - 负责识别 12-20 元环
+  - 验证环上是否存在有效内酯酯键
+  - 返回所有命中的有效环尺寸
+- `MacrolactoneFragmenter`
+  - 默认自动识别有效环尺寸
+  - 若检测到 0 个有效环，抛出明确异常
+  - 若检测到多个有效环尺寸，默认抛出“歧义异常”，要求调用者显式传 `ring_size`
+  - 若调用者传了 `ring_size`，则只处理该环
+- `RingNumberingResult`
+  - 至少包含：`ring_size`、`ring_atoms`、`ordered_atoms`、`carbonyl_carbon_idx`、`ester_oxygen_idx`、`atom_to_position`、`position_to_atom`
+- `SideChainFragment`
+  - 至少包含：`parent_id`、`cleavage_position`、`attachment_atom_idx`、`fragment_smiles_labeled`、`fragment_smiles_plain`、`atom_count`、`molecular_weight`
+- `FragmentationResult`
+  - 至少包含：母分子信息、选定环尺寸、编号结果、所有碎片、错误/警告信息
+
+异常策略固定为：
+
+- `MacrolactoneError`
+- `MacrolactoneDetectionError`
+- `AmbiguousMacrolactoneError`
+- `RingNumberingError`
+- `FragmentationError`
+
+### 3. 编号与裂解算法收敛
+
+编号逻辑以“通用 12-20 元环”作为唯一正式实现，不再让旧的 16 元环 `ring_numbering.py` 充当主入口。
+
+- 编号规则固定：
+  - 位置 1 = 环上的内酯羰基碳
+  - 位置 2 = 环上的酯键氧
+  - 位置 3-N = 沿确定方向依次编号剩余环原子
+- 编号实现使用统一入口，不能再出现批处理脚本内部偷偷调用 `[r16]` 专用函数的情况
+- 自动识别时，先找所有 12-20 元环，再验证该环是否为有效大环内酯
+- 侧链识别统一用“环原子邻居中不在环内的原子即侧链起点”
+- 侧链提取统一用 BFS，只沿非环原子扩展
+- dummy 原子输出固定两套：
+  - `fragment_smiles_labeled`：带位置编号 dummy，例如 `[5*]`
+  - `fragment_smiles_plain`：普通 `*`
+- dummy 与连接原子的键型必须保留原始键型，不能强制都变成单键
+- 所有裂解结果必须通过 RDKit round-trip：
+  - `Chem.MolFromSmiles(...)` 成功
+  - `Chem.MolToSmiles(...)` 可再次序列化
+
+### 4. CLI、批处理与现有脚本归并
+
+交付范围包含正式 CLI，不再依赖散乱脚本作为主要入口。
+
+CLI 设计固定为 3 个命令：
+
+- `macro-lactone-toolkit analyze`
+  - 输入单个 SMILES 或 CSV
+  - 输出有效环尺寸、是否歧义、选中的环尺寸
+- `macro-lactone-toolkit number`
+  - 输入单个 SMILES
+  - 输出编号结果 JSON
+- `macro-lactone-toolkit fragment`
+  - 输入单个 SMILES 或 CSV
+  - 输出碎片 JSON/CSV，包含 labeled/plain 两套 SMILES
+
+批处理行为固定为：
+
+- CSV 默认读取 `smiles` 列
+- 可选 `id` 列；未提供则自动生成
+- 自动识别歧义分子时：
+  - 默认跳过并记录错误
+  - 显式传 `--ring-size` 时按指定环处理
+- 旧 `scripts/` 中保留价值的逻辑迁入正式 CLI；剩余脚本改成薄封装或标记为 legacy，不再承担核心实现
+
+## Test Plan
+
+必须用 `pixi` 跑完整验证，至少覆盖下面这些场景：
+
+- 包安装与导入
+  - `pixi run python -c "import macro_lactone_toolkit"`
+  - `pixi run pytest`
+- 环识别
+  - 单一有效环尺寸的 12、14、16、20 元环样例
+  - 无效分子
+  - 无内酯键分子
+  - 多有效环尺寸分子，确认抛出 `AmbiguousMacrolactoneError`
+- 编号
+  - 位置 1 必须是羰基碳
+  - 位置 2 必须是酯键氧
+  - 编号范围必须连续覆盖 `1..N`
+  - 同一分子多次运行结果一致
+- 裂解
+  - 无侧链位置返回空列表
+  - 有侧链位置返回碎片
+  - `fragment_smiles_labeled` 与 `fragment_smiles_plain` 都能被 RDKit 成功读取
+  - labeled dummy 保留位置号
+  - dummy 键型与原键型一致
+- CLI
+  - `analyze`、`number`、`fragment` 的基本 smoke test
+  - CSV 批处理
+  - 歧义分子跳过与错误日志行为
+- 回归
+  - 现有 splicing engine 若继续保留，至少保留一个 dummy 拼接回组装测试，确保 labeled dummy 方案可持续扩展
+
+## Assumptions
+
+- `git pull` 使用 `--ff-only`，避免在重构开始前引入不透明 merge commit
+- 本次不保留 `src.*` 导入兼容层，全面切换到 `macro_lactone_toolkit`
+- 本次把“正式库 API + 正式 CLI + pixi 测试”作为主交付；notebooks 只做参考，不作为正式接口
+- 发现多个有效大环尺寸时，默认视为歧义并失败，不自动猜测
+- 继续使用 RDKit 作为唯一结构解析与序列化基础
diff --git a/docs/plans/2026-03-19-macrolactone-validation-design.md b/docs/plans/2026-03-19-macrolactone-validation-design.md
new file mode 100644
index 0000000..786004c
--- /dev/null
+++ b/docs/plans/2026-03-19-macrolactone-validation-design.md
@@ -0,0 +1,475 @@
+# MacrolactoneDB 12-20元环验证方案设计
+
+## 1. 验证目标与范围
+
+### 1.1 目标
+- 验证 `macro_lactone_toolkit` 对 MacrolactoneDB 12-20元环大环内酯的识别准确性
+- 验证环编号正确性（位置1=羰基碳，位置2=酯键氧）
+- 验证侧链断裂功能（仅针对标准大环内酯）
+- 使用同位素标记技术标记裂解位置，便于后续拼接
+
+### 1.2 范围
+- **数据来源**: `/data/MacrolactoneDB/ring12_20/temp.csv` (11,037分子)
+- **抽样策略**: 分层随机抽样10% (~1,100分子)，按环大小12-20均匀分布
+- **处理对象**: 仅处理 `classification="standard_macrolactone"` 的分子
+- **输出**: 可视化图片 + SQLite数据库 + 汇总统计
+
+## 2. 数据库设计 (SQLModel)
+
+### 2.1 模型定义
+
+```python
+from typing import Optional, List
+from sqlmodel import SQLModel, Field, Relationship
+from datetime import datetime
+from enum import Enum
+
+class ClassificationType(str, Enum):
+    STANDARD = "standard_macrolactone"
+    NON_STANDARD = "non_standard_macrocycle"
+    NOT_MACROLACTONE = "not_macrolactone"
+
+class ProcessingStatus(str, Enum):
+    PENDING = "pending"
+    SUCCESS = "success"
+    FAILED = "failed"
+    SKIPPED = "skipped"
+
+# ==================== 主分子表 ====================
+class ParentMolecule(SQLModel, table=True):
+    """原始分子信息表"""
+    __tablename__ = "parent_molecules"
+
+    id: Optional[int] = Field(default=None, primary_key=True)
+
+    # 原始数据
+    source_id: str = Field(index=True)  # 来自CSV的IDs字段
+    molecule_name: Optional[str] = None  # molecule_pref_name
+    smiles: str = Field(index=True)
+
+    # 分类结果
+    classification: ClassificationType = Field(index=True)
+    ring_size: Optional[int] = Field(default=None, index=True)
+    primary_reason_code: Optional[str] = None
+    primary_reason_message: Optional[str] = None
+
+    # 处理状态
+    processing_status: ProcessingStatus = Field(default=ProcessingStatus.PENDING)
+    error_message: Optional[str] = None
+
+    # 元数据
+    created_at: datetime = Field(default_factory=datetime.utcnow)
+    processed_at: Optional[datetime] = None
+
+    # 关系
+    fragments: List["SideChainFragment"] = Relationship(back_populates="parent")
+    numbering: Optional["RingNumbering"] = Relationship(back_populates="parent")
+
+# ==================== 环编号表 ====================
+class RingNumbering(SQLModel, table=True):
+    """环编号详细信息"""
+    __tablename__ = "ring_numberings"
+
+    id: Optional[int] = Field(default=None, primary_key=True)
+    parent_id: int = Field(foreign_key="parent_molecules.id", unique=True)
+
+    # 环基本信息
+    ring_size: int
+    carbonyl_carbon_idx: int  # 位置1
+    ester_oxygen_idx: int     # 位置2
+
+    # 原子映射 (JSON存储)
+    position_to_atom: str  # JSON: {"1": 5, "2": 10, ...}
+    atom_to_position: str  # JSON: {"5": 1, "10": 2, ...}
+
+    # 关系
+    parent: Optional[ParentMolecule] = Relationship(back_populates="numbering")
+
+# ==================== 侧链片段表 ====================
+class SideChainFragment(SQLModel, table=True):
+    """裂解产生的侧链片段"""
+    __tablename__ = "side_chain_fragments"
+
+    id: Optional[int] = Field(default=None, primary_key=True)
+    parent_id: int = Field(foreign_key="parent_molecules.id", index=True)
+
+    # 片段标识
+    fragment_id: str = Field(index=True)  # {source_id}_frag_{n}
+
+    # 裂解位置信息
+    cleavage_position: int = Field(index=True)  # 环上的断裂位置
+    attachment_atom_idx: int  # 母环上的连接原子索引
+    attachment_atom_symbol: str  # C, O, N等
+
+    # SMILES (关键：同位素标记用于标识裂解位置)
+    fragment_smiles_labeled: str   # 带同位素标记，如 [5*]CCO
+    fragment_smiles_plain: str     # 无标记，如 *CCO
+
+    # 同位素标记值 (用于后续拼接)
+    dummy_isotope: int  # 裂解位置的编号，用于重建连接关系
+
+    # 物理化学性质
+    atom_count: int
+    heavy_atom_count: int
+    molecular_weight: float
+
+    # 连接信息 (用于后续拼接)
+    original_bond_type: str  # SINGLE, DOUBLE, AROMATIC等
+
+    # 关系
+    parent: Optional[ParentMolecule] = Relationship(back_populates="fragments")
+
+# ==================== 验证结果表 ====================
+class ValidationResult(SQLModel, table=True):
+    """人工验证结果记录"""
+    __tablename__ = "validation_results"
+
+    id: Optional[int] = Field(default=None, primary_key=True)
+    parent_id: int = Field(foreign_key="parent_molecules.id")
+
+    # 验证字段
+    numbering_correct: Optional[bool] = None  # 编号是否正确
+    cleavage_correct: Optional[bool] = None   # 裂解位置是否正确
+    classification_correct: Optional[bool] = None  # 分类是否正确
+
+    # 备注
+    notes: Optional[str] = None
+    validated_by: Optional[str] = None
+    validated_at: Optional[datetime] = None
+```
+
+### 2.2 数据库初始化
+
+```python
+from sqlmodel import create_engine, Session, SQLModel
+from contextlib import contextmanager
+
+# SQLite文件数据库
+DATABASE_URL = "sqlite:///./validation_output/fragments.db"
+
+engine = create_engine(DATABASE_URL, echo=False)
+
+@contextmanager
+def get_session():
+    with Session(engine) as session:
+        yield session
+
+def init_database():
+    """创建所有表"""
+    SQLModel.metadata.create_all(engine)
+```
+
+## 3. 同位素标记方案 (借鉴Molassembler)
+
+### 3.1 标记策略
+
+```python
+def build_fragment_smiles_with_isotope(
+    mol: Chem.Mol,
+    side_chain_atoms: list[int],
+    side_chain_start_idx: int,
+    ring_atom_idx: int,
+    cleavage_position: int,  # 使用位置编号作为同位素值
+) -> str:
+    """
+    构建带同位素标记的片段SMILES
+
+    关键：用cleavage_position作为dummy原子的同位素值
+    这样后续拼接时能精确知道片段来自哪个位置
+    """
+    # 创建可编辑分子
+    emol = Chem.EditableMol(Chem.Mol(mol))
+
+    # 添加dummy原子替代连接点
+    dummy_atom = Chem.Atom(0)  # 原子序数0 = dummy
+    dummy_atom.SetIsotope(cleavage_position)  # 【关键】用位置编号标记
+    dummy_idx = emol.AddAtom(dummy_atom)
+
+    # 获取原始键类型
+    bond = mol.GetBondBetweenAtoms(ring_atom_idx, side_chain_start_idx)
+    bond_type = bond.GetBondType()
+
+    # 添加dummy原子与侧链起始原子的键
+    emol.AddBond(dummy_idx, side_chain_start_idx, bond_type)
+
+    # 只保留dummy原子和侧链原子
+    atoms_to_keep = set([dummy_idx] + list(side_chain_atoms))
+
+    # 标记要删除的原子
+    for atom_idx in range(mol.GetNumAtoms()):
+        if atom_idx not in atoms_to_keep:
+            emol.RemoveAtom(atom_idx)
+
+    fragment = emol.GetMol()
+    Chem.SanitizeMol(fragment)
+
+    return Chem.MolToSmiles(fragment)
+```
+
+### 3.2 标记示例
+
+| 裂解位置 | 原始结构 | 标记后SMILES | 说明 |
+|---------|---------|-------------|------|
+| 5 | `...C5(CCO)...` | `[5*]CCO` | dummy同位素=5，表示位置5的侧链 |
+| 12 | `...C12(CCCO)...` | `[12*]CCCO` | dummy同位素=12，表示位置12的侧链 |
+
+### 3.3 拼接时使用标记
+
+```python
+# 后续拼接时，可以通过同位素值找到对应的裂解位置
+def find_attachment_position(fragment_smiles: str) -> int:
+    """从片段SMILES中提取裂解位置"""
+    mol = Chem.MolFromSmiles(fragment_smiles)
+    for atom in mol.GetAtoms():
+        if atom.GetAtomicNum() == 0 and atom.GetIsotope() > 0:
+            return atom.GetIsotope()  # 返回位置编号
+    return 0
+```
+
+## 4. 输出目录结构
+
+```
+validation_output/
+├── README.md                    # 目录结构说明
+├── fragments.db                 # SQLite数据库
+├── summary.csv                  # 主汇总表 (所有分子)
+├── summary_statistics.json      # 统计信息
+│
+├── ring_size_12/                # 12元环
+├── ring_size_13/                # 13元环
+├── ring_size_14/                # 14元环
+├── ring_size_15/                # 15元环
+├── ring_size_16/                # 16元环
+├── ring_size_17/                # 17元环
+├── ring_size_18/                # 18元环
+├── ring_size_19/                # 19元环
+└── ring_size_20/                # 20元环
+    │
+    ├── molecules.csv            # 该环大小的所有分子
+    │
+    ├── standard/                # 标准大环内酯
+    │   ├── numbered/            # 带编号的高亮环图
+    │   │   ├── {source_id}_numbered.png
+    │   │   └── ...
+    │   │
+    │   └── sidechains/          # 侧链片段图
+    │       └── {source_id}/
+    │           ├── {source_id}_frag_0_pos{pos}.png  # 位置信息在文件名
+    │           ├── {source_id}_frag_1_pos{pos}.png
+    │           └── ...
+    │
+    ├── non_standard/            # 非标准大环
+    │   └── original/
+    │       ├── {source_id}_original.png
+    │       └── ...
+    │
+    └── rejected/                # 被拒绝的分子
+        └── original/
+            ├── {source_id}_original.png
+            └── ...
+```
+
+## 5. CSV字段设计
+
+### 5.1 summary.csv
+
+| 字段名 | 类型 | 说明 |
+|--------|------|------|
+| `id` | int | 数据库主键 |
+| `source_id` | str | 原始IDs字段 |
+| `molecule_name` | str | 分子名称 |
+| `smiles` | str | 原始SMILES |
+| `classification` | str | standard/non_standard/not_macrolactone |
+| `ring_size` | int | 检测到的环大小 |
+| `primary_reason_code` | str | 分类原因代码 |
+| `primary_reason_message` | str | 分类原因描述 |
+| `processing_status` | str | pending/success/failed/skipped |
+| `error_message` | str | 错误信息 |
+| `num_sidechains` | int | 侧链数量 |
+| `cleavage_positions` | str | JSON数组 [5, 8, 12] |
+| `numbered_image_path` | str | 编号图相对路径 |
+| `processed_at` | datetime | 处理时间 |
+
+### 5.2 fragments.csv (每个分子的侧链详情)
+
+| 字段名 | 类型 | 说明 |
+|--------|------|------|
+| `fragment_id` | str | 唯一标识 |
+| `source_id` | str | 母分子ID |
+| `cleavage_position` | int | 环上断裂位置 |
+| `attachment_atom_idx` | int | 连接原子索引 |
+| `attachment_atom_symbol` | str | 连接原子类型 |
+| `fragment_smiles_labeled` | str | 带同位素标记的SMILES |
+| `fragment_smiles_plain` | str | 无标记SMILES |
+| `dummy_isotope` | int | 标记值=裂解位置 |
+| `atom_count` | int | 原子数 |
+| `molecular_weight` | float | 分子量 |
+| `original_bond_type` | str | 原始键类型 |
+| `image_path` | str | 片段图路径 |
+
+## 6. 验证脚本架构
+
+```python
+#!/usr/bin/env python3
+"""
+MacrolactoneDB 12-20元环验证脚本
+使用: pixi run python scripts/validate_macrolactone_db.py
+"""
+
+import pandas as pd
+from sqlmodel import Session
+from macro_lactone_toolkit import MacroLactoneAnalyzer, MacrolactoneFragmenter
+from macro_lactone_toolkit.visualization import save_numbered_molecule_png
+
+class MacrolactoneValidator:
+    def __init__(self, sample_ratio=0.1, output_dir="validation_output"):
+        self.analyzer = MacroLactoneAnalyzer()
+        self.fragmenter = MacrolactoneFragmenter()
+        self.sample_ratio = sample_ratio
+        self.output_dir = Path(output_dir)
+
+    def run(self, input_csv: str):
+        # 1. 加载数据
+        df = pd.read_csv(input_csv)
+
+        # 2. 分层抽样
+        sampled = self._stratified_sample(df)
+
+        # 3. 初始化数据库
+        init_database()
+
+        # 4. 处理每个分子
+        for _, row in sampled.iterrows():
+            self._process_molecule(row)
+
+        # 5. 生成汇总
+        self._generate_summary()
+
+    def _stratified_sample(self, df: pd.DataFrame) -> pd.DataFrame:
+        """按环大小分层抽样"""
+        # 先分类所有分子
+        df['classification'] = df['smiles'].apply(
+            lambda s: self.analyzer.classify_macrocycle(s).classification
+        )
+        df['ring_size'] = df['smiles'].apply(
+            lambda s: self.analyzer.classify_macrocycle(s).ring_size
+        )
+
+        # 按ring_size分层，每层抽10%
+        sampled = df.groupby('ring_size').apply(
+            lambda x: x.sample(frac=self.sample_ratio, random_state=42)
+        ).reset_index(drop=True)
+
+        return sampled
+
+    def _process_molecule(self, row: pd.Series):
+        """处理单个分子"""
+        source_id = row['IDs']
+        smiles = row['smiles']
+        classification = row['classification']
+        ring_size = row['ring_size']
+
+        # 保存到ParentMolecule表
+        parent = ParentMolecule(
+            source_id=source_id,
+            molecule_name=row.get('molecule_pref_name'),
+            smiles=smiles,
+            classification=classification,
+            ring_size=ring_size,
+        )
+
+        if classification != ClassificationType.STANDARD:
+            parent.processing_status = ProcessingStatus.SKIPPED
+            self._save_image(smiles, ring_size, source_id, classification)
+            return
+
+        # 处理标准大环内酯
+        try:
+            # 环编号
+            numbering = self.fragmenter.number_molecule(smiles)
+            self._save_numbering_to_db(parent.id, numbering)
+
+            # 生成编号图
+            self._save_numbered_image(smiles, ring_size, source_id)
+
+            # 侧链断裂
+            result = self.fragmenter.fragment_molecule(smiles, parent_id=source_id)
+            self._save_fragments_to_db(parent.id, result)
+            self._save_fragment_images(result, source_id)
+
+            parent.processing_status = ProcessingStatus.SUCCESS
+            parent.num_sidechains = len(result.fragments)
+            parent.cleavage_positions = json.dumps([f.cleavage_position for f in result.fragments])
+
+        except Exception as e:
+            parent.processing_status = ProcessingStatus.FAILED
+            parent.error_message = str(e)
+
+        parent.processed_at = datetime.utcnow()
+
+        with get_session() as session:
+            session.add(parent)
+            session.commit()
+```
+
+## 7. 执行命令
+
+```bash
+# 进入项目目录
+cd /Users/lingyuzeng/project/macro-lactone-sidechain-profiler/macro_split
+
+# 激活pixi环境
+pixi shell
+
+# 运行验证脚本
+python scripts/validate_macrolactone_db.py \
+  --input data/MacrolactoneDB/ring12_20/temp.csv \
+  --output validation_output \
+  --sample-ratio 0.1
+
+# 查看数据库
+sqlite3 validation_output/fragments.db ".tables"
+sqlite3 validation_output/fragments.db "SELECT * FROM parent_molecules LIMIT 5;"
+```
+
+## 8. 人工检查清单
+
+### 8.1 编号正确性检查
+- [ ] 位置1是否为内酯羰基碳（C=O）
+- [ ] 位置2是否为酯键氧（-O-）
+- [ ] 编号是否沿统一方向连续
+- [ ] 桥环/非标准环是否被正确跳过
+
+### 8.2 裂解正确性检查
+- [ ] 位置1和2是否有侧链（应该没有，是内酯本身）
+- [ ] 位置3-N的侧链是否正确识别
+- [ ] dummy原子是否正确标记裂解位置
+- [ ] 键型是否保持（单键/双键）
+
+### 8.3 分类准确性检查
+- [ ] 标准大环内酯是否被正确识别
+- [ ] 非标准大环是否被正确分类并跳过
+- [ ] 非大环内酯是否被拒绝
+
+## 9. 后续拼接接口设计
+
+```python
+def get_fragment_by_position(db_path: str, source_id: str, position: int) -> Optional[SideChainFragment]:
+    """通过位置获取片段，用于后续拼接"""
+    engine = create_engine(f"sqlite:///{db_path}")
+    with Session(engine) as session:
+        statement = select(SideChainFragment).where(
+            SideChainFragment.parent_id == source_id,
+            SideChainFragment.cleavage_position == position
+        )
+        return session.exec(statement).first()
+
+def get_all_positions_for_parent(db_path: str, source_id: str) -> List[int]:
+    """获取某分子的所有裂解位置"""
+    engine = create_engine(f"sqlite:///{db_path}")
+    with Session(engine) as session:
+        statement = select(SideChainFragment.cleavage_position).where(
+            SideChainFragment.parent_id == source_id
+        )
+        return [r[0] for r in session.exec(statement).all()]
+```
diff --git a/docs/plans/2026-03-19-macrolactone-validation-implementation-plan.md b/docs/plans/2026-03-19-macrolactone-validation-implementation-plan.md
new file mode 100644
index 0000000..bc002e5
--- /dev/null
+++ b/docs/plans/2026-03-19-macrolactone-validation-implementation-plan.md
@@ -0,0 +1,1287 @@
+# MacrolactoneDB Validation Implementation Plan
+
+> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
+
+**Goal:** Create a validation script that samples 10% of MacrolactoneDB 12-20 membered rings, classifies them, fragments side chains with isotope tagging, and stores results in SQLite with visualizations.
+
+**Architecture:** SQLModel ORM for database, RDKit for chemistry, PIL for visualization. Core is a `MacrolactoneValidator` class that orchestrates sampling, processing, and output generation.
+
+**Tech Stack:** Python 3.12, SQLModel, RDKit, Pandas, Pixi (environment)
+
+**Reference Design:** See `docs/plans/2026-03-19-macrolactone-validation-design.md` for full design details.
+
+---
+
+## Prerequisites
+
+**Worktree:** This plan should be executed in a dedicated worktree.
+
+**Design Doc:** Read `docs/plans/2026-03-19-macrolactone-validation-design.md` before starting.
+
+---
+
+## Task 1: Create Database Models (SQLModel)
+
+**Files:**
+- Create: `src/macro_lactone_toolkit/validation/__init__.py`
+- Create: `src/macro_lactone_toolkit/validation/models.py`
+
+**Context:** These models implement the schema from the design doc. `SideChainFragment` uses `dummy_isotope` field to store the cleavage position for reconstruction.
+
+**Step 1: Create directory and __init__.py**
+
+Run:
+```bash
+mkdir -p src/macro_lactone_toolkit/validation
+touch src/macro_lactone_toolkit/validation/__init__.py
+```
+
+**Step 2: Write database models**
+
+Create `src/macro_lactone_toolkit/validation/models.py`:
+
+```python
+from __future__ import annotations
+
+from datetime import datetime
+from enum import Enum
+from typing import List, Optional
+
+from sqlmodel import Field, Relationship, SQLModel
+
+
+class ClassificationType(str, Enum):
+    STANDARD = "standard_macrolactone"
+    NON_STANDARD = "non_standard_macrocycle"
+    NOT_MACROLACTONE = "not_macrolactone"
+
+
+class ProcessingStatus(str, Enum):
+    PENDING = "pending"
+    SUCCESS = "success"
+    FAILED = "failed"
+    SKIPPED = "skipped"
+
+
+class ParentMolecule(SQLModel, table=True):
+    """Original molecule information."""
+
+    __tablename__ = "parent_molecules"
+
+    id: Optional[int] = Field(default=None, primary_key=True)
+    source_id: str = Field(index=True)
+    molecule_name: Optional[str] = None
+    smiles: str = Field(index=True)
+    classification: ClassificationType = Field(index=True)
+    ring_size: Optional[int] = Field(default=None, index=True)
+    primary_reason_code: Optional[str] = None
+    primary_reason_message: Optional[str] = None
+    processing_status: ProcessingStatus = Field(default=ProcessingStatus.PENDING)
+    error_message: Optional[str] = None
+    num_sidechains: Optional[int] = None
+    cleavage_positions: Optional[str] = None
+    numbered_image_path: Optional[str] = None
+    created_at: datetime = Field(default_factory=datetime.utcnow)
+    processed_at: Optional[datetime] = None
+
+    fragments: List["SideChainFragment"] = Relationship(back_populates="parent")
+    numbering: Optional["RingNumbering"] = Relationship(back_populates="parent")
+
+
+class RingNumbering(SQLModel, table=True):
+    """Ring numbering details."""
+
+    __tablename__ = "ring_numberings"
+
+    id: Optional[int] = Field(default=None, primary_key=True)
+    parent_id: int = Field(foreign_key="parent_molecules.id", unique=True)
+    ring_size: int
+    carbonyl_carbon_idx: int
+    ester_oxygen_idx: int
+    position_to_atom: str
+    atom_to_position: str
+
+    parent: Optional[ParentMolecule] = Relationship(back_populates="numbering")
+
+
+class SideChainFragment(SQLModel, table=True):
+    """Side chain fragments from cleavage."""
+
+    __tablename__ = "side_chain_fragments"
+
+    id: Optional[int] = Field(default=None, primary_key=True)
+    parent_id: int = Field(foreign_key="parent_molecules.id", index=True)
+    fragment_id: str = Field(index=True)
+    cleavage_position: int = Field(index=True)
+    attachment_atom_idx: int
+    attachment_atom_symbol: str
+    fragment_smiles_labeled: str
+    fragment_smiles_plain: str
+    dummy_isotope: int
+    atom_count: int
+    heavy_atom_count: int
+    molecular_weight: float
+    original_bond_type: str
+    image_path: Optional[str] = None
+
+    parent: Optional[ParentMolecule] = Relationship(back_populates="fragments")
+
+
+class ValidationResult(SQLModel, table=True):
+    """Manual validation records."""
+
+    __tablename__ = "validation_results"
+
+    id: Optional[int] = Field(default=None, primary_key=True)
+    parent_id: int = Field(foreign_key="parent_molecules.id")
+    numbering_correct: Optional[bool] = None
+    cleavage_correct: Optional[bool] = None
+    classification_correct: Optional[bool] = None
+    notes: Optional[str] = None
+    validated_by: Optional[str] = None
+    validated_at: Optional[datetime] = None
+```
+
+**Step 3: Verify SQLModel imports work**
+
+Run:
+```bash
+pixi run python -c "from macro_lactone_toolkit.validation.models import ParentMolecule; print('Models OK')"
+```
+
+Expected: `Models OK`
+
+**Step 4: Commit**
+
+```bash
+git add src/macro_lactone_toolkit/validation/
+git commit -m "feat(validation): add SQLModel database models"
+```
+
+---
+
+## Task 2: Create Database Connection Module
+
+**Files:**
+- Create: `src/macro_lactone_toolkit/validation/database.py`
+
+**Context:** Provides SQLite engine, session context manager, and init function.
+
+**Step 1: Write database module**
+
+Create `src/macro_lactone_toolkit/validation/database.py`:
+
+```python
+from __future__ import annotations
+
+from contextlib import contextmanager
+from pathlib import Path
+
+from sqlmodel import Session, SQLModel, create_engine
+
+
+def get_engine(db_path: str | Path):
+    """Create SQLite engine."""
+    db_path = Path(db_path)
+    db_path.parent.mkdir(parents=True, exist_ok=True)
+    url = f"sqlite:///{db_path}"
+    return create_engine(url, echo=False)
+
+
+@contextmanager
+def get_session(engine):
+    """Context manager for database sessions."""
+    with Session(engine) as session:
+        yield session
+
+
+def init_database(engine):
+    """Create all tables."""
+    SQLModel.metadata.create_all(engine)
+```
+
+**Step 2: Test database initialization**
+
+Create test script `test_db.py`:
+```python
+from pathlib import Path
+from macro_lactone_toolkit.validation.database import get_engine, init_database, get_session
+from macro_lactone_toolkit.validation.models import ParentMolecule, ClassificationType
+
+test_db = Path("/tmp/test_fragments.db")
+if test_db.exists():
+    test_db.unlink()
+
+engine = get_engine(test_db)
+init_database(engine)
+
+with get_session(engine) as session:
+    parent = ParentMolecule(
+        source_id="TEST001",
+        smiles="O=C1CCCCCCCCCCCCCCO1",
+        classification=ClassificationType.STANDARD,
+        ring_size=16,
+    )
+    session.add(parent)
+    session.commit()
+    print(f"Inserted parent with id: {parent.id}")
+
+print("Database test passed!")
+```
+
+Run:
+```bash
+pixi run python test_db.py
+```
+
+Expected:
+```
+Inserted parent with id: 1
+Database test passed!
+```
+
+**Step 3: Cleanup and commit**
+
+Run:
+```bash
+rm test_db.py /tmp/test_fragments.db
+git add src/macro_lactone_toolkit/validation/database.py
+git commit -m "feat(validation): add database connection module"
+```
+
+---
+
+## Task 3: Create Isotope Tagging Utilities
+
+**Files:**
+- Create: `src/macro_lactone_toolkit/validation/isotope_utils.py`
+
+**Context:** Implements isotope tagging inspired by Molassembler. Uses cleavage position as isotope value.
+
+**Step 1: Write isotope utilities**
+
+Create `src/macro_lactone_toolkit/validation/isotope_utils.py`:
+
+```python
+from __future__ import annotations
+
+from rdkit import Chem
+
+
+def build_fragment_with_isotope(
+    mol: Chem.Mol,
+    side_chain_atoms: list[int],
+    side_chain_start_idx: int,
+    ring_atom_idx: int,
+    cleavage_position: int,
+) -> tuple[str, str, str]:
+    """
+    Build fragment SMILES with isotope tagging.
+
+    Returns:
+        Tuple of (labeled_smiles, plain_smiles, bond_type)
+    """
+    # Get original bond type
+    bond = mol.GetBondBetweenAtoms(ring_atom_idx, side_chain_start_idx)
+    bond_type = bond.GetBondType().name if bond else "SINGLE"
+
+    # Create editable molecule
+    emol = Chem.EditableMol(Chem.Mol(mol))
+
+    # Add dummy atom with isotope = cleavage position
+    dummy_atom = Chem.Atom(0)
+    dummy_atom.SetIsotope(cleavage_position)
+    dummy_idx = emol.AddAtom(dummy_atom)
+
+    # Add bond between dummy and side chain start
+    emol.AddBond(dummy_idx, side_chain_start_idx, bond.GetBondType())
+
+    # Remove ring atom to side chain bond (will be reconnected via dummy)
+    # Actually, we keep the side chain atoms and dummy, remove everything else
+
+    # Determine atoms to keep
+    atoms_to_keep = set([dummy_idx, side_chain_start_idx] + list(side_chain_atoms))
+
+    # Remove atoms not in keep list
+    # Need to remove in reverse order to maintain valid indices
+    all_atoms = list(range(mol.GetNumAtoms()))
+    atoms_to_remove = [i for i in all_atoms if i not in atoms_to_keep]
+
+    for atom_idx in sorted(atoms_to_remove, reverse=True):
+        emol.RemoveAtom(atom_idx)
+
+    fragment = emol.GetMol()
+    Chem.SanitizeMol(fragment)
+
+    # Get labeled SMILES (with isotope)
+    labeled_smiles = Chem.MolToSmiles(fragment)
+
+    # Get plain SMILES (without isotope)
+    plain_fragment = Chem.Mol(fragment)
+    for atom in plain_fragment.GetAtoms():
+        if atom.GetIsotope() > 0:
+            atom.SetIsotope(0)
+    plain_smiles = Chem.MolToSmiles(plain_fragment)
+
+    return labeled_smiles, plain_smiles, bond_type
+
+
+def extract_isotope_position(fragment_smiles: str) -> int:
+    """Extract cleavage position from fragment SMILES."""
+    mol = Chem.MolFromSmiles(fragment_smiles)
+    if mol is None:
+        return 0
+
+    for atom in mol.GetAtoms():
+        if atom.GetAtomicNum() == 0 and atom.GetIsotope() > 0:
+            return atom.GetIsotope()
+    return 0
+```
+
+**Step 2: Write test**
+
+Create `tests/validation/test_isotope_utils.py`:
+
+```python
+import pytest
+from rdkit import Chem
+
+from macro_lactone_toolkit.validation.isotope_utils import (
+    build_fragment_with_isotope,
+    extract_isotope_position,
+)
+
+
+def test_build_fragment_with_isotope():
+    # Create a simple test molecule: ethyl group attached to position 5
+    mol = Chem.MolFromSmiles("CCCC(CC)CCC")  # Position 4 (0-indexed) has ethyl
+    assert mol is not None
+
+    side_chain_atoms = [4, 5]  # The ethyl group atoms
+    side_chain_start = 4
+    ring_atom = 3
+    cleavage_pos = 5
+
+    labeled, plain, bond_type = build_fragment_with_isotope(
+        mol, side_chain_atoms, side_chain_start, ring_atom, cleavage_pos
+    )
+
+    assert labeled is not None
+    assert plain is not None
+    assert bond_type == "SINGLE"
+
+    # Check isotope was set
+    extracted_pos = extract_isotope_position(labeled)
+    assert extracted_pos == cleavage_pos
+
+    # Plain should have no isotope
+    extracted_plain = extract_isotope_position(plain)
+    assert extracted_plain == 0
+```
+
+**Step 3: Run test**
+
+Run:
+```bash
+pixi run pytest tests/validation/test_isotope_utils.py -v
+```
+
+Expected: `test_build_fragment_with_isotope PASSED`
+
+**Step 4: Commit**
+
+```bash
+git add src/macro_lactone_toolkit/validation/isotope_utils.py tests/validation/test_isotope_utils.py
+git commit -m "feat(validation): add isotope tagging utilities"
+```
+
+---
+
+## Task 4: Create Stratified Sampling Module
+
+**Files:**
+- Create: `src/macro_lactone_toolkit/validation/sampling.py`
+
+**Context:** Implements 10% stratified sampling by ring size (12-20).
+
+**Step 1: Write sampling module**
+
+Create `src/macro_lactone_toolkit/validation/sampling.py`:
+
+```python
+from __future__ import annotations
+
+import pandas as pd
+
+from macro_lactone_toolkit import MacroLactoneAnalyzer
+
+
+def stratified_sample_by_ring_size(
+    df: pd.DataFrame,
+    sample_ratio: float,
+    smiles_col: str = "smiles",
+    random_state: int = 42,
+) -> pd.DataFrame:
+    """
+    Perform stratified sampling by ring size.
+
+    First classifies all molecules, then samples 10% from each ring size layer.
+    """
+    analyzer = MacroLactoneAnalyzer()
+
+    # Classify all molecules
+    classifications = []
+    ring_sizes = []
+
+    for smiles in df[smiles_col]:
+        result = analyzer.classify_macrocycle(smiles)
+        classifications.append(result.classification)
+        ring_sizes.append(result.ring_size)
+
+    df = df.copy()
+    df["_classification"] = classifications
+    df["_ring_size"] = ring_sizes
+
+    # Group by ring size and sample from each group
+    sampled_groups = []
+
+    for ring_size in range(12, 21):
+        group = df[df["_ring_size"] == ring_size]
+        if len(group) > 0:
+            n_samples = max(1, int(len(group) * sample_ratio))
+            sampled = group.sample(n=min(n_samples, len(group)), random_state=random_state)
+            sampled_groups.append(sampled)
+
+    # Also sample from unknown ring size (None)
+    unknown_group = df[df["_ring_size"].isna()]
+    if len(unknown_group) > 0:
+        n_samples = max(1, int(len(unknown_group) * sample_ratio))
+        sampled = unknown_group.sample(n=min(n_samples, len(unknown_group)), random_state=random_state)
+        sampled_groups.append(sampled)
+
+    if not sampled_groups:
+        return pd.DataFrame()
+
+    result = pd.concat(sampled_groups, ignore_index=True)
+    return result
+```
+
+**Step 2: Create test**
+
+Create `tests/validation/test_sampling.py`:
+
+```python
+import pandas as pd
+import pytest
+
+from macro_lactone_toolkit.validation.sampling import stratified_sample_by_ring_size
+
+
+def test_stratified_sample():
+    # Create test data with known ring sizes
+    data = {
+        "smiles": [
+            "O=C1CCCCCCCCCCCCCCO1",  # 16-membered
+            "O=C1CCCCCCCCCCCCO1",     # 14-membered
+            "O=C1CCCCCCCCCCCCCCCCO1", # 18-membered
+        ],
+        "id": ["A", "B", "C"],
+    }
+    df = pd.DataFrame(data)
+
+    sampled = stratified_sample_by_ring_size(df, sample_ratio=0.5, random_state=42)
+
+    # Should get at least 1 from each ring size (50% of 1 = 1)
+    assert len(sampled) >= 1
+    assert len(sampled) <= 3
+```
+
+**Step 3: Run test**
+
+Run:
+```bash
+pixi run pytest tests/validation/test_sampling.py -v
+```
+
+Expected: `test_stratified_sample PASSED`
+
+**Step 4: Commit**
+
+```bash
+git add src/macro_lactone_toolkit/validation/sampling.py tests/validation/test_sampling.py
+git commit -m "feat(validation): add stratified sampling by ring size"
+```
+
+---
+
+## Task 5: Create Visualization Output Module
+
+**Files:**
+- Create: `src/macro_lactone_toolkit/validation/visualization_output.py`
+
+**Context:** Handles saving numbered molecule images and fragment images to organized directory structure.
+
+**Step 1: Write visualization output module**
+
+Create `src/macro_lactone_toolkit/validation/visualization_output.py`:
+
+```python
+from __future__ import annotations
+
+from pathlib import Path
+
+from rdkit import Chem
+
+from macro_lactone_toolkit.visualization import save_numbered_molecule_png, save_fragment_png
+
+
+def get_output_paths(output_dir: Path, source_id: str, ring_size: int, classification: str) -> dict:
+    """Get organized output paths for a molecule."""
+    ring_dir = output_dir / f"ring_size_{ring_size}"
+
+    if classification == "standard_macrolactone":
+        base_dir = ring_dir / "standard"
+        numbered_dir = base_dir / "numbered"
+        sidechains_dir = base_dir / "sidechains" / source_id
+    elif classification == "non_standard_macrocycle":
+        base_dir = ring_dir / "non_standard" / "original"
+        numbered_dir = base_dir
+        sidechains_dir = None
+    else:
+        base_dir = ring_dir / "rejected" / "original"
+        numbered_dir = base_dir
+        sidechains_dir = None
+
+    numbered_dir.mkdir(parents=True, exist_ok=True)
+    if sidechains_dir:
+        sidechains_dir.mkdir(parents=True, exist_ok=True)
+
+    return {
+        "numbered_image": numbered_dir / f"{source_id}_numbered.png",
+        "sidechains_dir": sidechains_dir,
+    }
+
+
+def save_numbered_molecule(
+    smiles: str,
+    output_path: Path,
+    ring_size: int | None = None,
+    size: tuple[int, int] = (800, 800),
+) -> Path | None:
+    """Save numbered molecule image."""
+    try:
+        return save_numbered_molecule_png(
+            smiles,
+            output_path,
+            ring_size=ring_size,
+            size=size,
+        )
+    except Exception as e:
+        print(f"Failed to save numbered image: {e}")
+        return None
+
+
+def save_fragment_images(
+    fragments: list,
+    output_dir: Path,
+    source_id: str,
+    size: tuple[int, int] = (400, 400),
+) -> list[str]:
+    """Save fragment images and return paths."""
+    paths = []
+
+    for i, fragment in enumerate(fragments):
+        try:
+            output_path = output_dir / f"{source_id}_frag_{i}_pos{fragment.cleavage_position}.png"
+            save_fragment_png(fragment.fragment_smiles_plain, output_path, size=size)
+            paths.append(str(output_path.relative_to(output_dir.parent.parent)))
+        except Exception as e:
+            print(f"Failed to save fragment {i}: {e}")
+            paths.append(None)
+
+    return paths
+```
+
+**Step 2: Commit**
+
+```bash
+git add src/macro_lactone_toolkit/validation/visualization_output.py
+git commit -m "feat(validation): add visualization output module"
+```
+
+---
+
+## Task 6: Create Main Validator Class
+
+**Files:**
+- Create: `src/macro_lactone_toolkit/validation/validator.py`
+
+**Context:** Core orchestrator that processes molecules, stores results in database, and generates visualizations.
+
+**Step 1: Write validator class**
+
+Create `src/macro_lactone_toolkit/validation/validator.py`:
+
+```python
+from __future__ import annotations
+
+import json
+from datetime import datetime
+from pathlib import Path
+
+import pandas as pd
+from rdkit import Chem
+from rdkit.Chem import Descriptors
+
+from macro_lactone_toolkit import MacroLactoneAnalyzer, MacrolactoneFragmenter
+from macro_lactone_toolkit.validation.database import get_engine, get_session, init_database
+from macro_lactone_toolkit.validation.isotope_utils import build_fragment_with_isotope
+from macro_lactone_toolkit.validation.models import (
+    ClassificationType,
+    ParentMolecule,
+    ProcessingStatus,
+    RingNumbering,
+    SideChainFragment,
+)
+from macro_lactone_toolkit.validation.sampling import stratified_sample_by_ring_size
+from macro_lactone_toolkit.validation.visualization_output import (
+    get_output_paths,
+    save_fragment_images,
+    save_numbered_molecule,
+)
+
+
+class MacrolactoneValidator:
+    """Validates macrolactone database with sampling and fragmentation."""
+
+    def __init__(
+        self,
+        output_dir: str | Path,
+        sample_ratio: float = 0.1,
+        smiles_col: str = "smiles",
+        id_col: str = "IDs",
+    ):
+        self.output_dir = Path(output_dir)
+        self.sample_ratio = sample_ratio
+        self.smiles_col = smiles_col
+        self.id_col = id_col
+
+        self.analyzer = MacroLactoneAnalyzer()
+        self.fragmenter = MacrolactoneFragmenter()
+
+        # Initialize database
+        self.db_path = self.output_dir / "fragments.db"
+        self.engine = get_engine(self.db_path)
+        init_database(self.engine)
+
+    def run(self, input_csv: str | Path) -> dict:
+        """Run validation on input CSV."""
+        # Load data
+        df = pd.read_csv(input_csv)
+        print(f"Loaded {len(df)} molecules from {input_csv}")
+
+        # Stratified sampling
+        print(f"Performing stratified sampling (ratio={self.sample_ratio})...")
+        sampled = stratified_sample_by_ring_size(df, self.sample_ratio, self.smiles_col)
+        print(f"Sampled {len(sampled)} molecules")
+
+        # Process each molecule
+        results = {"total": len(sampled), "success": 0, "failed": 0, "skipped": 0}
+
+        for idx, row in sampled.iterrows():
+            status = self._process_molecule(row)
+            results[status] += 1
+            if (idx + 1) % 100 == 0:
+                print(f"Processed {idx + 1}/{len(sampled)} molecules")
+
+        # Generate summary
+        self._generate_summary()
+
+        return results
+
+    def _process_molecule(self, row: pd.Series) -> str:
+        """Process a single molecule. Returns status."""
+        source_id = str(row[self.id_col])
+        smiles = row[self.smiles_col]
+        name = row.get("molecule_pref_name", None)
+
+        # Classify
+        classification_result = self.analyzer.classify_macrocycle(smiles)
+        classification = ClassificationType(classification_result.classification)
+        ring_size = classification_result.ring_size
+
+        # Create parent record
+        parent = ParentMolecule(
+            source_id=source_id,
+            molecule_name=name,
+            smiles=smiles,
+            classification=classification,
+            ring_size=ring_size,
+            primary_reason_code=classification_result.primary_reason_code,
+            primary_reason_message=classification_result.primary_reason_message,
+        )
+
+        with get_session(self.engine) as session:
+            session.add(parent)
+            session.commit()
+            session.refresh(parent)
+
+            # Skip non-standard molecules
+            if classification != ClassificationType.STANDARD:
+                parent.processing_status = ProcessingStatus.SKIPPED
+                session.add(parent)
+                session.commit()
+                self._save_original_image(smiles, source_id, ring_size, classification)
+                return "skipped"
+
+            # Process standard macrolactone
+            try:
+                self._process_standard_macrolactone(session, parent, smiles)
+                return "success"
+            except Exception as e:
+                parent.processing_status = ProcessingStatus.FAILED
+                parent.error_message = str(e)
+                parent.processed_at = datetime.utcnow()
+                session.add(parent)
+                session.commit()
+                return "failed"
+
+    def _process_standard_macrolactone(self, session, parent: ParentMolecule, smiles: str):
+        """Process a standard macrolactone."""
+        # Get numbering
+        numbering = self.fragmenter.number_molecule(smiles)
+
+        # Save numbering to database
+        numbering_record = RingNumbering(
+            parent_id=parent.id,
+            ring_size=numbering.ring_size,
+            carbonyl_carbon_idx=numbering.carbonyl_carbon_idx,
+            ester_oxygen_idx=numbering.ester_oxygen_idx,
+            position_to_atom=json.dumps(numbering.position_to_atom),
+            atom_to_position=json.dumps(numbering.atom_to_position),
+        )
+        session.add(numbering_record)
+
+        # Save numbered image
+        paths = get_output_paths(
+            self.output_dir, parent.source_id, parent.ring_size, "standard_macrolactone"
+        )
+        image_path = save_numbered_molecule(smiles, paths["numbered_image"], parent.ring_size)
+        if image_path:
+            parent.numbered_image_path = str(image_path.relative_to(self.output_dir))
+
+        # Fragment side chains
+        mol = Chem.MolFromSmiles(smiles)
+        ring_atom_set = set(numbering.ring_atoms)
+        fragments = []
+        fragment_idx = 0
+
+        from macro_lactone_toolkit._core import collect_side_chain_atoms, is_intrinsic_lactone_neighbor
+
+        for position, ring_atom_idx in numbering.position_to_atom.items():
+            ring_atom = mol.GetAtomWithIdx(ring_atom_idx)
+
+            for neighbor in ring_atom.GetNeighbors():
+                neighbor_idx = neighbor.GetIdx()
+
+                # Skip ring atoms and intrinsic lactone neighbors
+                if neighbor_idx in ring_atom_set:
+                    continue
+                if is_intrinsic_lactone_neighbor(mol, numbering, ring_atom_idx, neighbor_idx):
+                    continue
+
+                # Collect side chain atoms
+                side_chain_atoms = collect_side_chain_atoms(mol, neighbor_idx, ring_atom_set)
+                if not side_chain_atoms:
+                    continue
+
+                # Build fragment with isotope tagging
+                labeled_smiles, plain_smiles, bond_type = build_fragment_with_isotope(
+                    mol, side_chain_atoms, neighbor_idx, ring_atom_idx, position
+                )
+
+                # Calculate properties
+                plain_mol = Chem.MolFromSmiles(plain_smiles)
+                if plain_mol is None:
+                    continue
+
+                atom_count = sum(1 for a in plain_mol.GetAtoms() if a.GetAtomicNum() != 0)
+                heavy_atom_count = sum(1 for a in plain_mol.GetAtoms() if a.GetAtomicNum() not in [0, 1])
+                mw = Descriptors.MolWt(plain_mol)
+
+                # Create fragment record
+                fragment = SideChainFragment(
+                    parent_id=parent.id,
+                    fragment_id=f"{parent.source_id}_frag_{fragment_idx}",
+                    cleavage_position=position,
+                    attachment_atom_idx=ring_atom_idx,
+                    attachment_atom_symbol=ring_atom.GetSymbol(),
+                    fragment_smiles_labeled=labeled_smiles,
+                    fragment_smiles_plain=plain_smiles,
+                    dummy_isotope=position,
+                    atom_count=atom_count,
+                    heavy_atom_count=heavy_atom_count,
+                    molecular_weight=round(mw, 4),
+                    original_bond_type=bond_type,
+                )
+                session.add(fragment)
+                fragments.append(fragment)
+                fragment_idx += 1
+
+        # Save fragment images
+        if fragments and paths["sidechains_dir"]:
+            image_paths = save_fragment_images(fragments, paths["sidechains_dir"], parent.source_id)
+            for frag, img_path in zip(fragments, image_paths):
+                frag.image_path = img_path
+                session.add(frag)
+
+        # Update parent record
+        parent.processing_status = ProcessingStatus.SUCCESS
+        parent.num_sidechains = len(fragments)
+        parent.cleavage_positions = json.dumps([f.cleavage_position for f in fragments])
+        parent.processed_at = datetime.utcnow()
+        session.add(parent)
+        session.commit()
+
+    def _save_original_image(self, smiles: str, source_id: str, ring_size: int, classification: ClassificationType):
+        """Save original image for non-standard molecules."""
+        paths = get_output_paths(self.output_dir, source_id, ring_size, classification.value)
+        try:
+            from rdkit.Chem import Draw
+            mol = Chem.MolFromSmiles(smiles)
+            if mol:
+                Draw.MolToFile(mol, str(paths["numbered_image"]), size=(400, 400))
+        except Exception:
+            pass
+
+    def _generate_summary(self):
+        """Generate summary CSV and statistics."""
+        with get_session(self.engine) as session:
+            # Query all parents
+            from sqlmodel import select
+            statement = select(ParentMolecule)
+            parents = session.exec(statement).all()
+
+            # Convert to DataFrame
+            data = []
+            for p in parents:
+                data.append({
+                    "id": p.id,
+                    "source_id": p.source_id,
+                    "molecule_name": p.molecule_name,
+                    "smiles": p.smiles,
+                    "classification": p.classification.value,
+                    "ring_size": p.ring_size,
+                    "primary_reason_code": p.primary_reason_code,
+                    "primary_reason_message": p.primary_reason_message,
+                    "processing_status": p.processing_status.value,
+                    "error_message": p.error_message,
+                    "num_sidechains": p.num_sidechains,
+                    "cleavage_positions": p.cleavage_positions,
+                    "numbered_image_path": p.numbered_image_path,
+                    "processed_at": p.processed_at,
+                })
+
+            df = pd.DataFrame(data)
+            df.to_csv(self.output_dir / "summary.csv", index=False)
+
+            # Generate statistics
+            stats = {
+                "total_molecules": len(parents),
+                "by_classification": df["classification"].value_counts().to_dict(),
+                "by_ring_size": df[df["ring_size"].notna()]["ring_size"].value_counts().to_dict(),
+                "by_status": df["processing_status"].value_counts().to_dict(),
+            }
+
+            with open(self.output_dir / "summary_statistics.json", "w") as f:
+                json.dump(stats, f, indent=2, default=str)
+
+            print(f"\nSummary saved to {self.output_dir / 'summary.csv'}")
+            print(f"Statistics: {stats}")
+```
+
+**Step 2: Commit**
+
+```bash
+git add src/macro_lactone_toolkit/validation/validator.py
+git commit -m "feat(validation): add main validator class"
+```
+
+---
+
+## Task 7: Create CLI Script
+
+**Files:**
+- Create: `scripts/validate_macrolactone_db.py`
+
+**Context:** Entry point script that uses pixi environment.
+
+**Step 1: Write CLI script**
+
+Create `scripts/validate_macrolactone_db.py`:
+
+```python
+#!/usr/bin/env python3
+"""
+Validate MacrolactoneDB 12-20 membered rings.
+
+Usage:
+    pixi run python scripts/validate_macrolactone_db.py \
+        --input data/MacrolactoneDB/ring12_20/temp.csv \
+        --output validation_output \
+        --sample-ratio 0.1
+"""
+
+import argparse
+import sys
+from pathlib import Path
+
+# Add src to path
+sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
+
+from macro_lactone_toolkit.validation.validator import MacrolactoneValidator
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Validate MacrolactoneDB 12-20 membered rings"
+    )
+    parser.add_argument(
+        "--input",
+        type=str,
+        default="data/MacrolactoneDB/ring12_20/temp.csv",
+        help="Input CSV file path",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="validation_output",
+        help="Output directory",
+    )
+    parser.add_argument(
+        "--sample-ratio",
+        type=float,
+        default=0.1,
+        help="Sampling ratio (0.0-1.0)",
+    )
+    parser.add_argument(
+        "--smiles-col",
+        type=str,
+        default="smiles",
+        help="SMILES column name",
+    )
+    parser.add_argument(
+        "--id-col",
+        type=str,
+        default="IDs",
+        help="ID column name",
+    )
+
+    args = parser.parse_args()
+
+    print("=" * 60)
+    print("MacrolactoneDB Validation")
+    print("=" * 60)
+    print(f"Input: {args.input}")
+    print(f"Output: {args.output}")
+    print(f"Sample ratio: {args.sample_ratio}")
+    print("=" * 60)
+
+    validator = MacrolactoneValidator(
+        output_dir=args.output,
+        sample_ratio=args.sample_ratio,
+        smiles_col=args.smiles_col,
+        id_col=args.id_col,
+    )
+
+    results = validator.run(args.input)
+
+    print("\n" + "=" * 60)
+    print("Validation Complete")
+    print("=" * 60)
+    print(f"Total processed: {results['total']}")
+    print(f"Success: {results['success']}")
+    print(f"Failed: {results['failed']}")
+    print(f"Skipped: {results['skipped']}")
+    print("=" * 60)
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
+```
+
+Make executable:
+```bash
+chmod +x scripts/validate_macrolactone_db.py
+```
+
+**Step 2: Test help message**
+
+Run:
+```bash
+pixi run python scripts/validate_macrolactone_db.py --help
+```
+
+Expected: Shows help message with all arguments.
+
+**Step 3: Commit**
+
+```bash
+git add scripts/validate_macrolactone_db.py
+git commit -m "feat(validation): add CLI entry point script"
+```
+
+---
+
+## Task 8: Create Output Directory README
+
+**Files:**
+- Create: Template for `validation_output/README.md` (generated by validator)
+
+**Context:** README explaining the output directory structure.
+
+**Step 1: Add README generation to validator**
+
+Add method to `validator.py` before `_generate_summary`:
+
+```python
+    def _generate_readme(self):
+        """Generate README explaining output structure."""
+        readme_content = """# MacrolactoneDB Validation Output
+
+This directory contains validation results for MacrolactoneDB 12-20 membered rings.
+
+## Directory Structure
+
+```
+validation_output/
+├── README.md                    # This file
+├── fragments.db                 # SQLite database with all data
+├── summary.csv                  # Summary of all processed molecules
+├── summary_statistics.json      # Statistical summary
+│
+├── ring_size_12/                # 12-membered rings
+├── ring_size_13/                # 13-membered rings
+...
+└── ring_size_20/                # 20-membered rings
+    ├── molecules.csv            # Molecules in this ring size
+    ├── standard/                # Standard macrolactones
+    │   ├── numbered/            # Numbered ring images
+    │   │   └── {id}_numbered.png
+    │   └── sidechains/          # Fragment images
+    │       └── {id}/
+    │           └── {id}_frag_{n}_pos{pos}.png
+    ├── non_standard/            # Non-standard macrocycles
+    │   └── original/
+    │       └── {id}_original.png
+    └── rejected/                # Not macrolactones
+        └── original/
+            └── {id}_original.png
+```
+
+## Database Schema
+
+### Tables
+
+- **parent_molecules**: Original molecule information
+- **ring_numberings**: Ring atom numbering details
+- **side_chain_fragments**: Fragmentation results with isotope tags
+- **validation_results**: Manual validation records
+
+### Key Fields
+
+- `classification`: standard_macrolactone | non_standard_macrocycle | not_macrolactone
+- `dummy_isotope`: Cleavage position stored as isotope value for reconstruction
+- `cleavage_position`: Position on ring where side chain was attached
+
+## Ring Numbering Convention
+
+1. Position 1 = Lactone carbonyl carbon (C=O)
+2. Position 2 = Ester oxygen (-O-)
+3. Positions 3-N = Sequential around ring
+
+## Isotope Tagging
+
+Fragments use isotope values to mark cleavage position:
+- `[5*]CCO` = Fragment from position 5, dummy atom has isotope=5
+- This enables precise reconstruction during reassembly
+
+## CSV Columns
+
+### summary.csv
+
+- `source_id`: Original molecule ID from MacrolactoneDB
+- `classification`: Classification result
+- `ring_size`: Detected ring size (12-20)
+- `num_sidechains`: Number of side chains detected
+- `cleavage_positions`: JSON array of cleavage positions
+- `processing_status`: pending | success | failed | skipped
+
+## Querying the Database
+
+```bash
+# List tables
+sqlite3 fragments.db ".tables"
+
+# Get standard macrolactones with fragments
+sqlite3 fragments.db "SELECT * FROM parent_molecules WHERE classification='standard_macrolactone' LIMIT 5;"
+
+# Get fragments for a specific molecule
+sqlite3 fragments.db "SELECT * FROM side_chain_fragments WHERE parent_id=1;"
+
+# Count by ring size
+sqlite3 fragments.db "SELECT ring_size, COUNT(*) FROM parent_molecules GROUP BY ring_size;"
+```
+"""
+        readme_path = self.output_dir / "README.md"
+        readme_path.write_text(readme_content)
+```
+
+Add call in `run` method before `return results`:
+```python
+self._generate_readme()
+self._generate_summary()
+```
+
+**Step 2: Commit**
+
+```bash
+git add src/macro_lactone_toolkit/validation/validator.py
+git commit -m "feat(validation): add README generation for output directory"
+```
+
+---
+
+## Task 9: Update Package __init__.py
+
+**Files:**
+- Modify: `src/macro_lactone_toolkit/__init__.py`
+
+**Context:** Export validation module.
+
+**Step 1: Add validation exports**
+
+Modify `src/macro_lactone_toolkit/__init__.py` to add:
+
+```python
+# Validation module (optional import)
+try:
+    from .validation.validator import MacrolactoneValidator
+    from .validation.models import ParentMolecule, SideChainFragment
+except ImportError:
+    pass  # SQLModel not installed
+```
+
+**Step 2: Commit**
+
+```bash
+git add src/macro_lactone_toolkit/__init__.py
+git commit -m "feat(validation): export validation module"
+```
+
+---
+
+## Task 10: Run Integration Test
+
+**Files:**
+- Test with: Small sample of actual data
+
+**Context:** Run the validator on a small subset to verify everything works.
+
+**Step 1: Create test with small sample**
+
+Run:
+```bash
+# Create small test sample
+head -20 data/MacrolactoneDB/ring12_20/temp.csv > /tmp/test_sample.csv
+
+# Run validation
+pixi run python scripts/validate_macrolactone_db.py \
+    --input /tmp/test_sample.csv \
+    --output /tmp/test_validation_output \
+    --sample-ratio 1.0
+```
+
+Expected output shows processing and summary.
+
+**Step 2: Verify outputs**
+
+Run:
+```bash
+ls -la /tmp/test_validation_output/
+cat /tmp/test_validation_output/summary_statistics.json
+sqlite3 /tmp/test_validation_output/fragments.db "SELECT COUNT(*) FROM parent_molecules;"
+```
+
+Expected: Directory exists, has database, summary CSV, and ring size subdirectories.
+
+**Step 3: Cleanup**
+
+```bash
+rm -rf /tmp/test_validation_output /tmp/test_sample.csv
+```
+
+---
+
+## Task 11: Final Commit and Summary
+
+**Step 1: Final review and commit**
+
+```bash
+git status
+git log --oneline -10
+```
+
+**Step 2: Push to branch (if using worktree)**
+
+```bash
+git push origin HEAD
+```
+
+---
+
+## Execution Commands Reference
+
+### Full validation run
+
+```bash
+cd /Users/lingyuzeng/project/macro-lactone-sidechain-profiler/macro_split
+pixi run python scripts/validate_macrolactone_db.py \
+    --input data/MacrolactoneDB/ring12_20/temp.csv \
+    --output validation_output \
+    --sample-ratio 0.1
+```
+
+### Query results
+
+```bash
+# Summary statistics
+cat validation_output/summary_statistics.json
+
+# Database queries
+sqlite3 validation_output/fragments.db "SELECT * FROM parent_molecules LIMIT 5;"
+sqlite3 validation_output/fragments.db "SELECT * FROM side_chain_fragments WHERE cleavage_position > 2 LIMIT 5;"
+```
+
+### Check specific ring size
+
+```bash
+ls validation_output/ring_size_16/standard/numbered/
+ls validation_output/ring_size_16/standard/sidechains/
+```
+
+---
+
+## Verification Checklist
+
+- [ ] All SQLModel models created and importable
+- [ ] Database initializes without errors
+- [ ] Isotope tagging preserves cleavage position
+- [ ] Stratified sampling produces even distribution
+- [ ] Visualization outputs created in correct structure
+- [ ] Summary CSV contains all expected columns
+- [ ] README generated with accurate documentation
+- [ ] CLI script runs with --help
+- [ ] Integration test passes on small sample
diff --git a/src/macro_lactone_toolkit/validation/database.py b/src/macro_lactone_toolkit/validation/database.py
new file mode 100644
index 0000000..5855ecb
--- /dev/null
+++ b/src/macro_lactone_toolkit/validation/database.py
@@ -0,0 +1,26 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from pathlib import Path
+
+from sqlmodel import Session, SQLModel, create_engine
+
+
+def get_engine(db_path: str | Path):
+    """Create SQLite engine."""
+    db_path = Path(db_path)
+    db_path.parent.mkdir(parents=True, exist_ok=True)
+    url = f"sqlite:///{db_path}"
+    return create_engine(url, echo=False)
+
+
+@contextmanager
+def get_session(engine):
+    """Context manager for database sessions."""
+    with Session(engine) as session:
+        yield session
+
+
+def init_database(engine):
+    """Create all tables."""
+    SQLModel.metadata.create_all(engine)
diff --git a/src/macro_lactone_toolkit/validation/models.py b/src/macro_lactone_toolkit/validation/models.py
index f43e352..796e1c0 100644
--- a/src/macro_lactone_toolkit/validation/models.py
+++ b/src/macro_lactone_toolkit/validation/models.py
@@ -1,25 +1,26 @@
 from __future__ import annotations
 
 from datetime import datetime
-from enum import Enum
 from typing import List, Optional
 
-from sqlmodel import Field, Relationship, SQLModel
+from sqlalchemy.orm import Mapped, mapped_column, relationship
+from sqlmodel import Field, SQLModel
 
 
-class ClassificationType(str, Enum):
+class ClassificationType:
     STANDARD = "standard_macrolactone"
     NON_STANDARD = "non_standard_macrocycle"
     NOT_MACROLACTONE = "not_macrolactone"
 
 
-class ProcessingStatus(str, Enum):
+class ProcessingStatus:
     PENDING = "pending"
     SUCCESS = "success"
     FAILED = "failed"
     SKIPPED = "skipped"
 
 
+# Define all tables without relationships first
 class ParentMolecule(SQLModel, table=True):
     """Original molecule information."""
 
@@ -29,11 +30,11 @@ class ParentMolecule(SQLModel, table=True):
     source_id: str = Field(index=True)
     molecule_name: Optional[str] = None
     smiles: str = Field(index=True)
-    classification: ClassificationType = Field(index=True)
+    classification: str = Field(index=True)
     ring_size: Optional[int] = Field(default=None, index=True)
     primary_reason_code: Optional[str] = None
     primary_reason_message: Optional[str] = None
-    processing_status: ProcessingStatus = Field(default=ProcessingStatus.PENDING)
+    processing_status: str = Field(default=ProcessingStatus.PENDING)
     error_message: Optional[str] = None
     num_sidechains: Optional[int] = None
     cleavage_positions: Optional[str] = None
@@ -41,9 +42,6 @@ class ParentMolecule(SQLModel, table=True):
     created_at: datetime = Field(default_factory=datetime.utcnow)
     processed_at: Optional[datetime] = None
 
-    fragments: List["SideChainFragment"] = Relationship(back_populates="parent")
-    numbering: Optional["RingNumbering"] = Relationship(back_populates="parent")
-
 
 class RingNumbering(SQLModel, table=True):
     """Ring numbering details."""
@@ -58,8 +56,6 @@ class RingNumbering(SQLModel, table=True):
     position_to_atom: str
     atom_to_position: str
 
-    parent: Optional[ParentMolecule] = Relationship(back_populates="numbering")
-
 
 class SideChainFragment(SQLModel, table=True):
     """Side chain fragments from cleavage."""
@@ -81,8 +77,6 @@ class SideChainFragment(SQLModel, table=True):
     original_bond_type: str
     image_path: Optional[str] = None
 
-    parent: Optional[ParentMolecule] = Relationship(back_populates="fragments")
-
 
 class ValidationResult(SQLModel, table=True):
     """Manual validation records."""