From 46a438dd36619cc2097304ee67c52ced1701ed34 Mon Sep 17 00:00:00 2001 From: lingyuzeng Date: Thu, 19 Mar 2026 14:20:32 +0800 Subject: [PATCH] feat(validation): enforce single-anchor fragments - skip fused/shared/multi-anchor side systems during extraction - add fragment library schema and fragment_library.csv export - make scaffold prep strict for non-spliceable positions --- README.md | 13 ++- src/macro_lactone_toolkit/_core.py | 38 +++++++ src/macro_lactone_toolkit/fragmenter.py | 13 ++- .../splicing/scaffold_prep.py | 24 ++++- .../validation/models.py | 26 ++++- .../validation/validator.py | 90 ++++++++++++++-- tests/helpers.py | 102 ++++++++++++++++++ tests/test_fragmentation.py | 28 ++++- tests/test_splicing_engine.py | 13 ++- tests/validation/test_validator.py | 57 ++++++++++ 10 files changed, 383 insertions(+), 21 deletions(-) create mode 100644 tests/validation/test_validator.py diff --git a/README.md b/README.md index 2a34c04..64ca723 100644 --- a/README.md +++ b/README.md @@ -96,6 +96,7 @@ pixi run python scripts/validate_macrolactone_db.py \ validation_output/ ├── README.md # 目录说明 ├── fragments.db # SQLite 数据库 +├── fragment_library.csv # 最终片段库导出(含 has_dummy_atom / splice_ready) ├── summary.csv # 汇总表(含 ml_id, chembl_id) ├── summary_statistics.json # 统计信息 ├── ring_size_12/ # 按环大小组织 @@ -124,11 +125,15 @@ sqlite3 validation_output/fragments.db \ FROM parent_molecules \ WHERE classification='standard_macrolactone' LIMIT 5;" +# 查询最终片段库 +sqlite3 validation_output/fragments.db \ + "SELECT source_type, source_parent_ml_id, cleavage_position, has_dummy_atom, splice_ready \ + FROM fragment_library_entries LIMIT 10;" + # 查询片段 sqlite3 validation_output/fragments.db \ - "SELECT fragment_id, cleavage_position, dummy_isotope \ - FROM side_chain_fragments \ - WHERE ml_id='ML00000001';" + "SELECT fragment_id, cleavage_position, dummy_isotope, has_dummy_atom, dummy_atom_count \ + FROM side_chain_fragments LIMIT 10;" # 按环大小统计 sqlite3 validation_output/fragments.db \ @@ -144,6 +149,8 @@ sqlite3 validation_output/fragments.db \ | `classification` | standard_macrolactone / non_standard_macrocycle / not_macrolactone | | `dummy_isotope` | 裂解位置编号,用于片段重建 | | `cleavage_position` | 环上的断裂位置 | +| `has_dummy_atom` | 该片段是否带 dummy 原子,可用于区分可直接拼接片段 | +| `splice_ready` | 是否与当前单锚点拼接流程直接兼容 | ## Legacy Scripts diff --git a/src/macro_lactone_toolkit/_core.py b/src/macro_lactone_toolkit/_core.py index b6b10a2..7320f20 100644 --- a/src/macro_lactone_toolkit/_core.py +++ b/src/macro_lactone_toolkit/_core.py @@ -272,6 +272,44 @@ def collect_side_chain_atoms( return side_chain_atoms +def find_side_chain_ring_connections( + mol: Chem.Mol, + side_chain_atoms: Iterable[int], + ring_atom_indices: Iterable[int], +) -> list[tuple[int, int]]: + ring_atom_set = set(ring_atom_indices) + connections: set[tuple[int, int]] = set() + + for atom_idx in side_chain_atoms: + atom = mol.GetAtomWithIdx(atom_idx) + for neighbor in atom.GetNeighbors(): + neighbor_idx = neighbor.GetIdx() + if neighbor_idx in ring_atom_set: + connections.add((atom_idx, neighbor_idx)) + + return sorted(connections, key=lambda connection: (connection[1], connection[0])) + + +def collect_fragmentable_side_chain_atoms( + mol: Chem.Mol, + start_atom_idx: int, + ring_atom_indices: Iterable[int], + ring_atom_idx: int | None = None, +) -> list[int] | None: + side_chain_atoms = collect_side_chain_atoms(mol, start_atom_idx, ring_atom_indices) + if not side_chain_atoms: + return None + + ring_connections = find_side_chain_ring_connections(mol, side_chain_atoms, ring_atom_indices) + if len(ring_connections) != 1: + return None + + if ring_atom_idx is not None and ring_connections[0][1] != ring_atom_idx: + return None + + return side_chain_atoms + + def is_intrinsic_lactone_neighbor( mol: Chem.Mol, candidate: DetectedMacrolactone, diff --git a/src/macro_lactone_toolkit/fragmenter.py b/src/macro_lactone_toolkit/fragmenter.py index b8b348a..de08d58 100644 --- a/src/macro_lactone_toolkit/fragmenter.py +++ b/src/macro_lactone_toolkit/fragmenter.py @@ -6,7 +6,7 @@ from rdkit.Chem import Descriptors from ._core import ( build_fragment_smiles, build_numbering_result, - collect_side_chain_atoms, + collect_fragmentable_side_chain_atoms, ensure_mol, find_macrolactone_candidates, is_intrinsic_lactone_neighbor, @@ -44,6 +44,8 @@ class MacrolactoneFragmenter: fragments: list[SideChainFragment] = [] for position, ring_atom_idx in numbering.position_to_atom.items(): + if int(position) <= 2: + continue ring_atom = mol.GetAtomWithIdx(ring_atom_idx) for neighbor in ring_atom.GetNeighbors(): neighbor_idx = neighbor.GetIdx() @@ -52,8 +54,13 @@ class MacrolactoneFragmenter: if is_intrinsic_lactone_neighbor(mol, candidate, ring_atom_idx, neighbor_idx): continue - side_chain_atoms = collect_side_chain_atoms(mol, neighbor_idx, ring_atom_set) - if not side_chain_atoms: + side_chain_atoms = collect_fragmentable_side_chain_atoms( + mol=mol, + start_atom_idx=neighbor_idx, + ring_atom_indices=ring_atom_set, + ring_atom_idx=ring_atom_idx, + ) + if side_chain_atoms is None: continue try: diff --git a/src/macro_lactone_toolkit/splicing/scaffold_prep.py b/src/macro_lactone_toolkit/splicing/scaffold_prep.py index 7f4d30d..2367fcd 100644 --- a/src/macro_lactone_toolkit/splicing/scaffold_prep.py +++ b/src/macro_lactone_toolkit/splicing/scaffold_prep.py @@ -4,7 +4,7 @@ from typing import Iterable from rdkit import Chem -from .._core import collect_side_chain_atoms, ensure_mol, find_macrolactone_candidates, is_intrinsic_lactone_neighbor +from .._core import collect_fragmentable_side_chain_atoms, ensure_mol, find_macrolactone_candidates, is_intrinsic_lactone_neighbor from ..fragmenter import MacrolactoneFragmenter @@ -26,22 +26,36 @@ def prepare_macrolactone_scaffold( for position in positions: if position not in numbering.position_to_atom: raise ValueError(f"Position {position} not found in ring numbering.") + if position <= 2: + raise ValueError(f"Position {position} does not contain a single-anchor fragmentable side chain") ring_atom_idx = numbering.position_to_atom[position] ring_atom = mol.GetAtomWithIdx(ring_atom_idx) + position_dummy_specs: list[tuple[int, int, Chem.BondType]] = [] + for neighbor in ring_atom.GetNeighbors(): neighbor_idx = neighbor.GetIdx() if neighbor_idx in ring_atom_set: continue if is_intrinsic_lactone_neighbor(mol, candidate, ring_atom_idx, neighbor_idx): continue - side_chain_atoms = collect_side_chain_atoms(mol, neighbor_idx, ring_atom_set) + side_chain_atoms = collect_fragmentable_side_chain_atoms( + mol=mol, + start_atom_idx=neighbor_idx, + ring_atom_indices=ring_atom_set, + ring_atom_idx=ring_atom_idx, + ) + if side_chain_atoms is None: + continue atoms_to_remove.update(side_chain_atoms) bond = mol.GetBondBetweenAtoms(ring_atom_idx, neighbor_idx) if bond is not None: - dummy_specs.append((ring_atom_idx, position, bond.GetBondType())) + position_dummy_specs.append((ring_atom_idx, position, bond.GetBondType())) - if not any(spec_position == position for _, spec_position, _ in dummy_specs): - dummy_specs.append((ring_atom_idx, position, Chem.BondType.SINGLE)) + if not position_dummy_specs: + raise ValueError(f"Position {position} does not contain a single-anchor fragmentable side chain") + if len(position_dummy_specs) > 1: + raise ValueError(f"Position {position} contains multiple fragmentable side chains") + dummy_specs.extend(position_dummy_specs) rwmol = Chem.RWMol(mol) for ring_atom_idx, position, bond_type in dummy_specs: diff --git a/src/macro_lactone_toolkit/validation/models.py b/src/macro_lactone_toolkit/validation/models.py index cba96bc..0328b85 100644 --- a/src/macro_lactone_toolkit/validation/models.py +++ b/src/macro_lactone_toolkit/validation/models.py @@ -1,6 +1,6 @@ from __future__ import annotations -from datetime import datetime +from datetime import UTC, datetime from typing import List, Optional from sqlalchemy.orm import Mapped, mapped_column, relationship @@ -40,7 +40,7 @@ class ParentMolecule(SQLModel, table=True): num_sidechains: Optional[int] = None cleavage_positions: Optional[str] = None numbered_image_path: Optional[str] = None - created_at: datetime = Field(default_factory=datetime.utcnow) + created_at: datetime = Field(default_factory=lambda: datetime.now(UTC)) processed_at: Optional[datetime] = None @@ -72,6 +72,8 @@ class SideChainFragment(SQLModel, table=True): fragment_smiles_labeled: str fragment_smiles_plain: str dummy_isotope: int + has_dummy_atom: bool = Field(default=True) + dummy_atom_count: int = Field(default=1) atom_count: int heavy_atom_count: int molecular_weight: float @@ -79,6 +81,26 @@ class SideChainFragment(SQLModel, table=True): image_path: Optional[str] = None +class FragmentLibraryEntry(SQLModel, table=True): + """Unified fragment library entries.""" + + __tablename__ = "fragment_library_entries" + + id: Optional[int] = Field(default=None, primary_key=True) + source_type: str = Field(index=True) + source_fragment_id: Optional[str] = Field(default=None, index=True) + source_parent_ml_id: Optional[str] = Field(default=None, index=True) + source_parent_chembl_id: Optional[str] = Field(default=None, index=True) + cleavage_position: Optional[int] = Field(default=None, index=True) + fragment_smiles_labeled: Optional[str] = None + fragment_smiles_plain: str + has_dummy_atom: bool = Field(default=False) + dummy_atom_count: int = Field(default=0) + splice_ready: bool = Field(default=False, index=True) + original_bond_type: Optional[str] = None + created_at: datetime = Field(default_factory=lambda: datetime.now(UTC)) + + class ValidationResult(SQLModel, table=True): """Manual validation records.""" diff --git a/src/macro_lactone_toolkit/validation/validator.py b/src/macro_lactone_toolkit/validation/validator.py index 94088a0..be956ed 100644 --- a/src/macro_lactone_toolkit/validation/validator.py +++ b/src/macro_lactone_toolkit/validation/validator.py @@ -1,7 +1,7 @@ from __future__ import annotations import json -from datetime import datetime +from datetime import UTC, datetime from pathlib import Path import pandas as pd @@ -12,7 +12,7 @@ from sqlmodel import select from macro_lactone_toolkit import MacroLactoneAnalyzer from macro_lactone_toolkit._core import ( build_numbering_result, - collect_side_chain_atoms, + collect_fragmentable_side_chain_atoms, find_macrolactone_candidates, is_intrinsic_lactone_neighbor, ) @@ -20,6 +20,7 @@ from macro_lactone_toolkit.validation.database import get_engine, get_session, i from macro_lactone_toolkit.validation.isotope_utils import build_fragment_with_isotope from macro_lactone_toolkit.validation.models import ( ClassificationType, + FragmentLibraryEntry, ParentMolecule, ProcessingStatus, RingNumbering, @@ -80,6 +81,7 @@ class MacrolactoneValidator: # Generate outputs self._generate_readme() self._generate_summary() + self._generate_fragment_library() return results @@ -138,7 +140,7 @@ class MacrolactoneValidator: except Exception as e: parent.processing_status = ProcessingStatus.FAILED parent.error_message = str(e) - parent.processed_at = datetime.utcnow() + parent.processed_at = datetime.now(UTC) session.add(parent) session.commit() return "failed" @@ -188,6 +190,8 @@ class MacrolactoneValidator: fragment_idx = 0 for position, ring_atom_idx in numbering.position_to_atom.items(): + if int(position) <= 2: + continue ring_atom = mol.GetAtomWithIdx(ring_atom_idx) for neighbor in ring_atom.GetNeighbors(): @@ -200,8 +204,13 @@ class MacrolactoneValidator: continue # Collect side chain atoms - side_chain_atoms = collect_side_chain_atoms(mol, neighbor_idx, ring_atom_set) - if not side_chain_atoms: + side_chain_atoms = collect_fragmentable_side_chain_atoms( + mol=mol, + start_atom_idx=neighbor_idx, + ring_atom_indices=ring_atom_set, + ring_atom_idx=ring_atom_idx, + ) + if side_chain_atoms is None: continue # Build fragment with isotope tagging @@ -228,6 +237,8 @@ class MacrolactoneValidator: fragment_smiles_labeled=labeled_smiles, fragment_smiles_plain=plain_smiles, dummy_isotope=int(position), + has_dummy_atom=True, + dummy_atom_count=1, atom_count=atom_count, heavy_atom_count=heavy_atom_count, molecular_weight=round(mw, 4), @@ -235,6 +246,21 @@ class MacrolactoneValidator: ) session.add(fragment) fragments.append(fragment) + session.add( + FragmentLibraryEntry( + source_type="validation_extract", + source_fragment_id=fragment.fragment_id, + source_parent_ml_id=parent.ml_id, + source_parent_chembl_id=parent.chembl_id, + cleavage_position=int(position), + fragment_smiles_labeled=labeled_smiles, + fragment_smiles_plain=plain_smiles, + has_dummy_atom=True, + dummy_atom_count=1, + splice_ready=True, + original_bond_type=bond_type, + ) + ) fragment_idx += 1 # Save fragment images @@ -248,7 +274,7 @@ class MacrolactoneValidator: parent.processing_status = ProcessingStatus.SUCCESS parent.num_sidechains = len(fragments) parent.cleavage_positions = json.dumps([f.cleavage_position for f in fragments]) - parent.processed_at = datetime.utcnow() + parent.processed_at = datetime.now(UTC) session.add(parent) session.commit() @@ -276,6 +302,7 @@ This directory contains validation results for MacrolactoneDB 12-20 membered rin validation_output/ ├── README.md # This file ├── fragments.db # SQLite database with all data +├── fragment_library.csv # Unified fragment library export ├── summary.csv # Summary of all processed molecules ├── summary_statistics.json # Statistical summary │ @@ -305,6 +332,7 @@ validation_output/ - **parent_molecules**: Original molecule information - **ring_numberings**: Ring atom numbering details - **side_chain_fragments**: Fragmentation results with isotope tags +- **fragment_library_entries**: Unified fragment library rows for downstream design - **validation_results**: Manual validation records ### Key Fields @@ -312,6 +340,8 @@ validation_output/ - `classification`: standard_macrolactone | non_standard_macrocycle | not_macrolactone - `dummy_isotope`: Cleavage position stored as isotope value for reconstruction - `cleavage_position`: Position on ring where side chain was attached +- `has_dummy_atom`: Whether the fragment contains a dummy atom for splicing +- `dummy_atom_count`: Number of dummy atoms in the fragment ## Ring Numbering Convention @@ -337,6 +367,13 @@ Fragments use isotope values to mark cleavage position: - `cleavage_positions`: JSON array of cleavage positions - `processing_status`: pending | success | failed | skipped +### fragment_library.csv + +- `source_type`: validation_extract | supplemental (reserved) +- `has_dummy_atom`: Whether the fragment contains a dummy atom +- `dummy_atom_count`: Number of dummy atoms +- `splice_ready`: Whether the fragment is directly compatible with single-anchor splicing + ## Querying the Database ```bash @@ -401,6 +438,47 @@ sqlite3 fragments.db "SELECT ring_size, COUNT(*) FROM parent_molecules GROUP BY print(f"\nSummary saved to {self.output_dir / 'summary.csv'}") print(f"Statistics: {stats}") + def _generate_fragment_library(self): + """Generate unified fragment library CSV.""" + columns = [ + "id", + "source_type", + "source_fragment_id", + "source_parent_ml_id", + "source_parent_chembl_id", + "cleavage_position", + "fragment_smiles_labeled", + "fragment_smiles_plain", + "has_dummy_atom", + "dummy_atom_count", + "splice_ready", + "original_bond_type", + "created_at", + ] + + with get_session(self.engine) as session: + entries = session.exec(select(FragmentLibraryEntry)).all() + data = [ + { + "id": entry.id, + "source_type": entry.source_type, + "source_fragment_id": entry.source_fragment_id, + "source_parent_ml_id": entry.source_parent_ml_id, + "source_parent_chembl_id": entry.source_parent_chembl_id, + "cleavage_position": entry.cleavage_position, + "fragment_smiles_labeled": entry.fragment_smiles_labeled, + "fragment_smiles_plain": entry.fragment_smiles_plain, + "has_dummy_atom": entry.has_dummy_atom, + "dummy_atom_count": entry.dummy_atom_count, + "splice_ready": entry.splice_ready, + "original_bond_type": entry.original_bond_type, + "created_at": entry.created_at, + } + for entry in entries + ] + + pd.DataFrame(data, columns=columns).to_csv(self.output_dir / "fragment_library.csv", index=False) + class MacrolactoneDetectionError(Exception): """Raised when macrolactone detection fails.""" diff --git a/tests/helpers.py b/tests/helpers.py index 2391d2e..599698d 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -78,6 +78,108 @@ def build_non_standard_ring_atom_macrolactone( ) +def build_macrolactone_with_fused_side_ring( + ring_size: int = 16, + fused_positions: tuple[int, int] = (5, 6), + side_chains: Mapping[int, str] | None = None, +) -> BuiltMacrolactone: + base = build_macrolactone(ring_size=ring_size, side_chains=side_chains) + position_a, position_b = fused_positions + rwmol = Chem.RWMol(Chem.Mol(base.mol)) + + atom_x = rwmol.AddAtom(Chem.Atom("C")) + atom_y = rwmol.AddAtom(Chem.Atom("C")) + rwmol.AddBond(base.position_to_atom[position_a], atom_x, Chem.BondType.SINGLE) + rwmol.AddBond(atom_x, atom_y, Chem.BondType.SINGLE) + rwmol.AddBond(atom_y, base.position_to_atom[position_b], Chem.BondType.SINGLE) + + mol = rwmol.GetMol() + Chem.SanitizeMol(mol) + return BuiltMacrolactone( + mol=mol, + smiles=Chem.MolToSmiles(mol, isomericSmiles=True), + position_to_atom=base.position_to_atom, + ) + + +def build_macrolactone_with_bridge_side_chain( + ring_size: int = 16, + bridge_positions: tuple[int, int] = (5, 8), + side_chains: Mapping[int, str] | None = None, +) -> BuiltMacrolactone: + base = build_macrolactone(ring_size=ring_size, side_chains=side_chains) + position_a, position_b = bridge_positions + rwmol = Chem.RWMol(Chem.Mol(base.mol)) + + atom_x = rwmol.AddAtom(Chem.Atom("C")) + atom_y = rwmol.AddAtom(Chem.Atom("C")) + rwmol.AddBond(base.position_to_atom[position_a], atom_x, Chem.BondType.SINGLE) + rwmol.AddBond(atom_x, atom_y, Chem.BondType.SINGLE) + rwmol.AddBond(atom_y, base.position_to_atom[position_b], Chem.BondType.SINGLE) + + mol = rwmol.GetMol() + Chem.SanitizeMol(mol) + return BuiltMacrolactone( + mol=mol, + smiles=Chem.MolToSmiles(mol, isomericSmiles=True), + position_to_atom=base.position_to_atom, + ) + + +def build_macrolactone_with_shared_atom_side_ring( + ring_size: int = 16, + position: int = 5, + side_chains: Mapping[int, str] | None = None, +) -> BuiltMacrolactone: + base = build_macrolactone(ring_size=ring_size, side_chains=side_chains) + rwmol = Chem.RWMol(Chem.Mol(base.mol)) + + atom_x = rwmol.AddAtom(Chem.Atom("C")) + atom_y = rwmol.AddAtom(Chem.Atom("C")) + atom_z = rwmol.AddAtom(Chem.Atom("C")) + ring_atom_idx = base.position_to_atom[position] + + rwmol.AddBond(ring_atom_idx, atom_x, Chem.BondType.SINGLE) + rwmol.AddBond(atom_x, atom_y, Chem.BondType.SINGLE) + rwmol.AddBond(atom_y, atom_z, Chem.BondType.SINGLE) + rwmol.AddBond(atom_z, ring_atom_idx, Chem.BondType.SINGLE) + + mol = rwmol.GetMol() + Chem.SanitizeMol(mol) + return BuiltMacrolactone( + mol=mol, + smiles=Chem.MolToSmiles(mol, isomericSmiles=True), + position_to_atom=base.position_to_atom, + ) + + +def build_macrolactone_with_single_anchor_side_ring( + ring_size: int = 16, + position: int = 5, + side_chains: Mapping[int, str] | None = None, +) -> BuiltMacrolactone: + base = build_macrolactone(ring_size=ring_size, side_chains=side_chains) + rwmol = Chem.RWMol(Chem.Mol(base.mol)) + + atom_x = rwmol.AddAtom(Chem.Atom("C")) + atom_y = rwmol.AddAtom(Chem.Atom("C")) + atom_z = rwmol.AddAtom(Chem.Atom("C")) + ring_atom_idx = base.position_to_atom[position] + + rwmol.AddBond(ring_atom_idx, atom_x, Chem.BondType.SINGLE) + rwmol.AddBond(atom_x, atom_y, Chem.BondType.SINGLE) + rwmol.AddBond(atom_y, atom_z, Chem.BondType.SINGLE) + rwmol.AddBond(atom_z, atom_x, Chem.BondType.SINGLE) + + mol = rwmol.GetMol() + Chem.SanitizeMol(mol) + return BuiltMacrolactone( + mol=mol, + smiles=Chem.MolToSmiles(mol, isomericSmiles=True), + position_to_atom=base.position_to_atom, + ) + + def build_overlapping_candidate_macrolactone() -> BuiltMacrolactone: rwmol = Chem.RWMol() diff --git a/tests/test_fragmentation.py b/tests/test_fragmentation.py index 02a1a58..aee2d04 100644 --- a/tests/test_fragmentation.py +++ b/tests/test_fragmentation.py @@ -2,7 +2,12 @@ from rdkit import Chem from macro_lactone_toolkit import MacrolactoneFragmenter -from .helpers import build_macrolactone +from .helpers import ( + build_macrolactone, + build_macrolactone_with_fused_side_ring, + build_macrolactone_with_shared_atom_side_ring, + build_macrolactone_with_single_anchor_side_ring, +) def test_fragmentation_returns_empty_list_without_sidechains(): @@ -51,3 +56,24 @@ def test_fragmentation_preserves_attachment_bond_type(): neighbor = dummy_atom.GetNeighbors()[0] bond = mol.GetBondBetweenAtoms(dummy_atom.GetIdx(), neighbor.GetIdx()) assert bond.GetBondType() == Chem.BondType.DOUBLE + + +def test_fragmentation_skips_fused_side_ring_but_keeps_single_anchor_sidechains(): + built = build_macrolactone_with_fused_side_ring(side_chains={10: "methyl"}) + result = MacrolactoneFragmenter().fragment_molecule(built.smiles, parent_id="fused") + + assert {fragment.cleavage_position for fragment in result.fragments} == {10} + + +def test_fragmentation_skips_shared_atom_multi_anchor_component(): + built = build_macrolactone_with_shared_atom_side_ring(side_chains={11: "ethyl"}) + result = MacrolactoneFragmenter().fragment_molecule(built.smiles, parent_id="shared_atom") + + assert {fragment.cleavage_position for fragment in result.fragments} == {11} + + +def test_fragmentation_allows_single_anchor_side_ring(): + built = build_macrolactone_with_single_anchor_side_ring() + result = MacrolactoneFragmenter().fragment_molecule(built.smiles, parent_id="single_anchor_ring") + + assert {fragment.cleavage_position for fragment in result.fragments} == {5} diff --git a/tests/test_splicing_engine.py b/tests/test_splicing_engine.py index 09ee986..569fd2f 100644 --- a/tests/test_splicing_engine.py +++ b/tests/test_splicing_engine.py @@ -5,7 +5,7 @@ from macro_lactone_toolkit import MacrolactoneFragmenter from macro_lactone_toolkit.splicing.engine import splice_molecule from macro_lactone_toolkit.splicing.scaffold_prep import prepare_macrolactone_scaffold -from .helpers import build_macrolactone, canonicalize +from .helpers import build_macrolactone, build_macrolactone_with_fused_side_ring, canonicalize def test_splice_benzene_methyl(): @@ -49,3 +49,14 @@ def test_prepare_scaffold_and_reassemble_fragment(): product = splice_molecule(scaffold, Chem.MolFromSmiles(fragment.fragment_smiles_labeled), position=5) assert canonicalize(product) == canonicalize(built.mol) + + +def test_prepare_scaffold_rejects_position_without_single_anchor_fragment(): + built = build_macrolactone_with_fused_side_ring(side_chains={10: "methyl"}) + + with pytest.raises(ValueError, match="Position 5 does not contain a single-anchor fragmentable side chain"): + prepare_macrolactone_scaffold( + built.smiles, + positions=[5], + ring_size=16, + ) diff --git a/tests/validation/test_validator.py b/tests/validation/test_validator.py new file mode 100644 index 0000000..87633fd --- /dev/null +++ b/tests/validation/test_validator.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +import json +import sqlite3 + +import pandas as pd + +from macro_lactone_toolkit.validation.validator import MacrolactoneValidator + +from ..helpers import build_macrolactone_with_fused_side_ring + + +def test_validator_exports_only_single_anchor_fragments_and_fragment_library(tmp_path): + built = build_macrolactone_with_fused_side_ring(side_chains={10: "methyl"}) + input_path = tmp_path / "input.csv" + output_dir = tmp_path / "validation_output" + + pd.DataFrame( + [ + { + "ml_id": "ML00000001", + "IDs": "CHEMBL0001", + "smiles": built.smiles, + } + ] + ).to_csv(input_path, index=False) + + validator = MacrolactoneValidator(output_dir=output_dir, sample_ratio=1.0) + results = validator.run(input_path) + + assert results == {"total": 1, "success": 1, "failed": 0, "skipped": 0} + + with sqlite3.connect(output_dir / "fragments.db") as connection: + fragments = connection.execute( + "SELECT cleavage_position, has_dummy_atom, dummy_atom_count FROM side_chain_fragments" + ).fetchall() + library_entries = connection.execute( + """ + SELECT source_type, source_parent_ml_id, source_parent_chembl_id, + cleavage_position, has_dummy_atom, dummy_atom_count, splice_ready + FROM fragment_library_entries + """ + ).fetchall() + + assert fragments == [(10, 1, 1)] + assert library_entries == [("validation_extract", "ML00000001", "CHEMBL0001", 10, 1, 1, 1)] + + summary = pd.read_csv(output_dir / "summary.csv") + assert summary.loc[0, "num_sidechains"] == 1 + assert json.loads(summary.loc[0, "cleavage_positions"]) == [10] + + fragment_library = pd.read_csv(output_dir / "fragment_library.csv") + assert fragment_library.loc[0, "source_type"] == "validation_extract" + assert int(fragment_library.loc[0, "cleavage_position"]) == 10 + assert bool(fragment_library.loc[0, "has_dummy_atom"]) is True + assert int(fragment_library.loc[0, "dummy_atom_count"]) == 1 + assert bool(fragment_library.loc[0, "splice_ready"]) is True