feat(validation): enforce single-anchor fragments
- skip fused/shared/multi-anchor side systems during extraction
- add fragment library schema and fragment_library.csv export
- make scaffold prep strict for non-spliceable positions
This commit is contained in:
102
tests/helpers.py
102
tests/helpers.py
@@ -78,6 +78,108 @@ def build_non_standard_ring_atom_macrolactone(
|
||||
)
|
||||
|
||||
|
||||
def build_macrolactone_with_fused_side_ring(
    ring_size: int = 16,
    fused_positions: tuple[int, int] = (5, 6),
    side_chains: Mapping[int, str] | None = None,
) -> BuiltMacrolactone:
    """Build a macrolactone with a two-carbon chain joining two positions.

    A base molecule is produced by ``build_macrolactone``; two new carbon
    atoms are then bonded in sequence between the atoms mapped to the two
    entries of ``fused_positions``, which closes an extra ring through
    both of those atoms.

    Args:
        ring_size: Forwarded unchanged to ``build_macrolactone``.
        fused_positions: Pair of positions (keys of the base
            ``position_to_atom`` mapping) that the new chain connects.
        side_chains: Forwarded unchanged to ``build_macrolactone``.

    Returns:
        A ``BuiltMacrolactone`` holding the sanitized molecule, its isomeric
        SMILES, and the base molecule's ``position_to_atom`` mapping.
    """
    scaffold = build_macrolactone(ring_size=ring_size, side_chains=side_chains)
    first_position, second_position = fused_positions

    # Work on a copy so the molecule owned by ``scaffold`` is not modified.
    editable = Chem.RWMol(Chem.Mol(scaffold.mol))
    new_carbons = [editable.AddAtom(Chem.Atom("C")) for _ in range(2)]

    # Atom path: first anchor -> new carbon -> new carbon -> second anchor.
    path = [
        scaffold.position_to_atom[first_position],
        *new_carbons,
        scaffold.position_to_atom[second_position],
    ]
    for begin_idx, end_idx in zip(path, path[1:]):
        editable.AddBond(begin_idx, end_idx, Chem.BondType.SINGLE)

    fused_mol = editable.GetMol()
    Chem.SanitizeMol(fused_mol)
    canonical_smiles = Chem.MolToSmiles(fused_mol, isomericSmiles=True)
    return BuiltMacrolactone(
        mol=fused_mol,
        smiles=canonical_smiles,
        position_to_atom=scaffold.position_to_atom,
    )
|
||||
|
||||
|
||||
def build_macrolactone_with_bridge_side_chain(
    ring_size: int = 16,
    bridge_positions: tuple[int, int] = (5, 8),
    side_chains: Mapping[int, str] | None = None,
) -> BuiltMacrolactone:
    """Build a macrolactone with a two-carbon bridge between two positions.

    Starting from the molecule returned by ``build_macrolactone``, two new
    carbon atoms are added and single-bonded in a row so that they connect
    the atoms mapped to the two entries of ``bridge_positions``.

    Args:
        ring_size: Forwarded unchanged to ``build_macrolactone``.
        bridge_positions: Pair of positions (keys of the base
            ``position_to_atom`` mapping) spanned by the bridge.
        side_chains: Forwarded unchanged to ``build_macrolactone``.

    Returns:
        A ``BuiltMacrolactone`` holding the sanitized molecule, its isomeric
        SMILES, and the base molecule's ``position_to_atom`` mapping.
    """
    scaffold = build_macrolactone(ring_size=ring_size, side_chains=side_chains)
    start_position, end_position = bridge_positions

    # Work on a copy so the molecule owned by ``scaffold`` is not modified.
    editable = Chem.RWMol(Chem.Mol(scaffold.mol))
    bridge_head = editable.AddAtom(Chem.Atom("C"))
    bridge_tail = editable.AddAtom(Chem.Atom("C"))

    single = Chem.BondType.SINGLE
    bond_plan = (
        (scaffold.position_to_atom[start_position], bridge_head),
        (bridge_head, bridge_tail),
        (bridge_tail, scaffold.position_to_atom[end_position]),
    )
    for begin_idx, end_idx in bond_plan:
        editable.AddBond(begin_idx, end_idx, single)

    bridged_mol = editable.GetMol()
    Chem.SanitizeMol(bridged_mol)
    return BuiltMacrolactone(
        mol=bridged_mol,
        smiles=Chem.MolToSmiles(bridged_mol, isomericSmiles=True),
        position_to_atom=scaffold.position_to_atom,
    )
|
||||
|
||||
|
||||
def build_macrolactone_with_shared_atom_side_ring(
    ring_size: int = 16,
    position: int = 5,
    side_chains: Mapping[int, str] | None = None,
) -> BuiltMacrolactone:
    """Build a macrolactone with a side ring that shares one atom with it.

    Three new carbon atoms are chained together and both ends of the chain
    are bonded to the single atom mapped to ``position``, so that atom is a
    member of the newly closed ring as well.

    Args:
        ring_size: Forwarded unchanged to ``build_macrolactone``.
        position: Key into the base ``position_to_atom`` mapping selecting
            the atom that both ends of the new chain attach to.
        side_chains: Forwarded unchanged to ``build_macrolactone``.

    Returns:
        A ``BuiltMacrolactone`` holding the sanitized molecule, its isomeric
        SMILES, and the base molecule's ``position_to_atom`` mapping.
    """
    scaffold = build_macrolactone(ring_size=ring_size, side_chains=side_chains)

    # Work on a copy so the molecule owned by ``scaffold`` is not modified.
    editable = Chem.RWMol(Chem.Mol(scaffold.mol))
    new_carbons = [editable.AddAtom(Chem.Atom("C")) for _ in range(3)]
    shared_idx = scaffold.position_to_atom[position]

    # Closed walk: shared atom -> three new carbons -> back to shared atom.
    cycle = [shared_idx, *new_carbons, shared_idx]
    for begin_idx, end_idx in zip(cycle, cycle[1:]):
        editable.AddBond(begin_idx, end_idx, Chem.BondType.SINGLE)

    ring_mol = editable.GetMol()
    Chem.SanitizeMol(ring_mol)
    canonical_smiles = Chem.MolToSmiles(ring_mol, isomericSmiles=True)
    return BuiltMacrolactone(
        mol=ring_mol,
        smiles=canonical_smiles,
        position_to_atom=scaffold.position_to_atom,
    )
|
||||
|
||||
|
||||
def build_macrolactone_with_single_anchor_side_ring(
    ring_size: int = 16,
    position: int = 5,
    side_chains: Mapping[int, str] | None = None,
) -> BuiltMacrolactone:
    """Build a macrolactone carrying a three-carbon ring on a single bond.

    Three new carbon atoms are bonded into a closed three-membered ring, and
    only the first of them is bonded to the atom mapped to ``position``; the
    new ring therefore touches the base molecule through exactly one bond.

    Args:
        ring_size: Forwarded unchanged to ``build_macrolactone``.
        position: Key into the base ``position_to_atom`` mapping selecting
            the atom the new ring is attached to.
        side_chains: Forwarded unchanged to ``build_macrolactone``.

    Returns:
        A ``BuiltMacrolactone`` holding the sanitized molecule, its isomeric
        SMILES, and the base molecule's ``position_to_atom`` mapping.
    """
    scaffold = build_macrolactone(ring_size=ring_size, side_chains=side_chains)

    # Work on a copy so the molecule owned by ``scaffold`` is not modified.
    editable = Chem.RWMol(Chem.Mol(scaffold.mol))
    first_c = editable.AddAtom(Chem.Atom("C"))
    second_c = editable.AddAtom(Chem.Atom("C"))
    third_c = editable.AddAtom(Chem.Atom("C"))
    anchor_idx = scaffold.position_to_atom[position]

    single = Chem.BondType.SINGLE
    bond_plan = (
        (anchor_idx, first_c),   # the only link to the base molecule
        (first_c, second_c),
        (second_c, third_c),
        (third_c, first_c),      # closes the three-membered side ring
    )
    for begin_idx, end_idx in bond_plan:
        editable.AddBond(begin_idx, end_idx, single)

    ring_mol = editable.GetMol()
    Chem.SanitizeMol(ring_mol)
    return BuiltMacrolactone(
        mol=ring_mol,
        smiles=Chem.MolToSmiles(ring_mol, isomericSmiles=True),
        position_to_atom=scaffold.position_to_atom,
    )
|
||||
|
||||
|
||||
def build_overlapping_candidate_macrolactone() -> BuiltMacrolactone:
|
||||
rwmol = Chem.RWMol()
|
||||
|
||||
|
||||
@@ -2,7 +2,12 @@ from rdkit import Chem
|
||||
|
||||
from macro_lactone_toolkit import MacrolactoneFragmenter
|
||||
|
||||
from .helpers import build_macrolactone
|
||||
from .helpers import (
|
||||
build_macrolactone,
|
||||
build_macrolactone_with_fused_side_ring,
|
||||
build_macrolactone_with_shared_atom_side_ring,
|
||||
build_macrolactone_with_single_anchor_side_ring,
|
||||
)
|
||||
|
||||
|
||||
def test_fragmentation_returns_empty_list_without_sidechains():
|
||||
@@ -51,3 +56,24 @@ def test_fragmentation_preserves_attachment_bond_type():
|
||||
neighbor = dummy_atom.GetNeighbors()[0]
|
||||
bond = mol.GetBondBetweenAtoms(dummy_atom.GetIdx(), neighbor.GetIdx())
|
||||
assert bond.GetBondType() == Chem.BondType.DOUBLE
|
||||
|
||||
|
||||
def test_fragmentation_skips_fused_side_ring_but_keeps_single_anchor_sidechains():
    """Only the methyl at position 10 is reported for the fused-ring molecule."""
    molecule = build_macrolactone_with_fused_side_ring(side_chains={10: "methyl"})
    fragmenter = MacrolactoneFragmenter()
    outcome = fragmenter.fragment_molecule(molecule.smiles, parent_id="fused")

    # Compare as a set: only which positions were cleaved matters here.
    observed_positions = {piece.cleavage_position for piece in outcome.fragments}
    assert observed_positions == {10}
|
||||
|
||||
|
||||
def test_fragmentation_skips_shared_atom_multi_anchor_component():
    """Only the ethyl at position 11 is reported for the shared-atom molecule."""
    molecule = build_macrolactone_with_shared_atom_side_ring(side_chains={11: "ethyl"})
    fragmenter = MacrolactoneFragmenter()
    outcome = fragmenter.fragment_molecule(molecule.smiles, parent_id="shared_atom")

    # Compare as a set: only which positions were cleaved matters here.
    observed_positions = {piece.cleavage_position for piece in outcome.fragments}
    assert observed_positions == {11}
|
||||
|
||||
|
||||
def test_fragmentation_allows_single_anchor_side_ring():
    """A side ring attached at the default position 5 is reported as a fragment."""
    molecule = build_macrolactone_with_single_anchor_side_ring()
    fragmenter = MacrolactoneFragmenter()
    outcome = fragmenter.fragment_molecule(molecule.smiles, parent_id="single_anchor_ring")

    # Compare as a set: only which positions were cleaved matters here.
    observed_positions = {piece.cleavage_position for piece in outcome.fragments}
    assert observed_positions == {5}
|
||||
|
||||
@@ -5,7 +5,7 @@ from macro_lactone_toolkit import MacrolactoneFragmenter
|
||||
from macro_lactone_toolkit.splicing.engine import splice_molecule
|
||||
from macro_lactone_toolkit.splicing.scaffold_prep import prepare_macrolactone_scaffold
|
||||
|
||||
from .helpers import build_macrolactone, canonicalize
|
||||
from .helpers import build_macrolactone, build_macrolactone_with_fused_side_ring, canonicalize
|
||||
|
||||
|
||||
def test_splice_benzene_methyl():
|
||||
@@ -49,3 +49,14 @@ def test_prepare_scaffold_and_reassemble_fragment():
|
||||
product = splice_molecule(scaffold, Chem.MolFromSmiles(fragment.fragment_smiles_labeled), position=5)
|
||||
|
||||
assert canonicalize(product) == canonicalize(built.mol)
|
||||
|
||||
|
||||
def test_prepare_scaffold_rejects_position_without_single_anchor_fragment():
    """Scaffold preparation raises ``ValueError`` for position 5 of the fused-ring molecule."""
    molecule = build_macrolactone_with_fused_side_ring(side_chains={10: "methyl"})
    expected_message = "Position 5 does not contain a single-anchor fragmentable side chain"

    with pytest.raises(ValueError, match=expected_message):
        prepare_macrolactone_scaffold(molecule.smiles, positions=[5], ring_size=16)
|
||||
|
||||
57
tests/validation/test_validator.py
Normal file
57
tests/validation/test_validator.py
Normal file
@@ -0,0 +1,57 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from macro_lactone_toolkit.validation.validator import MacrolactoneValidator
|
||||
|
||||
from ..helpers import build_macrolactone_with_fused_side_ring
|
||||
|
||||
|
||||
def test_validator_exports_only_single_anchor_fragments_and_fragment_library(tmp_path):
    """Run the validator on one fused-ring molecule and check every export.

    The input molecule has a fused side ring plus a methyl at position 10.
    The test asserts that the run summary, both database tables,
    ``summary.csv`` and ``fragment_library.csv`` contain exactly the
    position-10 fragment.
    """
    # Local import keeps this block self-contained.
    from contextlib import closing

    built = build_macrolactone_with_fused_side_ring(side_chains={10: "methyl"})
    input_path = tmp_path / "input.csv"
    output_dir = tmp_path / "validation_output"

    pd.DataFrame(
        [
            {
                "ml_id": "ML00000001",
                "IDs": "CHEMBL0001",
                "smiles": built.smiles,
            }
        ]
    ).to_csv(input_path, index=False)

    validator = MacrolactoneValidator(output_dir=output_dir, sample_ratio=1.0)
    results = validator.run(input_path)

    assert results == {"total": 1, "success": 1, "failed": 0, "skipped": 0}

    # ``with sqlite3.connect(...)`` only commits or rolls back the transaction;
    # it never closes the connection. ``closing`` releases the database handle
    # deterministically once the queries are done.
    with closing(sqlite3.connect(output_dir / "fragments.db")) as connection:
        fragments = connection.execute(
            "SELECT cleavage_position, has_dummy_atom, dummy_atom_count FROM side_chain_fragments"
        ).fetchall()
        library_entries = connection.execute(
            """
            SELECT source_type, source_parent_ml_id, source_parent_chembl_id,
                   cleavage_position, has_dummy_atom, dummy_atom_count, splice_ready
            FROM fragment_library_entries
            """
        ).fetchall()

    assert fragments == [(10, 1, 1)]
    assert library_entries == [("validation_extract", "ML00000001", "CHEMBL0001", 10, 1, 1, 1)]

    summary = pd.read_csv(output_dir / "summary.csv")
    assert summary.loc[0, "num_sidechains"] == 1
    assert json.loads(summary.loc[0, "cleavage_positions"]) == [10]

    # Values read back from CSV are coerced explicitly before comparison so
    # the assertions do not depend on the dtype pandas infers for the column.
    fragment_library = pd.read_csv(output_dir / "fragment_library.csv")
    assert fragment_library.loc[0, "source_type"] == "validation_extract"
    assert int(fragment_library.loc[0, "cleavage_position"]) == 10
    assert bool(fragment_library.loc[0, "has_dummy_atom"]) is True
    assert int(fragment_library.loc[0, "dummy_atom_count"]) == 1
    assert bool(fragment_library.loc[0, "splice_ready"]) is True
|
||||
Reference in New Issue
Block a user