feat(validation): enforce single-anchor fragments

- skip fused/shared/multi-anchor side systems during extraction
- add fragment library schema and fragment_library.csv export
- make scaffold prep strict for non-spliceable positions
This commit is contained in:
2026-03-19 14:20:32 +08:00
parent 07ba27be2b
commit 46a438dd36
10 changed files with 383 additions and 21 deletions

View File

@@ -96,6 +96,7 @@ pixi run python scripts/validate_macrolactone_db.py \
validation_output/ validation_output/
├── README.md # 目录说明 ├── README.md # 目录说明
├── fragments.db # SQLite 数据库 ├── fragments.db # SQLite 数据库
├── fragment_library.csv # 最终片段库导出(含 has_dummy_atom / splice_ready)
├── summary.csv # 汇总表(含 ml_id, chembl_id ├── summary.csv # 汇总表(含 ml_id, chembl_id
├── summary_statistics.json # 统计信息 ├── summary_statistics.json # 统计信息
├── ring_size_12/ # 按环大小组织 ├── ring_size_12/ # 按环大小组织
@@ -124,11 +125,15 @@ sqlite3 validation_output/fragments.db \
FROM parent_molecules \ FROM parent_molecules \
WHERE classification='standard_macrolactone' LIMIT 5;" WHERE classification='standard_macrolactone' LIMIT 5;"
# 查询最终片段库
sqlite3 validation_output/fragments.db \
"SELECT source_type, source_parent_ml_id, cleavage_position, has_dummy_atom, splice_ready \
FROM fragment_library_entries LIMIT 10;"
# 查询片段 # 查询片段
sqlite3 validation_output/fragments.db \ sqlite3 validation_output/fragments.db \
"SELECT fragment_id, cleavage_position, dummy_isotope \ "SELECT fragment_id, cleavage_position, dummy_isotope, has_dummy_atom, dummy_atom_count \
FROM side_chain_fragments \ FROM side_chain_fragments LIMIT 10;"
WHERE ml_id='ML00000001';"
# 按环大小统计 # 按环大小统计
sqlite3 validation_output/fragments.db \ sqlite3 validation_output/fragments.db \
@@ -144,6 +149,8 @@ sqlite3 validation_output/fragments.db \
| `classification` | standard_macrolactone / non_standard_macrocycle / not_macrolactone | | `classification` | standard_macrolactone / non_standard_macrocycle / not_macrolactone |
| `dummy_isotope` | 裂解位置编号,用于片段重建 | | `dummy_isotope` | 裂解位置编号,用于片段重建 |
| `cleavage_position` | 环上的断裂位置 | | `cleavage_position` | 环上的断裂位置 |
| `has_dummy_atom` | 该片段是否带 dummy 原子,可用于区分可直接拼接片段 |
| `splice_ready` | 是否与当前单锚点拼接流程直接兼容 |
## Legacy Scripts ## Legacy Scripts

View File

@@ -272,6 +272,44 @@ def collect_side_chain_atoms(
return side_chain_atoms return side_chain_atoms
def find_side_chain_ring_connections(
    mol: Chem.Mol,
    side_chain_atoms: Iterable[int],
    ring_atom_indices: Iterable[int],
) -> list[tuple[int, int]]:
    """List every bond that joins a side-chain atom to a ring atom.

    Args:
        mol: Molecule that owns all of the atom indices.
        side_chain_atoms: Indices of the atoms making up the side chain.
        ring_atom_indices: Indices of the ring atoms.

    Returns:
        Unique ``(side_chain_atom_idx, ring_atom_idx)`` pairs, ordered by
        ring atom index first and side-chain atom index second.
    """
    ring_members = set(ring_atom_indices)
    # A set removes duplicate pairs before the deterministic sort below.
    anchor_pairs = {
        (chain_idx, adjacent.GetIdx())
        for chain_idx in side_chain_atoms
        for adjacent in mol.GetAtomWithIdx(chain_idx).GetNeighbors()
        if adjacent.GetIdx() in ring_members
    }
    return sorted(anchor_pairs, key=lambda pair: (pair[1], pair[0]))
def collect_fragmentable_side_chain_atoms(
    mol: Chem.Mol,
    start_atom_idx: int,
    ring_atom_indices: Iterable[int],
    ring_atom_idx: int | None = None,
) -> list[int] | None:
    """Collect a side chain only if it touches the ring through one bond.

    Args:
        mol: Molecule that owns all of the atom indices.
        start_atom_idx: Non-ring atom from which the side chain is collected.
        ring_atom_indices: Indices of the ring atoms. Any iterable is
            accepted, including single-pass iterators.
        ring_atom_idx: When given, the ring atom the side chain is expected
            to be anchored to.

    Returns:
        The atoms reported by ``collect_side_chain_atoms``, or ``None`` when
        that result is empty, when the number of side-chain/ring bonds is
        not exactly one, or when the single anchoring ring atom differs from
        ``ring_atom_idx``.
    """
    # The ring indices are read by both helpers below. A single-pass iterator
    # would be exhausted after the first call, leaving zero connections to
    # count, so materialize it once. Sets pass through untouched.
    if isinstance(ring_atom_indices, (set, frozenset)):
        ring_atom_set = ring_atom_indices
    else:
        ring_atom_set = set(ring_atom_indices)

    side_chain_atoms = collect_side_chain_atoms(mol, start_atom_idx, ring_atom_set)
    if not side_chain_atoms:
        return None

    ring_connections = find_side_chain_ring_connections(mol, side_chain_atoms, ring_atom_set)
    if len(ring_connections) != 1:
        # Zero or several bonds to the ring: not a single-anchor side chain.
        return None

    if ring_atom_idx is not None and ring_connections[0][1] != ring_atom_idx:
        # The only anchor sits on a different ring atom than the caller expects.
        return None

    return side_chain_atoms
def is_intrinsic_lactone_neighbor( def is_intrinsic_lactone_neighbor(
mol: Chem.Mol, mol: Chem.Mol,
candidate: DetectedMacrolactone, candidate: DetectedMacrolactone,

View File

@@ -6,7 +6,7 @@ from rdkit.Chem import Descriptors
from ._core import ( from ._core import (
build_fragment_smiles, build_fragment_smiles,
build_numbering_result, build_numbering_result,
collect_side_chain_atoms, collect_fragmentable_side_chain_atoms,
ensure_mol, ensure_mol,
find_macrolactone_candidates, find_macrolactone_candidates,
is_intrinsic_lactone_neighbor, is_intrinsic_lactone_neighbor,
@@ -44,6 +44,8 @@ class MacrolactoneFragmenter:
fragments: list[SideChainFragment] = [] fragments: list[SideChainFragment] = []
for position, ring_atom_idx in numbering.position_to_atom.items(): for position, ring_atom_idx in numbering.position_to_atom.items():
if int(position) <= 2:
continue
ring_atom = mol.GetAtomWithIdx(ring_atom_idx) ring_atom = mol.GetAtomWithIdx(ring_atom_idx)
for neighbor in ring_atom.GetNeighbors(): for neighbor in ring_atom.GetNeighbors():
neighbor_idx = neighbor.GetIdx() neighbor_idx = neighbor.GetIdx()
@@ -52,8 +54,13 @@ class MacrolactoneFragmenter:
if is_intrinsic_lactone_neighbor(mol, candidate, ring_atom_idx, neighbor_idx): if is_intrinsic_lactone_neighbor(mol, candidate, ring_atom_idx, neighbor_idx):
continue continue
side_chain_atoms = collect_side_chain_atoms(mol, neighbor_idx, ring_atom_set) side_chain_atoms = collect_fragmentable_side_chain_atoms(
if not side_chain_atoms: mol=mol,
start_atom_idx=neighbor_idx,
ring_atom_indices=ring_atom_set,
ring_atom_idx=ring_atom_idx,
)
if side_chain_atoms is None:
continue continue
try: try:

View File

@@ -4,7 +4,7 @@ from typing import Iterable
from rdkit import Chem from rdkit import Chem
from .._core import collect_side_chain_atoms, ensure_mol, find_macrolactone_candidates, is_intrinsic_lactone_neighbor from .._core import collect_fragmentable_side_chain_atoms, ensure_mol, find_macrolactone_candidates, is_intrinsic_lactone_neighbor
from ..fragmenter import MacrolactoneFragmenter from ..fragmenter import MacrolactoneFragmenter
@@ -26,22 +26,36 @@ def prepare_macrolactone_scaffold(
for position in positions: for position in positions:
if position not in numbering.position_to_atom: if position not in numbering.position_to_atom:
raise ValueError(f"Position {position} not found in ring numbering.") raise ValueError(f"Position {position} not found in ring numbering.")
if position <= 2:
raise ValueError(f"Position {position} does not contain a single-anchor fragmentable side chain")
ring_atom_idx = numbering.position_to_atom[position] ring_atom_idx = numbering.position_to_atom[position]
ring_atom = mol.GetAtomWithIdx(ring_atom_idx) ring_atom = mol.GetAtomWithIdx(ring_atom_idx)
position_dummy_specs: list[tuple[int, int, Chem.BondType]] = []
for neighbor in ring_atom.GetNeighbors(): for neighbor in ring_atom.GetNeighbors():
neighbor_idx = neighbor.GetIdx() neighbor_idx = neighbor.GetIdx()
if neighbor_idx in ring_atom_set: if neighbor_idx in ring_atom_set:
continue continue
if is_intrinsic_lactone_neighbor(mol, candidate, ring_atom_idx, neighbor_idx): if is_intrinsic_lactone_neighbor(mol, candidate, ring_atom_idx, neighbor_idx):
continue continue
side_chain_atoms = collect_side_chain_atoms(mol, neighbor_idx, ring_atom_set) side_chain_atoms = collect_fragmentable_side_chain_atoms(
mol=mol,
start_atom_idx=neighbor_idx,
ring_atom_indices=ring_atom_set,
ring_atom_idx=ring_atom_idx,
)
if side_chain_atoms is None:
continue
atoms_to_remove.update(side_chain_atoms) atoms_to_remove.update(side_chain_atoms)
bond = mol.GetBondBetweenAtoms(ring_atom_idx, neighbor_idx) bond = mol.GetBondBetweenAtoms(ring_atom_idx, neighbor_idx)
if bond is not None: if bond is not None:
dummy_specs.append((ring_atom_idx, position, bond.GetBondType())) position_dummy_specs.append((ring_atom_idx, position, bond.GetBondType()))
if not any(spec_position == position for _, spec_position, _ in dummy_specs): if not position_dummy_specs:
dummy_specs.append((ring_atom_idx, position, Chem.BondType.SINGLE)) raise ValueError(f"Position {position} does not contain a single-anchor fragmentable side chain")
if len(position_dummy_specs) > 1:
raise ValueError(f"Position {position} contains multiple fragmentable side chains")
dummy_specs.extend(position_dummy_specs)
rwmol = Chem.RWMol(mol) rwmol = Chem.RWMol(mol)
for ring_atom_idx, position, bond_type in dummy_specs: for ring_atom_idx, position, bond_type in dummy_specs:

View File

@@ -1,6 +1,6 @@
from __future__ import annotations from __future__ import annotations
from datetime import datetime from datetime import UTC, datetime
from typing import List, Optional from typing import List, Optional
from sqlalchemy.orm import Mapped, mapped_column, relationship from sqlalchemy.orm import Mapped, mapped_column, relationship
@@ -40,7 +40,7 @@ class ParentMolecule(SQLModel, table=True):
num_sidechains: Optional[int] = None num_sidechains: Optional[int] = None
cleavage_positions: Optional[str] = None cleavage_positions: Optional[str] = None
numbered_image_path: Optional[str] = None numbered_image_path: Optional[str] = None
created_at: datetime = Field(default_factory=datetime.utcnow) created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
processed_at: Optional[datetime] = None processed_at: Optional[datetime] = None
@@ -72,6 +72,8 @@ class SideChainFragment(SQLModel, table=True):
fragment_smiles_labeled: str fragment_smiles_labeled: str
fragment_smiles_plain: str fragment_smiles_plain: str
dummy_isotope: int dummy_isotope: int
has_dummy_atom: bool = Field(default=True)
dummy_atom_count: int = Field(default=1)
atom_count: int atom_count: int
heavy_atom_count: int heavy_atom_count: int
molecular_weight: float molecular_weight: float
@@ -79,6 +81,26 @@ class SideChainFragment(SQLModel, table=True):
image_path: Optional[str] = None image_path: Optional[str] = None
class FragmentLibraryEntry(SQLModel, table=True):
    """One row of the unified fragment library.

    Stored in the ``fragment_library_entries`` table. The validator adds one
    row per extracted side-chain fragment and later exports all rows to
    ``fragment_library.csv``.
    """

    __tablename__ = "fragment_library_entries"

    # Primary key; None until stored (presumably assigned by the database).
    id: Optional[int] = Field(default=None, primary_key=True)
    # Origin label of the row; the validator writes "validation_extract".
    source_type: str = Field(index=True)
    # fragment_id of the side-chain fragment this row was derived from, if any.
    source_fragment_id: Optional[str] = Field(default=None, index=True)
    # Identifiers of the parent molecule the fragment was cut from, if any.
    source_parent_ml_id: Optional[str] = Field(default=None, index=True)
    source_parent_chembl_id: Optional[str] = Field(default=None, index=True)
    # Ring position the side chain was attached to.
    cleavage_position: Optional[int] = Field(default=None, index=True)
    # NOTE(review): presumably the SMILES carrying the isotope-tagged dummy atom - confirm.
    fragment_smiles_labeled: Optional[str] = None
    # Required plain SMILES of the fragment.
    fragment_smiles_plain: str
    # Whether the fragment contains a dummy atom, and how many.
    has_dummy_atom: bool = Field(default=False)
    dummy_atom_count: int = Field(default=0)
    # Whether the fragment is directly usable by the single-anchor splicing flow.
    splice_ready: bool = Field(default=False, index=True)
    # NOTE(review): presumably the type of the cleaved ring/side-chain bond - confirm.
    original_bond_type: Optional[str] = None
    # Timezone-aware UTC timestamp taken when the instance is created.
    created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
class ValidationResult(SQLModel, table=True): class ValidationResult(SQLModel, table=True):
"""Manual validation records.""" """Manual validation records."""

View File

@@ -1,7 +1,7 @@
from __future__ import annotations from __future__ import annotations
import json import json
from datetime import datetime from datetime import UTC, datetime
from pathlib import Path from pathlib import Path
import pandas as pd import pandas as pd
@@ -12,7 +12,7 @@ from sqlmodel import select
from macro_lactone_toolkit import MacroLactoneAnalyzer from macro_lactone_toolkit import MacroLactoneAnalyzer
from macro_lactone_toolkit._core import ( from macro_lactone_toolkit._core import (
build_numbering_result, build_numbering_result,
collect_side_chain_atoms, collect_fragmentable_side_chain_atoms,
find_macrolactone_candidates, find_macrolactone_candidates,
is_intrinsic_lactone_neighbor, is_intrinsic_lactone_neighbor,
) )
@@ -20,6 +20,7 @@ from macro_lactone_toolkit.validation.database import get_engine, get_session, i
from macro_lactone_toolkit.validation.isotope_utils import build_fragment_with_isotope from macro_lactone_toolkit.validation.isotope_utils import build_fragment_with_isotope
from macro_lactone_toolkit.validation.models import ( from macro_lactone_toolkit.validation.models import (
ClassificationType, ClassificationType,
FragmentLibraryEntry,
ParentMolecule, ParentMolecule,
ProcessingStatus, ProcessingStatus,
RingNumbering, RingNumbering,
@@ -80,6 +81,7 @@ class MacrolactoneValidator:
# Generate outputs # Generate outputs
self._generate_readme() self._generate_readme()
self._generate_summary() self._generate_summary()
self._generate_fragment_library()
return results return results
@@ -138,7 +140,7 @@ class MacrolactoneValidator:
except Exception as e: except Exception as e:
parent.processing_status = ProcessingStatus.FAILED parent.processing_status = ProcessingStatus.FAILED
parent.error_message = str(e) parent.error_message = str(e)
parent.processed_at = datetime.utcnow() parent.processed_at = datetime.now(UTC)
session.add(parent) session.add(parent)
session.commit() session.commit()
return "failed" return "failed"
@@ -188,6 +190,8 @@ class MacrolactoneValidator:
fragment_idx = 0 fragment_idx = 0
for position, ring_atom_idx in numbering.position_to_atom.items(): for position, ring_atom_idx in numbering.position_to_atom.items():
if int(position) <= 2:
continue
ring_atom = mol.GetAtomWithIdx(ring_atom_idx) ring_atom = mol.GetAtomWithIdx(ring_atom_idx)
for neighbor in ring_atom.GetNeighbors(): for neighbor in ring_atom.GetNeighbors():
@@ -200,8 +204,13 @@ class MacrolactoneValidator:
continue continue
# Collect side chain atoms # Collect side chain atoms
side_chain_atoms = collect_side_chain_atoms(mol, neighbor_idx, ring_atom_set) side_chain_atoms = collect_fragmentable_side_chain_atoms(
if not side_chain_atoms: mol=mol,
start_atom_idx=neighbor_idx,
ring_atom_indices=ring_atom_set,
ring_atom_idx=ring_atom_idx,
)
if side_chain_atoms is None:
continue continue
# Build fragment with isotope tagging # Build fragment with isotope tagging
@@ -228,6 +237,8 @@ class MacrolactoneValidator:
fragment_smiles_labeled=labeled_smiles, fragment_smiles_labeled=labeled_smiles,
fragment_smiles_plain=plain_smiles, fragment_smiles_plain=plain_smiles,
dummy_isotope=int(position), dummy_isotope=int(position),
has_dummy_atom=True,
dummy_atom_count=1,
atom_count=atom_count, atom_count=atom_count,
heavy_atom_count=heavy_atom_count, heavy_atom_count=heavy_atom_count,
molecular_weight=round(mw, 4), molecular_weight=round(mw, 4),
@@ -235,6 +246,21 @@ class MacrolactoneValidator:
) )
session.add(fragment) session.add(fragment)
fragments.append(fragment) fragments.append(fragment)
session.add(
FragmentLibraryEntry(
source_type="validation_extract",
source_fragment_id=fragment.fragment_id,
source_parent_ml_id=parent.ml_id,
source_parent_chembl_id=parent.chembl_id,
cleavage_position=int(position),
fragment_smiles_labeled=labeled_smiles,
fragment_smiles_plain=plain_smiles,
has_dummy_atom=True,
dummy_atom_count=1,
splice_ready=True,
original_bond_type=bond_type,
)
)
fragment_idx += 1 fragment_idx += 1
# Save fragment images # Save fragment images
@@ -248,7 +274,7 @@ class MacrolactoneValidator:
parent.processing_status = ProcessingStatus.SUCCESS parent.processing_status = ProcessingStatus.SUCCESS
parent.num_sidechains = len(fragments) parent.num_sidechains = len(fragments)
parent.cleavage_positions = json.dumps([f.cleavage_position for f in fragments]) parent.cleavage_positions = json.dumps([f.cleavage_position for f in fragments])
parent.processed_at = datetime.utcnow() parent.processed_at = datetime.now(UTC)
session.add(parent) session.add(parent)
session.commit() session.commit()
@@ -276,6 +302,7 @@ This directory contains validation results for MacrolactoneDB 12-20 membered rin
validation_output/ validation_output/
├── README.md # This file ├── README.md # This file
├── fragments.db # SQLite database with all data ├── fragments.db # SQLite database with all data
├── fragment_library.csv # Unified fragment library export
├── summary.csv # Summary of all processed molecules ├── summary.csv # Summary of all processed molecules
├── summary_statistics.json # Statistical summary ├── summary_statistics.json # Statistical summary
@@ -305,6 +332,7 @@ validation_output/
- **parent_molecules**: Original molecule information - **parent_molecules**: Original molecule information
- **ring_numberings**: Ring atom numbering details - **ring_numberings**: Ring atom numbering details
- **side_chain_fragments**: Fragmentation results with isotope tags - **side_chain_fragments**: Fragmentation results with isotope tags
- **fragment_library_entries**: Unified fragment library rows for downstream design
- **validation_results**: Manual validation records - **validation_results**: Manual validation records
### Key Fields ### Key Fields
@@ -312,6 +340,8 @@ validation_output/
- `classification`: standard_macrolactone | non_standard_macrocycle | not_macrolactone - `classification`: standard_macrolactone | non_standard_macrocycle | not_macrolactone
- `dummy_isotope`: Cleavage position stored as isotope value for reconstruction - `dummy_isotope`: Cleavage position stored as isotope value for reconstruction
- `cleavage_position`: Position on ring where side chain was attached - `cleavage_position`: Position on ring where side chain was attached
- `has_dummy_atom`: Whether the fragment contains a dummy atom for splicing
- `dummy_atom_count`: Number of dummy atoms in the fragment
## Ring Numbering Convention ## Ring Numbering Convention
@@ -337,6 +367,13 @@ Fragments use isotope values to mark cleavage position:
- `cleavage_positions`: JSON array of cleavage positions - `cleavage_positions`: JSON array of cleavage positions
- `processing_status`: pending | success | failed | skipped - `processing_status`: pending | success | failed | skipped
### fragment_library.csv
- `source_type`: validation_extract | supplemental (reserved)
- `has_dummy_atom`: Whether the fragment contains a dummy atom
- `dummy_atom_count`: Number of dummy atoms
- `splice_ready`: Whether the fragment is directly compatible with single-anchor splicing
## Querying the Database ## Querying the Database
```bash ```bash
@@ -401,6 +438,47 @@ sqlite3 fragments.db "SELECT ring_size, COUNT(*) FROM parent_molecules GROUP BY
print(f"\nSummary saved to {self.output_dir / 'summary.csv'}") print(f"\nSummary saved to {self.output_dir / 'summary.csv'}")
print(f"Statistics: {stats}") print(f"Statistics: {stats}")
def _generate_fragment_library(self):
    """Export every fragment library row to ``fragment_library.csv``.

    Reads all ``FragmentLibraryEntry`` rows through a session on
    ``self.engine`` and writes them, without the DataFrame index, to
    ``self.output_dir``. Column order follows ``columns`` below.
    """
    columns = [
        "id",
        "source_type",
        "source_fragment_id",
        "source_parent_ml_id",
        "source_parent_chembl_id",
        "cleavage_position",
        "fragment_smiles_labeled",
        "fragment_smiles_plain",
        "has_dummy_atom",
        "dummy_atom_count",
        "splice_ready",
        "original_bond_type",
        "created_at",
    ]

    with get_session(self.engine) as session:
        # Every exported column is an attribute of the same name on the entry.
        rows = [
            {column: getattr(entry, column) for column in columns}
            for entry in session.exec(select(FragmentLibraryEntry)).all()
        ]

    # Passing the columns explicitly keeps the header even when no rows exist.
    frame = pd.DataFrame(rows, columns=columns)
    frame.to_csv(self.output_dir / "fragment_library.csv", index=False)
class MacrolactoneDetectionError(Exception): class MacrolactoneDetectionError(Exception):
"""Raised when macrolactone detection fails.""" """Raised when macrolactone detection fails."""

View File

@@ -78,6 +78,108 @@ def build_non_standard_ring_atom_macrolactone(
) )
def build_macrolactone_with_fused_side_ring(
    ring_size: int = 16,
    fused_positions: tuple[int, int] = (5, 6),
    side_chains: Mapping[int, str] | None = None,
) -> BuiltMacrolactone:
    """Build a macrolactone with a two-carbon chain joining two ring positions.

    Starts from ``build_macrolactone`` and adds two carbon atoms, single-bonded
    in sequence between the ring atoms at ``fused_positions``. The extra chain
    is therefore anchored to the ring at both ends.

    Returns:
        The sanitized molecule, its isomeric SMILES, and the base molecule's
        position-to-atom mapping.
    """
    base = build_macrolactone(ring_size=ring_size, side_chains=side_chains)
    start_position, end_position = fused_positions

    editable = Chem.RWMol(Chem.Mol(base.mol))
    new_carbons = [editable.AddAtom(Chem.Atom("C")) for _ in range(2)]

    # Walk the path ring atom -> C -> C -> ring atom, bonding each adjacent pair.
    path = [
        base.position_to_atom[start_position],
        *new_carbons,
        base.position_to_atom[end_position],
    ]
    for begin_idx, end_idx in zip(path, path[1:]):
        editable.AddBond(begin_idx, end_idx, Chem.BondType.SINGLE)

    fused = editable.GetMol()
    Chem.SanitizeMol(fused)
    return BuiltMacrolactone(
        mol=fused,
        smiles=Chem.MolToSmiles(fused, isomericSmiles=True),
        position_to_atom=base.position_to_atom,
    )
def build_macrolactone_with_bridge_side_chain(
    ring_size: int = 16,
    bridge_positions: tuple[int, int] = (5, 8),
    side_chains: Mapping[int, str] | None = None,
) -> BuiltMacrolactone:
    """Build a macrolactone with a two-carbon bridge across two ring positions.

    Starts from ``build_macrolactone`` and adds two carbon atoms, single-bonded
    in sequence between the ring atoms at ``bridge_positions``.

    Returns:
        The sanitized molecule, its isomeric SMILES, and the base molecule's
        position-to-atom mapping.
    """
    base = build_macrolactone(ring_size=ring_size, side_chains=side_chains)
    near_position, far_position = bridge_positions

    editable = Chem.RWMol(Chem.Mol(base.mol))
    first_carbon = editable.AddAtom(Chem.Atom("C"))
    second_carbon = editable.AddAtom(Chem.Atom("C"))

    # Bonds are added in order along the bridge: ring -> C -> C -> ring.
    bridge_bonds = (
        (base.position_to_atom[near_position], first_carbon),
        (first_carbon, second_carbon),
        (second_carbon, base.position_to_atom[far_position]),
    )
    for begin_idx, end_idx in bridge_bonds:
        editable.AddBond(begin_idx, end_idx, Chem.BondType.SINGLE)

    bridged = editable.GetMol()
    Chem.SanitizeMol(bridged)
    return BuiltMacrolactone(
        mol=bridged,
        smiles=Chem.MolToSmiles(bridged, isomericSmiles=True),
        position_to_atom=base.position_to_atom,
    )
def build_macrolactone_with_shared_atom_side_ring(
    ring_size: int = 16,
    position: int = 5,
    side_chains: Mapping[int, str] | None = None,
) -> BuiltMacrolactone:
    """Build a macrolactone with a small ring sharing one macrocycle atom.

    Starts from ``build_macrolactone`` and adds three carbon atoms chained
    together, with both chain ends single-bonded to the ring atom at
    ``position``. The added atoms are therefore attached to the ring by two
    bonds on the same ring atom.

    Returns:
        The sanitized molecule, its isomeric SMILES, and the base molecule's
        position-to-atom mapping.
    """
    base = build_macrolactone(ring_size=ring_size, side_chains=side_chains)

    editable = Chem.RWMol(Chem.Mol(base.mol))
    new_carbons = [editable.AddAtom(Chem.Atom("C")) for _ in range(3)]
    shared_atom_idx = base.position_to_atom[position]

    # Closed walk: ring atom -> C -> C -> C -> back to the same ring atom.
    cycle = [shared_atom_idx, *new_carbons, shared_atom_idx]
    for begin_idx, end_idx in zip(cycle, cycle[1:]):
        editable.AddBond(begin_idx, end_idx, Chem.BondType.SINGLE)

    spiro = editable.GetMol()
    Chem.SanitizeMol(spiro)
    return BuiltMacrolactone(
        mol=spiro,
        smiles=Chem.MolToSmiles(spiro, isomericSmiles=True),
        position_to_atom=base.position_to_atom,
    )
def build_macrolactone_with_single_anchor_side_ring(
    ring_size: int = 16,
    position: int = 5,
    side_chains: Mapping[int, str] | None = None,
) -> BuiltMacrolactone:
    """Build a macrolactone carrying a three-carbon ring on a single bond.

    Starts from ``build_macrolactone`` and adds three carbon atoms that form
    their own ring. Only the first of them is bonded to the ring atom at
    ``position``, so the substituent has exactly one anchor to the ring.

    Returns:
        The sanitized molecule, its isomeric SMILES, and the base molecule's
        position-to-atom mapping.
    """
    base = build_macrolactone(ring_size=ring_size, side_chains=side_chains)

    editable = Chem.RWMol(Chem.Mol(base.mol))
    anchor_carbon, middle_carbon, last_carbon = (
        editable.AddAtom(Chem.Atom("C")) for _ in range(3)
    )
    ring_atom_idx = base.position_to_atom[position]

    # One bond to the ring, then close the three-membered ring on itself.
    walk = [ring_atom_idx, anchor_carbon, middle_carbon, last_carbon, anchor_carbon]
    for begin_idx, end_idx in zip(walk, walk[1:]):
        editable.AddBond(begin_idx, end_idx, Chem.BondType.SINGLE)

    substituted = editable.GetMol()
    Chem.SanitizeMol(substituted)
    return BuiltMacrolactone(
        mol=substituted,
        smiles=Chem.MolToSmiles(substituted, isomericSmiles=True),
        position_to_atom=base.position_to_atom,
    )
def build_overlapping_candidate_macrolactone() -> BuiltMacrolactone: def build_overlapping_candidate_macrolactone() -> BuiltMacrolactone:
rwmol = Chem.RWMol() rwmol = Chem.RWMol()

View File

@@ -2,7 +2,12 @@ from rdkit import Chem
from macro_lactone_toolkit import MacrolactoneFragmenter from macro_lactone_toolkit import MacrolactoneFragmenter
from .helpers import build_macrolactone from .helpers import (
build_macrolactone,
build_macrolactone_with_fused_side_ring,
build_macrolactone_with_shared_atom_side_ring,
build_macrolactone_with_single_anchor_side_ring,
)
def test_fragmentation_returns_empty_list_without_sidechains(): def test_fragmentation_returns_empty_list_without_sidechains():
@@ -51,3 +56,24 @@ def test_fragmentation_preserves_attachment_bond_type():
neighbor = dummy_atom.GetNeighbors()[0] neighbor = dummy_atom.GetNeighbors()[0]
bond = mol.GetBondBetweenAtoms(dummy_atom.GetIdx(), neighbor.GetIdx()) bond = mol.GetBondBetweenAtoms(dummy_atom.GetIdx(), neighbor.GetIdx())
assert bond.GetBondType() == Chem.BondType.DOUBLE assert bond.GetBondType() == Chem.BondType.DOUBLE
def test_fragmentation_skips_fused_side_ring_but_keeps_single_anchor_sidechains():
    """Only the methyl at position 10 is cut; the two-anchor chain is skipped."""
    built = build_macrolactone_with_fused_side_ring(side_chains={10: "methyl"})

    fragmenter = MacrolactoneFragmenter()
    result = fragmenter.fragment_molecule(built.smiles, parent_id="fused")

    cleaved_positions = {fragment.cleavage_position for fragment in result.fragments}
    assert cleaved_positions == {10}
def test_fragmentation_skips_shared_atom_multi_anchor_component():
    """Only the ethyl at position 11 is cut; the doubly bonded ring is skipped."""
    built = build_macrolactone_with_shared_atom_side_ring(side_chains={11: "ethyl"})

    fragmenter = MacrolactoneFragmenter()
    result = fragmenter.fragment_molecule(built.smiles, parent_id="shared_atom")

    cleaved_positions = {fragment.cleavage_position for fragment in result.fragments}
    assert cleaved_positions == {11}
def test_fragmentation_allows_single_anchor_side_ring():
    """A ring substituent attached by a single bond is still extracted."""
    built = build_macrolactone_with_single_anchor_side_ring()

    fragmenter = MacrolactoneFragmenter()
    result = fragmenter.fragment_molecule(built.smiles, parent_id="single_anchor_ring")

    cleaved_positions = {fragment.cleavage_position for fragment in result.fragments}
    assert cleaved_positions == {5}

View File

@@ -5,7 +5,7 @@ from macro_lactone_toolkit import MacrolactoneFragmenter
from macro_lactone_toolkit.splicing.engine import splice_molecule from macro_lactone_toolkit.splicing.engine import splice_molecule
from macro_lactone_toolkit.splicing.scaffold_prep import prepare_macrolactone_scaffold from macro_lactone_toolkit.splicing.scaffold_prep import prepare_macrolactone_scaffold
from .helpers import build_macrolactone, canonicalize from .helpers import build_macrolactone, build_macrolactone_with_fused_side_ring, canonicalize
def test_splice_benzene_methyl(): def test_splice_benzene_methyl():
@@ -49,3 +49,14 @@ def test_prepare_scaffold_and_reassemble_fragment():
product = splice_molecule(scaffold, Chem.MolFromSmiles(fragment.fragment_smiles_labeled), position=5) product = splice_molecule(scaffold, Chem.MolFromSmiles(fragment.fragment_smiles_labeled), position=5)
assert canonicalize(product) == canonicalize(built.mol) assert canonicalize(product) == canonicalize(built.mol)
def test_prepare_scaffold_rejects_position_without_single_anchor_fragment():
    """Scaffold prep raises for a position whose only substituent has two anchors."""
    built = build_macrolactone_with_fused_side_ring(side_chains={10: "methyl"})
    expected_message = "Position 5 does not contain a single-anchor fragmentable side chain"

    with pytest.raises(ValueError, match=expected_message):
        prepare_macrolactone_scaffold(built.smiles, positions=[5], ring_size=16)

View File

@@ -0,0 +1,57 @@
from __future__ import annotations
import json
import sqlite3
import pandas as pd
from macro_lactone_toolkit.validation.validator import MacrolactoneValidator
from ..helpers import build_macrolactone_with_fused_side_ring
def test_validator_exports_only_single_anchor_fragments_and_fragment_library(tmp_path):
    """Run the validator end to end on one molecule and check all its outputs.

    The input has a two-anchor chain at positions 5/6 and a methyl at
    position 10. Only the methyl may appear in the database tables,
    ``summary.csv`` and ``fragment_library.csv``.
    """
    # Local import keeps this fix self-contained within the test.
    from contextlib import closing

    built = build_macrolactone_with_fused_side_ring(side_chains={10: "methyl"})
    input_path = tmp_path / "input.csv"
    output_dir = tmp_path / "validation_output"
    pd.DataFrame(
        [
            {
                "ml_id": "ML00000001",
                "IDs": "CHEMBL0001",
                "smiles": built.smiles,
            }
        ]
    ).to_csv(input_path, index=False)

    validator = MacrolactoneValidator(output_dir=output_dir, sample_ratio=1.0)
    results = validator.run(input_path)
    assert results == {"total": 1, "success": 1, "failed": 0, "skipped": 0}

    # ``with sqlite3.connect(...)`` only commits or rolls back; it never closes
    # the connection. ``closing`` releases the file handle deterministically.
    with closing(sqlite3.connect(output_dir / "fragments.db")) as connection:
        fragments = connection.execute(
            "SELECT cleavage_position, has_dummy_atom, dummy_atom_count FROM side_chain_fragments"
        ).fetchall()
        library_entries = connection.execute(
            """
            SELECT source_type, source_parent_ml_id, source_parent_chembl_id,
            cleavage_position, has_dummy_atom, dummy_atom_count, splice_ready
            FROM fragment_library_entries
            """
        ).fetchall()

    # SQLite returns booleans as integers, hence the 1s below.
    assert fragments == [(10, 1, 1)]
    assert library_entries == [("validation_extract", "ML00000001", "CHEMBL0001", 10, 1, 1, 1)]

    summary = pd.read_csv(output_dir / "summary.csv")
    assert summary.loc[0, "num_sidechains"] == 1
    assert json.loads(summary.loc[0, "cleavage_positions"]) == [10]

    fragment_library = pd.read_csv(output_dir / "fragment_library.csv")
    assert fragment_library.loc[0, "source_type"] == "validation_extract"
    assert int(fragment_library.loc[0, "cleavage_position"]) == 10
    assert bool(fragment_library.loc[0, "has_dummy_atom"]) is True
    assert int(fragment_library.loc[0, "dummy_atom_count"]) == 1
    assert bool(fragment_library.loc[0, "splice_ready"]) is True