feat(toolkit): ship macro_lactone_toolkit package

Unify macrolactone detection, numbering, fragmentation, and splicing under the installable macro_lactone_toolkit package. - replace legacy src.* modules with the new package layout - add analyze/number/fragment CLI entrypoints and pixi tasks - migrate tests, README, and scripts to the new package API
2026-03-18 22:06:45 +08:00
parent a768d26e47
commit 5e7b236f31
45 changed files with 1302 additions and 6304 deletions
--- a/src/macro_lactone_toolkit/init.py
+++ b/src/macro_lactone_toolkit/init.py
@@ -0,0 +1,25 @@
+from .analyzer import MacroLactoneAnalyzer
+from .errors import (
+    AmbiguousMacrolactoneError,
+    FragmentationError,
+    MacrolactoneDetectionError,
+    MacrolactoneError,
+    RingNumberingError,
+)
+from .fragmenter import MacrolactoneFragmenter
+from .models import FragmentationResult, RingNumberingResult, SideChainFragment
+
+__all__ = [
+    "AmbiguousMacrolactoneError",
+    "FragmentationError",
+    "FragmentationResult",
+    "MacroLactoneAnalyzer",
+    "MacrolactoneDetectionError",
+    "MacrolactoneError",
+    "MacrolactoneFragmenter",
+    "RingNumberingError",
+    "RingNumberingResult",
+    "SideChainFragment",
+]
+
+__version__ = "0.1.0"
--- a/src/macro_lactone_toolkit/_core.py
+++ b/src/macro_lactone_toolkit/_core.py
@@ -0,0 +1,243 @@
+from __future__ import annotations
+
+from collections import deque
+from dataclasses import dataclass
+from typing import Iterable
+
+from rdkit import Chem
+
+from .errors import MacrolactoneDetectionError, RingNumberingError
+from .models import RingNumberingResult
+
+
+VALID_RING_SIZES = tuple(range(12, 21))
+
+
+@dataclass(frozen=True)
+class DetectedMacrolactone:
+    ring_size: int
+    ring_atoms: tuple[int, ...]
+    carbonyl_carbon_idx: int
+    ester_oxygen_idx: int
+
+
+def ensure_mol(mol_input: str | Chem.Mol) -> tuple[Chem.Mol, str]:
+    if isinstance(mol_input, Chem.Mol):
+        mol = Chem.Mol(mol_input)
+    else:
+        mol = Chem.MolFromSmiles(mol_input)
+        if mol is None:
+            raise MacrolactoneDetectionError(f"Unable to parse SMILES: {mol_input}")
+    return mol, Chem.MolToSmiles(mol, isomericSmiles=True)
+
+
+def find_macrolactone_candidates(
+    mol: Chem.Mol,
+    ring_size: int | None = None,
+) -> list[DetectedMacrolactone]:
+    candidates: list[DetectedMacrolactone] = []
+    seen: set[tuple[tuple[int, ...], int, int]] = set()
+
+    for ring in mol.GetRingInfo().AtomRings():
+        if len(ring) not in VALID_RING_SIZES:
+            continue
+        if ring_size is not None and len(ring) != ring_size:
+            continue
+
+        ester_match = _find_ring_ester_atoms(mol, ring)
+        if ester_match is None:
+            continue
+
+        carbonyl_carbon_idx, ester_oxygen_idx = ester_match
+        key = (tuple(sorted(ring)), carbonyl_carbon_idx, ester_oxygen_idx)
+        if key in seen:
+            continue
+        seen.add(key)
+        candidates.append(
+            DetectedMacrolactone(
+                ring_size=len(ring),
+                ring_atoms=tuple(ring),
+                carbonyl_carbon_idx=carbonyl_carbon_idx,
+                ester_oxygen_idx=ester_oxygen_idx,
+            )
+        )
+
+    return sorted(
+        candidates,
+        key=lambda candidate: (
+            candidate.ring_size,
+            tuple(sorted(candidate.ring_atoms)),
+            candidate.carbonyl_carbon_idx,
+            candidate.ester_oxygen_idx,
+        ),
+    )
+
+
+def build_numbering_result(mol: Chem.Mol, candidate: DetectedMacrolactone) -> RingNumberingResult:
+    ring_atoms = list(candidate.ring_atoms)
+    ring_atom_set = set(ring_atoms)
+    adjacency = {
+        atom_idx: sorted(
+            neighbor.GetIdx()
+            for neighbor in mol.GetAtomWithIdx(atom_idx).GetNeighbors()
+            if neighbor.GetIdx() in ring_atom_set
+        )
+        for atom_idx in ring_atoms
+    }
+
+    ordered_atoms = [candidate.carbonyl_carbon_idx, candidate.ester_oxygen_idx]
+    visited = set(ordered_atoms)
+    previous_atom = candidate.carbonyl_carbon_idx
+    current_atom = candidate.ester_oxygen_idx
+
+    while len(ordered_atoms) < candidate.ring_size:
+        next_atoms = [
+            atom_idx
+            for atom_idx in adjacency[current_atom]
+            if atom_idx != previous_atom and atom_idx not in visited
+        ]
+        if len(next_atoms) != 1:
+            raise RingNumberingError("Unable to determine a unique ring traversal order.")
+        next_atom = next_atoms[0]
+        ordered_atoms.append(next_atom)
+        visited.add(next_atom)
+        previous_atom, current_atom = current_atom, next_atom
+
+    if candidate.carbonyl_carbon_idx not in adjacency[current_atom]:
+        raise RingNumberingError("Ring traversal did not close on the lactone carbonyl carbon.")
+
+    position_to_atom = {position: atom_idx for position, atom_idx in enumerate(ordered_atoms, start=1)}
+    atom_to_position = {atom_idx: position for position, atom_idx in position_to_atom.items()}
+
+    return RingNumberingResult(
+        ring_size=candidate.ring_size,
+        ring_atoms=sorted(ring_atoms),
+        ordered_atoms=ordered_atoms,
+        carbonyl_carbon_idx=candidate.carbonyl_carbon_idx,
+        ester_oxygen_idx=candidate.ester_oxygen_idx,
+        atom_to_position=atom_to_position,
+        position_to_atom=position_to_atom,
+    )
+
+
+def collect_side_chain_atoms(
+    mol: Chem.Mol,
+    start_atom_idx: int,
+    ring_atom_indices: Iterable[int],
+) -> list[int]:
+    ring_atom_set = set(ring_atom_indices)
+    queue: deque[int] = deque([start_atom_idx])
+    visited: set[int] = set()
+    side_chain_atoms: list[int] = []
+
+    while queue:
+        atom_idx = queue.popleft()
+        if atom_idx in visited or atom_idx in ring_atom_set:
+            continue
+        visited.add(atom_idx)
+        side_chain_atoms.append(atom_idx)
+
+        atom = mol.GetAtomWithIdx(atom_idx)
+        for neighbor in atom.GetNeighbors():
+            neighbor_idx = neighbor.GetIdx()
+            if neighbor_idx not in visited and neighbor_idx not in ring_atom_set:
+                queue.append(neighbor_idx)
+
+    return side_chain_atoms
+
+
+def is_intrinsic_lactone_neighbor(
+    mol: Chem.Mol,
+    candidate: DetectedMacrolactone,
+    ring_atom_idx: int,
+    neighbor_idx: int,
+) -> bool:
+    if ring_atom_idx != candidate.carbonyl_carbon_idx:
+        return False
+    bond = mol.GetBondBetweenAtoms(ring_atom_idx, neighbor_idx)
+    neighbor = mol.GetAtomWithIdx(neighbor_idx)
+    return bond is not None and bond.GetBondType() == Chem.BondType.DOUBLE and neighbor.GetAtomicNum() == 8
+
+
+def build_fragment_smiles(
+    mol: Chem.Mol,
+    side_chain_atoms: list[int],
+    side_chain_start_idx: int,
+    ring_atom_idx: int,
+    dummy_isotope: int,
+) -> str:
+    fragment_mol = Chem.RWMol()
+    old_to_new: dict[int, int] = {}
+
+    for old_idx in side_chain_atoms:
+        old_atom = mol.GetAtomWithIdx(old_idx)
+        new_atom = Chem.Atom(old_atom)
+        new_idx = fragment_mol.AddAtom(new_atom)
+        old_to_new[old_idx] = new_idx
+
+    for old_idx in side_chain_atoms:
+        atom = mol.GetAtomWithIdx(old_idx)
+        for neighbor in atom.GetNeighbors():
+            neighbor_idx = neighbor.GetIdx()
+            if neighbor_idx in old_to_new and old_idx < neighbor_idx:
+                bond = mol.GetBondBetweenAtoms(old_idx, neighbor_idx)
+                fragment_mol.AddBond(
+                    old_to_new[old_idx],
+                    old_to_new[neighbor_idx],
+                    bond.GetBondType(),
+                )
+
+    bond_to_ring = mol.GetBondBetweenAtoms(ring_atom_idx, side_chain_start_idx)
+    if bond_to_ring is None:
+        raise RingNumberingError("Missing ring-to-side-chain bond during fragmentation.")
+
+    dummy_atom = Chem.Atom(0)
+    dummy_atom.SetIsotope(dummy_isotope)
+    dummy_idx = fragment_mol.AddAtom(dummy_atom)
+    fragment_mol.AddBond(
+        old_to_new[side_chain_start_idx],
+        dummy_idx,
+        bond_to_ring.GetBondType(),
+    )
+
+    fragment = fragment_mol.GetMol()
+    Chem.SanitizeMol(fragment)
+    return Chem.MolToSmiles(fragment, isomericSmiles=True)
+
+
+def _find_ring_ester_atoms(mol: Chem.Mol, ring_atoms: Iterable[int]) -> tuple[int, int] | None:
+    ring_atom_set = set(ring_atoms)
+    matches: list[tuple[int, int]] = []
+
+    for atom_idx in ring_atom_set:
+        atom = mol.GetAtomWithIdx(atom_idx)
+        if atom.GetAtomicNum() != 6:
+            continue
+
+        has_carbonyl_oxygen = any(
+            bond.GetBondType() == Chem.BondType.DOUBLE and bond.GetOtherAtom(atom).GetAtomicNum() == 8
+            for bond in atom.GetBonds()
+        )
+        if not has_carbonyl_oxygen:
+            continue
+
+        for neighbor in atom.GetNeighbors():
+            neighbor_idx = neighbor.GetIdx()
+            bond = mol.GetBondBetweenAtoms(atom_idx, neighbor_idx)
+            if (
+                neighbor_idx in ring_atom_set
+                and neighbor.GetAtomicNum() == 8
+                and bond is not None
+                and bond.GetBondType() == Chem.BondType.SINGLE
+            ):
+                ring_neighbors = [
+                    ring_neighbor.GetIdx()
+                    for ring_neighbor in neighbor.GetNeighbors()
+                    if ring_neighbor.GetIdx() in ring_atom_set
+                ]
+                if len(ring_neighbors) == 2:
+                    matches.append((atom_idx, neighbor_idx))
+
+    if not matches:
+        return None
+    return sorted(matches)[0]
--- a/src/macro_lactone_toolkit/analyzer.py
+++ b/src/macro_lactone_toolkit/analyzer.py
@@ -0,0 +1,27 @@
+from __future__ import annotations
+
+from rdkit import Chem
+
+from ._core import ensure_mol, find_macrolactone_candidates
+
+
+class MacroLactoneAnalyzer:
+    """Identify valid 12-20 membered macrolactones."""
+
+    def get_valid_ring_sizes(self, mol_input: str | Chem.Mol) -> list[int]:
+        mol, _ = ensure_mol(mol_input)
+        candidates = find_macrolactone_candidates(mol)
+        return sorted({candidate.ring_size for candidate in candidates})
+
+    def analyze_molecule(self, mol_input: str | Chem.Mol) -> dict:
+        mol, smiles = ensure_mol(mol_input)
+        candidates = find_macrolactone_candidates(mol)
+        valid_ring_sizes = sorted({candidate.ring_size for candidate in candidates})
+        is_ambiguous = len(valid_ring_sizes) > 1 or len(candidates) > 1
+        return {
+            "smiles": smiles,
+            "valid_ring_sizes": valid_ring_sizes,
+            "candidate_count": len(candidates),
+            "is_ambiguous": is_ambiguous,
+            "selected_ring_size": valid_ring_sizes[0] if len(valid_ring_sizes) == 1 and len(candidates) == 1 else None,
+        }
--- a/src/macro_lactone_toolkit/cli.py
+++ b/src/macro_lactone_toolkit/cli.py
@@ -0,0 +1,160 @@
+from __future__ import annotations
+
+import argparse
+import json
+from dataclasses import asdict
+from pathlib import Path
+
+import pandas as pd
+
+from .analyzer import MacroLactoneAnalyzer
+from .errors import MacrolactoneError
+from .fragmenter import MacrolactoneFragmenter
+
+
+def main() -> None:
+    parser = build_parser()
+    args = parser.parse_args()
+    if not hasattr(args, "func"):
+        parser.print_help()
+        raise SystemExit(1)
+    args.func(args)
+
+
+def build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(prog="macro-lactone-toolkit")
+    subparsers = parser.add_subparsers(dest="command")
+
+    analyze = subparsers.add_parser("analyze")
+    _add_common_input_arguments(analyze)
+    analyze.add_argument("--ring-size", type=int, default=None)
+    analyze.set_defaults(func=run_analyze)
+
+    number = subparsers.add_parser("number")
+    number.add_argument("--smiles", required=True)
+    number.add_argument("--ring-size", type=int, default=None)
+    number.set_defaults(func=run_number)
+
+    fragment = subparsers.add_parser("fragment")
+    _add_common_input_arguments(fragment)
+    fragment.add_argument("--ring-size", type=int, default=None)
+    fragment.add_argument("--parent-id", default=None)
+    fragment.add_argument("--errors-output", default=None)
+    fragment.set_defaults(func=run_fragment)
+
+    return parser
+
+
+def run_analyze(args: argparse.Namespace) -> None:
+    analyzer = MacroLactoneAnalyzer()
+    if args.smiles:
+        payload = analyzer.analyze_molecule(args.smiles)
+        _write_output(payload, args.output)
+        return
+
+    rows = _read_csv_rows(args.input, args.smiles_column, args.id_column)
+    payload = []
+    for row in rows:
+        analysis = analyzer.analyze_molecule(row["smiles"])
+        analysis["parent_id"] = row["parent_id"]
+        payload.append(analysis)
+    _write_output(payload, args.output)
+
+
+def run_number(args: argparse.Namespace) -> None:
+    fragmenter = MacrolactoneFragmenter(ring_size=args.ring_size)
+    payload = fragmenter.number_molecule(args.smiles).to_dict()
+    _write_json(payload, None)
+
+
+def run_fragment(args: argparse.Namespace) -> None:
+    fragmenter = MacrolactoneFragmenter(ring_size=args.ring_size)
+
+    if args.smiles:
+        result = fragmenter.fragment_molecule(args.smiles, parent_id=args.parent_id)
+        _write_output(result.to_dict(), args.output)
+        return
+
+    rows = _read_csv_rows(args.input, args.smiles_column, args.id_column)
+    fragment_rows: list[dict] = []
+    error_rows: list[dict] = []
+
+    for row in rows:
+        try:
+            result = fragmenter.fragment_molecule(row["smiles"], parent_id=row["parent_id"])
+        except MacrolactoneError as exc:
+            error_rows.append(
+                {
+                    "parent_id": row["parent_id"],
+                    "smiles": row["smiles"],
+                    "error_type": type(exc).__name__,
+                    "error_message": str(exc),
+                }
+            )
+            continue
+
+        for fragment in result.fragments:
+            fragment_rows.append(
+                {
+                    "parent_id": result.parent_id,
+                    "ring_size": result.ring_size,
+                    **fragment.to_dict(),
+                }
+            )
+
+    if args.output:
+        _write_output(fragment_rows, args.output)
+    else:
+        _write_json({"fragments": fragment_rows, "errors": error_rows}, None)
+
+    if args.errors_output:
+        _write_output(error_rows, args.errors_output)
+
+
+def _add_common_input_arguments(parser: argparse.ArgumentParser) -> None:
+    group = parser.add_mutually_exclusive_group(required=True)
+    group.add_argument("--smiles")
+    group.add_argument("--input")
+    parser.add_argument("--smiles-column", default="smiles")
+    parser.add_argument("--id-column", default="id")
+    parser.add_argument("--output", default=None)
+
+
+def _read_csv_rows(input_path: str, smiles_column: str, id_column: str) -> list[dict]:
+    dataframe = pd.read_csv(input_path)
+    rows = []
+    for index, row in dataframe.iterrows():
+        parent_id = row[id_column] if id_column in dataframe.columns else f"row_{index}"
+        rows.append(
+            {
+                "parent_id": str(parent_id),
+                "smiles": row[smiles_column],
+            }
+        )
+    return rows
+
+
+def _write_output(payload: list[dict] | dict, output_path: str | None) -> None:
+    if output_path is None:
+        _write_json(payload, None)
+        return
+
+    path = Path(output_path)
+    if path.suffix.lower() == ".csv":
+        dataframe = pd.DataFrame(payload)
+        dataframe.to_csv(path, index=False)
+        return
+
+    _write_json(payload, path)
+
+
+def _write_json(payload: list[dict] | dict, output_path: Path | None) -> None:
+    text = json.dumps(payload, indent=2, ensure_ascii=False)
+    if output_path is None:
+        print(text)
+    else:
+        output_path.write_text(text + "\n", encoding="utf-8")
+
+
+if __name__ == "__main__":
+    main()
--- a/src/macro_lactone_toolkit/errors.py
+++ b/src/macro_lactone_toolkit/errors.py
@@ -0,0 +1,18 @@
+class MacrolactoneError(Exception):
+    """Base class for macrolactone toolkit errors."""
+
+
+class MacrolactoneDetectionError(MacrolactoneError):
+    """Raised when a valid macrolactone cannot be detected."""
+
+
+class AmbiguousMacrolactoneError(MacrolactoneDetectionError):
+    """Raised when multiple valid macrolactone candidates are detected."""
+
+
+class RingNumberingError(MacrolactoneError):
+    """Raised when ring numbering cannot be assigned."""
+
+
+class FragmentationError(MacrolactoneError):
+    """Raised when side-chain fragmentation fails."""
--- a/src/macro_lactone_toolkit/fragmenter.py
+++ b/src/macro_lactone_toolkit/fragmenter.py
@@ -0,0 +1,114 @@
+from __future__ import annotations
+
+from rdkit import Chem
+from rdkit.Chem import Descriptors
+
+from ._core import (
+    build_fragment_smiles,
+    build_numbering_result,
+    collect_side_chain_atoms,
+    ensure_mol,
+    find_macrolactone_candidates,
+    is_intrinsic_lactone_neighbor,
+)
+from .analyzer import MacroLactoneAnalyzer
+from .errors import AmbiguousMacrolactoneError, FragmentationError, MacrolactoneDetectionError
+from .models import FragmentationResult, RingNumberingResult, SideChainFragment
+
+
+class MacrolactoneFragmenter:
+    """Number and fragment 12-20 membered macrolactones."""
+
+    def __init__(self, ring_size: int | None = None):
+        if ring_size is not None and not 12 <= ring_size <= 20:
+            raise ValueError("ring_size must be between 12 and 20")
+        self.ring_size = ring_size
+        self.analyzer = MacroLactoneAnalyzer()
+
+    def number_molecule(self, mol_input: str | Chem.Mol) -> RingNumberingResult:
+        mol, _ = ensure_mol(mol_input)
+        candidate = self._select_candidate(mol)
+        return build_numbering_result(mol, candidate)
+
+    def fragment_molecule(
+        self,
+        mol_input: str | Chem.Mol,
+        parent_id: str | None = None,
+    ) -> FragmentationResult:
+        mol, parent_smiles = ensure_mol(mol_input)
+        candidate = self._select_candidate(mol)
+        numbering = build_numbering_result(mol, candidate)
+        parent_id = parent_id or "molecule_0"
+
+        ring_atom_set = set(candidate.ring_atoms)
+        fragments: list[SideChainFragment] = []
+
+        for position, ring_atom_idx in numbering.position_to_atom.items():
+            ring_atom = mol.GetAtomWithIdx(ring_atom_idx)
+            for neighbor in ring_atom.GetNeighbors():
+                neighbor_idx = neighbor.GetIdx()
+                if neighbor_idx in ring_atom_set:
+                    continue
+                if is_intrinsic_lactone_neighbor(mol, candidate, ring_atom_idx, neighbor_idx):
+                    continue
+
+                side_chain_atoms = collect_side_chain_atoms(mol, neighbor_idx, ring_atom_set)
+                if not side_chain_atoms:
+                    continue
+
+                try:
+                    fragment_smiles_labeled = build_fragment_smiles(
+                        mol=mol,
+                        side_chain_atoms=side_chain_atoms,
+                        side_chain_start_idx=neighbor_idx,
+                        ring_atom_idx=ring_atom_idx,
+                        dummy_isotope=position,
+                    )
+                    fragment_smiles_plain = build_fragment_smiles(
+                        mol=mol,
+                        side_chain_atoms=side_chain_atoms,
+                        side_chain_start_idx=neighbor_idx,
+                        ring_atom_idx=ring_atom_idx,
+                        dummy_isotope=0,
+                    )
+                except Exception as exc:  # pragma: no cover - sanitized by tests
+                    raise FragmentationError(str(exc)) from exc
+
+                plain_fragment = Chem.MolFromSmiles(fragment_smiles_plain)
+                if plain_fragment is None:
+                    raise FragmentationError("Generated fragment_smiles_plain could not be read by RDKit.")
+
+                fragment_id = f"{parent_id}_frag_{len(fragments)}"
+                fragments.append(
+                    SideChainFragment(
+                        parent_id=parent_id,
+                        fragment_id=fragment_id,
+                        cleavage_position=position,
+                        attachment_atom_idx=ring_atom_idx,
+                        fragment_smiles_labeled=fragment_smiles_labeled,
+                        fragment_smiles_plain=fragment_smiles_plain,
+                        atom_count=sum(1 for atom in plain_fragment.GetAtoms() if atom.GetAtomicNum() != 0),
+                        molecular_weight=round(Descriptors.MolWt(plain_fragment), 4),
+                    )
+                )
+
+        return FragmentationResult(
+            parent_id=parent_id,
+            parent_smiles=parent_smiles,
+            ring_size=numbering.ring_size,
+            numbering=numbering,
+            fragments=fragments,
+        )
+
+    def _select_candidate(self, mol: Chem.Mol):
+        candidates = find_macrolactone_candidates(mol, ring_size=self.ring_size)
+        if not candidates:
+            requested = f"{self.ring_size}-membered " if self.ring_size is not None else ""
+            raise MacrolactoneDetectionError(f"No valid {requested}macrolactone was detected.")
+
+        valid_ring_sizes = sorted({candidate.ring_size for candidate in candidates})
+        if len(candidates) > 1 or len(valid_ring_sizes) > 1:
+            raise AmbiguousMacrolactoneError(
+                "Multiple valid macrolactone candidates were detected. Pass ring_size explicitly."
+            )
+        return candidates[0]
--- a/src/macro_lactone_toolkit/models.py
+++ b/src/macro_lactone_toolkit/models.py
@@ -0,0 +1,52 @@
+from __future__ import annotations
+
+from dataclasses import asdict, dataclass, field
+
+
+@dataclass(frozen=True)
+class RingNumberingResult:
+    ring_size: int
+    ring_atoms: list[int]
+    ordered_atoms: list[int]
+    carbonyl_carbon_idx: int
+    ester_oxygen_idx: int
+    atom_to_position: dict[int, int]
+    position_to_atom: dict[int, int]
+
+    def to_dict(self) -> dict:
+        return asdict(self)
+
+
+@dataclass(frozen=True)
+class SideChainFragment:
+    parent_id: str
+    fragment_id: str
+    cleavage_position: int
+    attachment_atom_idx: int
+    fragment_smiles_labeled: str
+    fragment_smiles_plain: str
+    atom_count: int
+    molecular_weight: float
+
+    def to_dict(self) -> dict:
+        return asdict(self)
+
+
+@dataclass(frozen=True)
+class FragmentationResult:
+    parent_id: str
+    parent_smiles: str
+    ring_size: int
+    numbering: RingNumberingResult
+    fragments: list[SideChainFragment] = field(default_factory=list)
+    warnings: list[str] = field(default_factory=list)
+
+    def to_dict(self) -> dict:
+        return {
+            "parent_id": self.parent_id,
+            "parent_smiles": self.parent_smiles,
+            "ring_size": self.ring_size,
+            "numbering": self.numbering.to_dict(),
+            "fragments": [fragment.to_dict() for fragment in self.fragments],
+            "warnings": list(self.warnings),
+        }
--- a/src/macro_lactone_toolkit/splicing/init.py
+++ b/src/macro_lactone_toolkit/splicing/init.py
@@ -0,0 +1,9 @@
+from .engine import splice_molecule
+from .fragment_prep import activate_fragment
+from .scaffold_prep import prepare_macrolactone_scaffold
+
+__all__ = [
+    "activate_fragment",
+    "prepare_macrolactone_scaffold",
+    "splice_molecule",
+]
--- a/src/macro_lactone_toolkit/splicing/engine.py
+++ b/src/macro_lactone_toolkit/splicing/engine.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from rdkit import Chem
+
+
+def splice_molecule(scaffold: Chem.Mol, fragment: Chem.Mol, position: int) -> Chem.Mol:
+    combined = Chem.CombineMols(scaffold, fragment)
+    rwmol = Chem.RWMol(combined)
+
+    scaffold_atom_count = scaffold.GetNumAtoms()
+    total_atoms = rwmol.GetNumAtoms()
+
+    scaffold_dummy_idx = -1
+    for atom_idx in range(scaffold_atom_count):
+        atom = rwmol.GetAtomWithIdx(atom_idx)
+        if atom.GetAtomicNum() == 0 and atom.GetIsotope() == position:
+            scaffold_dummy_idx = atom_idx
+            break
+    if scaffold_dummy_idx == -1:
+        raise ValueError(f"Scaffold dummy atom with isotope {position} not found")
+
+    fragment_dummy_idx = -1
+    for atom_idx in range(scaffold_atom_count, total_atoms):
+        atom = rwmol.GetAtomWithIdx(atom_idx)
+        if atom.GetAtomicNum() == 0:
+            fragment_dummy_idx = atom_idx
+            break
+    if fragment_dummy_idx == -1:
+        raise ValueError("Fragment does not contain a dummy atom")
+
+    scaffold_dummy = rwmol.GetAtomWithIdx(scaffold_dummy_idx)
+    fragment_dummy = rwmol.GetAtomWithIdx(fragment_dummy_idx)
+    if scaffold_dummy.GetDegree() != 1:
+        raise ValueError("Scaffold dummy atom must have exactly one neighbor")
+    if fragment_dummy.GetDegree() != 1:
+        raise ValueError("Fragment dummy atom must have exactly one neighbor")
+
+    scaffold_anchor_idx = scaffold_dummy.GetNeighbors()[0].GetIdx()
+    fragment_anchor_idx = fragment_dummy.GetNeighbors()[0].GetIdx()
+    fragment_bond = rwmol.GetBondBetweenAtoms(fragment_dummy_idx, fragment_anchor_idx)
+    if fragment_bond is None:
+        raise ValueError("Fragment dummy atom is not bonded to an anchor atom")
+
+    rwmol.AddBond(scaffold_anchor_idx, fragment_anchor_idx, fragment_bond.GetBondType())
+
+    for atom_idx in sorted((scaffold_dummy_idx, fragment_dummy_idx), reverse=True):
+        rwmol.RemoveAtom(atom_idx)
+
+    product = rwmol.GetMol()
+    Chem.SanitizeMol(product)
+    return product
--- a/src/macro_lactone_toolkit/splicing/fragment_prep.py
+++ b/src/macro_lactone_toolkit/splicing/fragment_prep.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+from rdkit import Chem
+
+
+def activate_fragment(smiles: str, strategy: str = "smart") -> Chem.Mol:
+    mol = Chem.MolFromSmiles(smiles)
+    if mol is None:
+        raise ValueError(f"Invalid SMILES string: {smiles}")
+
+    target_idx = -1
+
+    if strategy == "smart":
+        for smarts in ("[N;!H0]", "[O;H1]", "[S;H1]"):
+            pattern = Chem.MolFromSmarts(smarts)
+            if pattern is None:
+                continue
+            matches = mol.GetSubstructMatches(pattern)
+            if matches:
+                target_idx = matches[0][0]
+                break
+        if target_idx == -1:
+            strategy = "random"
+
+    if strategy == "random":
+        carbon_candidates: list[int] = []
+        other_candidates: list[int] = []
+        for atom in mol.GetAtoms():
+            if atom.GetTotalNumHs() == 0:
+                continue
+            if atom.GetAtomicNum() == 6:
+                carbon_candidates.append(atom.GetIdx())
+            else:
+                other_candidates.append(atom.GetIdx())
+        candidates = carbon_candidates or other_candidates
+        if not candidates:
+            raise ValueError("No suitable atoms with hydrogens found to attach to.")
+        target_idx = candidates[0]
+
+    if target_idx == -1:
+        raise ValueError("Could not identify a target atom for activation.")
+
+    rwmol = Chem.RWMol(mol)
+    dummy_idx = rwmol.AddAtom(Chem.Atom(0))
+    rwmol.AddBond(target_idx, dummy_idx, Chem.BondType.SINGLE)
+
+    result = rwmol.GetMol()
+    Chem.SanitizeMol(result)
+    return result
--- a/src/macro_lactone_toolkit/splicing/scaffold_prep.py
+++ b/src/macro_lactone_toolkit/splicing/scaffold_prep.py
@@ -0,0 +1,63 @@
+from __future__ import annotations
+
+from typing import Iterable
+
+from rdkit import Chem
+
+from .._core import collect_side_chain_atoms, ensure_mol, find_macrolactone_candidates, is_intrinsic_lactone_neighbor
+from ..fragmenter import MacrolactoneFragmenter
+
+
+def prepare_macrolactone_scaffold(
+    mol_input: str | Chem.Mol,
+    positions: Iterable[int],
+    ring_size: int | None = None,
+) -> tuple[Chem.Mol, dict[int, int]]:
+    positions = list(positions)
+    mol, _ = ensure_mol(mol_input)
+    fragmenter = MacrolactoneFragmenter(ring_size=ring_size)
+    numbering = fragmenter.number_molecule(mol)
+    candidate = find_macrolactone_candidates(mol, ring_size=numbering.ring_size)[0]
+    ring_atom_set = set(numbering.ordered_atoms)
+
+    atoms_to_remove: set[int] = set()
+    dummy_specs: list[tuple[int, int, Chem.BondType]] = []
+
+    for position in positions:
+        if position not in numbering.position_to_atom:
+            raise ValueError(f"Position {position} not found in ring numbering.")
+        ring_atom_idx = numbering.position_to_atom[position]
+        ring_atom = mol.GetAtomWithIdx(ring_atom_idx)
+        for neighbor in ring_atom.GetNeighbors():
+            neighbor_idx = neighbor.GetIdx()
+            if neighbor_idx in ring_atom_set:
+                continue
+            if is_intrinsic_lactone_neighbor(mol, candidate, ring_atom_idx, neighbor_idx):
+                continue
+            side_chain_atoms = collect_side_chain_atoms(mol, neighbor_idx, ring_atom_set)
+            atoms_to_remove.update(side_chain_atoms)
+            bond = mol.GetBondBetweenAtoms(ring_atom_idx, neighbor_idx)
+            if bond is not None:
+                dummy_specs.append((ring_atom_idx, position, bond.GetBondType()))
+
+        if not any(spec_position == position for _, spec_position, _ in dummy_specs):
+            dummy_specs.append((ring_atom_idx, position, Chem.BondType.SINGLE))
+
+    rwmol = Chem.RWMol(mol)
+    for ring_atom_idx, position, bond_type in dummy_specs:
+        dummy_atom = Chem.Atom(0)
+        dummy_atom.SetIsotope(position)
+        dummy_idx = rwmol.AddAtom(dummy_atom)
+        rwmol.AddBond(ring_atom_idx, dummy_idx, bond_type)
+
+    for atom_idx in sorted(atoms_to_remove, reverse=True):
+        rwmol.RemoveAtom(atom_idx)
+
+    scaffold = rwmol.GetMol()
+    Chem.SanitizeMol(scaffold)
+    dummy_map = {
+        atom.GetIsotope(): atom.GetIdx()
+        for atom in scaffold.GetAtoms()
+        if atom.GetAtomicNum() == 0 and atom.GetIsotope() in positions
+    }
+    return scaffold, dummy_map