feat(toolkit): ship macro_lactone_toolkit package
Unify macrolactone detection, numbering, fragmentation, and splicing under the installable macro_lactone_toolkit package. - replace legacy src.* modules with the new package layout - add analyze/number/fragment CLI entrypoints and pixi tasks - migrate tests, README, and scripts to the new package API
This commit is contained in:
25
src/macro_lactone_toolkit/__init__.py
Normal file
25
src/macro_lactone_toolkit/__init__.py
Normal file
@@ -0,0 +1,25 @@
|
||||
from .analyzer import MacroLactoneAnalyzer
|
||||
from .errors import (
|
||||
AmbiguousMacrolactoneError,
|
||||
FragmentationError,
|
||||
MacrolactoneDetectionError,
|
||||
MacrolactoneError,
|
||||
RingNumberingError,
|
||||
)
|
||||
from .fragmenter import MacrolactoneFragmenter
|
||||
from .models import FragmentationResult, RingNumberingResult, SideChainFragment
|
||||
|
||||
__all__ = [
|
||||
"AmbiguousMacrolactoneError",
|
||||
"FragmentationError",
|
||||
"FragmentationResult",
|
||||
"MacroLactoneAnalyzer",
|
||||
"MacrolactoneDetectionError",
|
||||
"MacrolactoneError",
|
||||
"MacrolactoneFragmenter",
|
||||
"RingNumberingError",
|
||||
"RingNumberingResult",
|
||||
"SideChainFragment",
|
||||
]
|
||||
|
||||
__version__ = "0.1.0"
|
||||
243
src/macro_lactone_toolkit/_core.py
Normal file
243
src/macro_lactone_toolkit/_core.py
Normal file
@@ -0,0 +1,243 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import deque
|
||||
from dataclasses import dataclass
|
||||
from typing import Iterable
|
||||
|
||||
from rdkit import Chem
|
||||
|
||||
from .errors import MacrolactoneDetectionError, RingNumberingError
|
||||
from .models import RingNumberingResult
|
||||
|
||||
|
||||
VALID_RING_SIZES = tuple(range(12, 21))
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class DetectedMacrolactone:
|
||||
ring_size: int
|
||||
ring_atoms: tuple[int, ...]
|
||||
carbonyl_carbon_idx: int
|
||||
ester_oxygen_idx: int
|
||||
|
||||
|
||||
def ensure_mol(mol_input: str | Chem.Mol) -> tuple[Chem.Mol, str]:
|
||||
if isinstance(mol_input, Chem.Mol):
|
||||
mol = Chem.Mol(mol_input)
|
||||
else:
|
||||
mol = Chem.MolFromSmiles(mol_input)
|
||||
if mol is None:
|
||||
raise MacrolactoneDetectionError(f"Unable to parse SMILES: {mol_input}")
|
||||
return mol, Chem.MolToSmiles(mol, isomericSmiles=True)
|
||||
|
||||
|
||||
def find_macrolactone_candidates(
|
||||
mol: Chem.Mol,
|
||||
ring_size: int | None = None,
|
||||
) -> list[DetectedMacrolactone]:
|
||||
candidates: list[DetectedMacrolactone] = []
|
||||
seen: set[tuple[tuple[int, ...], int, int]] = set()
|
||||
|
||||
for ring in mol.GetRingInfo().AtomRings():
|
||||
if len(ring) not in VALID_RING_SIZES:
|
||||
continue
|
||||
if ring_size is not None and len(ring) != ring_size:
|
||||
continue
|
||||
|
||||
ester_match = _find_ring_ester_atoms(mol, ring)
|
||||
if ester_match is None:
|
||||
continue
|
||||
|
||||
carbonyl_carbon_idx, ester_oxygen_idx = ester_match
|
||||
key = (tuple(sorted(ring)), carbonyl_carbon_idx, ester_oxygen_idx)
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
candidates.append(
|
||||
DetectedMacrolactone(
|
||||
ring_size=len(ring),
|
||||
ring_atoms=tuple(ring),
|
||||
carbonyl_carbon_idx=carbonyl_carbon_idx,
|
||||
ester_oxygen_idx=ester_oxygen_idx,
|
||||
)
|
||||
)
|
||||
|
||||
return sorted(
|
||||
candidates,
|
||||
key=lambda candidate: (
|
||||
candidate.ring_size,
|
||||
tuple(sorted(candidate.ring_atoms)),
|
||||
candidate.carbonyl_carbon_idx,
|
||||
candidate.ester_oxygen_idx,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def build_numbering_result(mol: Chem.Mol, candidate: DetectedMacrolactone) -> RingNumberingResult:
|
||||
ring_atoms = list(candidate.ring_atoms)
|
||||
ring_atom_set = set(ring_atoms)
|
||||
adjacency = {
|
||||
atom_idx: sorted(
|
||||
neighbor.GetIdx()
|
||||
for neighbor in mol.GetAtomWithIdx(atom_idx).GetNeighbors()
|
||||
if neighbor.GetIdx() in ring_atom_set
|
||||
)
|
||||
for atom_idx in ring_atoms
|
||||
}
|
||||
|
||||
ordered_atoms = [candidate.carbonyl_carbon_idx, candidate.ester_oxygen_idx]
|
||||
visited = set(ordered_atoms)
|
||||
previous_atom = candidate.carbonyl_carbon_idx
|
||||
current_atom = candidate.ester_oxygen_idx
|
||||
|
||||
while len(ordered_atoms) < candidate.ring_size:
|
||||
next_atoms = [
|
||||
atom_idx
|
||||
for atom_idx in adjacency[current_atom]
|
||||
if atom_idx != previous_atom and atom_idx not in visited
|
||||
]
|
||||
if len(next_atoms) != 1:
|
||||
raise RingNumberingError("Unable to determine a unique ring traversal order.")
|
||||
next_atom = next_atoms[0]
|
||||
ordered_atoms.append(next_atom)
|
||||
visited.add(next_atom)
|
||||
previous_atom, current_atom = current_atom, next_atom
|
||||
|
||||
if candidate.carbonyl_carbon_idx not in adjacency[current_atom]:
|
||||
raise RingNumberingError("Ring traversal did not close on the lactone carbonyl carbon.")
|
||||
|
||||
position_to_atom = {position: atom_idx for position, atom_idx in enumerate(ordered_atoms, start=1)}
|
||||
atom_to_position = {atom_idx: position for position, atom_idx in position_to_atom.items()}
|
||||
|
||||
return RingNumberingResult(
|
||||
ring_size=candidate.ring_size,
|
||||
ring_atoms=sorted(ring_atoms),
|
||||
ordered_atoms=ordered_atoms,
|
||||
carbonyl_carbon_idx=candidate.carbonyl_carbon_idx,
|
||||
ester_oxygen_idx=candidate.ester_oxygen_idx,
|
||||
atom_to_position=atom_to_position,
|
||||
position_to_atom=position_to_atom,
|
||||
)
|
||||
|
||||
|
||||
def collect_side_chain_atoms(
|
||||
mol: Chem.Mol,
|
||||
start_atom_idx: int,
|
||||
ring_atom_indices: Iterable[int],
|
||||
) -> list[int]:
|
||||
ring_atom_set = set(ring_atom_indices)
|
||||
queue: deque[int] = deque([start_atom_idx])
|
||||
visited: set[int] = set()
|
||||
side_chain_atoms: list[int] = []
|
||||
|
||||
while queue:
|
||||
atom_idx = queue.popleft()
|
||||
if atom_idx in visited or atom_idx in ring_atom_set:
|
||||
continue
|
||||
visited.add(atom_idx)
|
||||
side_chain_atoms.append(atom_idx)
|
||||
|
||||
atom = mol.GetAtomWithIdx(atom_idx)
|
||||
for neighbor in atom.GetNeighbors():
|
||||
neighbor_idx = neighbor.GetIdx()
|
||||
if neighbor_idx not in visited and neighbor_idx not in ring_atom_set:
|
||||
queue.append(neighbor_idx)
|
||||
|
||||
return side_chain_atoms
|
||||
|
||||
|
||||
def is_intrinsic_lactone_neighbor(
|
||||
mol: Chem.Mol,
|
||||
candidate: DetectedMacrolactone,
|
||||
ring_atom_idx: int,
|
||||
neighbor_idx: int,
|
||||
) -> bool:
|
||||
if ring_atom_idx != candidate.carbonyl_carbon_idx:
|
||||
return False
|
||||
bond = mol.GetBondBetweenAtoms(ring_atom_idx, neighbor_idx)
|
||||
neighbor = mol.GetAtomWithIdx(neighbor_idx)
|
||||
return bond is not None and bond.GetBondType() == Chem.BondType.DOUBLE and neighbor.GetAtomicNum() == 8
|
||||
|
||||
|
||||
def build_fragment_smiles(
|
||||
mol: Chem.Mol,
|
||||
side_chain_atoms: list[int],
|
||||
side_chain_start_idx: int,
|
||||
ring_atom_idx: int,
|
||||
dummy_isotope: int,
|
||||
) -> str:
|
||||
fragment_mol = Chem.RWMol()
|
||||
old_to_new: dict[int, int] = {}
|
||||
|
||||
for old_idx in side_chain_atoms:
|
||||
old_atom = mol.GetAtomWithIdx(old_idx)
|
||||
new_atom = Chem.Atom(old_atom)
|
||||
new_idx = fragment_mol.AddAtom(new_atom)
|
||||
old_to_new[old_idx] = new_idx
|
||||
|
||||
for old_idx in side_chain_atoms:
|
||||
atom = mol.GetAtomWithIdx(old_idx)
|
||||
for neighbor in atom.GetNeighbors():
|
||||
neighbor_idx = neighbor.GetIdx()
|
||||
if neighbor_idx in old_to_new and old_idx < neighbor_idx:
|
||||
bond = mol.GetBondBetweenAtoms(old_idx, neighbor_idx)
|
||||
fragment_mol.AddBond(
|
||||
old_to_new[old_idx],
|
||||
old_to_new[neighbor_idx],
|
||||
bond.GetBondType(),
|
||||
)
|
||||
|
||||
bond_to_ring = mol.GetBondBetweenAtoms(ring_atom_idx, side_chain_start_idx)
|
||||
if bond_to_ring is None:
|
||||
raise RingNumberingError("Missing ring-to-side-chain bond during fragmentation.")
|
||||
|
||||
dummy_atom = Chem.Atom(0)
|
||||
dummy_atom.SetIsotope(dummy_isotope)
|
||||
dummy_idx = fragment_mol.AddAtom(dummy_atom)
|
||||
fragment_mol.AddBond(
|
||||
old_to_new[side_chain_start_idx],
|
||||
dummy_idx,
|
||||
bond_to_ring.GetBondType(),
|
||||
)
|
||||
|
||||
fragment = fragment_mol.GetMol()
|
||||
Chem.SanitizeMol(fragment)
|
||||
return Chem.MolToSmiles(fragment, isomericSmiles=True)
|
||||
|
||||
|
||||
def _find_ring_ester_atoms(mol: Chem.Mol, ring_atoms: Iterable[int]) -> tuple[int, int] | None:
|
||||
ring_atom_set = set(ring_atoms)
|
||||
matches: list[tuple[int, int]] = []
|
||||
|
||||
for atom_idx in ring_atom_set:
|
||||
atom = mol.GetAtomWithIdx(atom_idx)
|
||||
if atom.GetAtomicNum() != 6:
|
||||
continue
|
||||
|
||||
has_carbonyl_oxygen = any(
|
||||
bond.GetBondType() == Chem.BondType.DOUBLE and bond.GetOtherAtom(atom).GetAtomicNum() == 8
|
||||
for bond in atom.GetBonds()
|
||||
)
|
||||
if not has_carbonyl_oxygen:
|
||||
continue
|
||||
|
||||
for neighbor in atom.GetNeighbors():
|
||||
neighbor_idx = neighbor.GetIdx()
|
||||
bond = mol.GetBondBetweenAtoms(atom_idx, neighbor_idx)
|
||||
if (
|
||||
neighbor_idx in ring_atom_set
|
||||
and neighbor.GetAtomicNum() == 8
|
||||
and bond is not None
|
||||
and bond.GetBondType() == Chem.BondType.SINGLE
|
||||
):
|
||||
ring_neighbors = [
|
||||
ring_neighbor.GetIdx()
|
||||
for ring_neighbor in neighbor.GetNeighbors()
|
||||
if ring_neighbor.GetIdx() in ring_atom_set
|
||||
]
|
||||
if len(ring_neighbors) == 2:
|
||||
matches.append((atom_idx, neighbor_idx))
|
||||
|
||||
if not matches:
|
||||
return None
|
||||
return sorted(matches)[0]
|
||||
27
src/macro_lactone_toolkit/analyzer.py
Normal file
27
src/macro_lactone_toolkit/analyzer.py
Normal file
@@ -0,0 +1,27 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from rdkit import Chem
|
||||
|
||||
from ._core import ensure_mol, find_macrolactone_candidates
|
||||
|
||||
|
||||
class MacroLactoneAnalyzer:
|
||||
"""Identify valid 12-20 membered macrolactones."""
|
||||
|
||||
def get_valid_ring_sizes(self, mol_input: str | Chem.Mol) -> list[int]:
|
||||
mol, _ = ensure_mol(mol_input)
|
||||
candidates = find_macrolactone_candidates(mol)
|
||||
return sorted({candidate.ring_size for candidate in candidates})
|
||||
|
||||
def analyze_molecule(self, mol_input: str | Chem.Mol) -> dict:
|
||||
mol, smiles = ensure_mol(mol_input)
|
||||
candidates = find_macrolactone_candidates(mol)
|
||||
valid_ring_sizes = sorted({candidate.ring_size for candidate in candidates})
|
||||
is_ambiguous = len(valid_ring_sizes) > 1 or len(candidates) > 1
|
||||
return {
|
||||
"smiles": smiles,
|
||||
"valid_ring_sizes": valid_ring_sizes,
|
||||
"candidate_count": len(candidates),
|
||||
"is_ambiguous": is_ambiguous,
|
||||
"selected_ring_size": valid_ring_sizes[0] if len(valid_ring_sizes) == 1 and len(candidates) == 1 else None,
|
||||
}
|
||||
160
src/macro_lactone_toolkit/cli.py
Normal file
160
src/macro_lactone_toolkit/cli.py
Normal file
@@ -0,0 +1,160 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from dataclasses import asdict
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from .analyzer import MacroLactoneAnalyzer
|
||||
from .errors import MacrolactoneError
|
||||
from .fragmenter import MacrolactoneFragmenter
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = build_parser()
|
||||
args = parser.parse_args()
|
||||
if not hasattr(args, "func"):
|
||||
parser.print_help()
|
||||
raise SystemExit(1)
|
||||
args.func(args)
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
|
||||
parser = argparse.ArgumentParser(prog="macro-lactone-toolkit")
|
||||
subparsers = parser.add_subparsers(dest="command")
|
||||
|
||||
analyze = subparsers.add_parser("analyze")
|
||||
_add_common_input_arguments(analyze)
|
||||
analyze.add_argument("--ring-size", type=int, default=None)
|
||||
analyze.set_defaults(func=run_analyze)
|
||||
|
||||
number = subparsers.add_parser("number")
|
||||
number.add_argument("--smiles", required=True)
|
||||
number.add_argument("--ring-size", type=int, default=None)
|
||||
number.set_defaults(func=run_number)
|
||||
|
||||
fragment = subparsers.add_parser("fragment")
|
||||
_add_common_input_arguments(fragment)
|
||||
fragment.add_argument("--ring-size", type=int, default=None)
|
||||
fragment.add_argument("--parent-id", default=None)
|
||||
fragment.add_argument("--errors-output", default=None)
|
||||
fragment.set_defaults(func=run_fragment)
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
def run_analyze(args: argparse.Namespace) -> None:
|
||||
analyzer = MacroLactoneAnalyzer()
|
||||
if args.smiles:
|
||||
payload = analyzer.analyze_molecule(args.smiles)
|
||||
_write_output(payload, args.output)
|
||||
return
|
||||
|
||||
rows = _read_csv_rows(args.input, args.smiles_column, args.id_column)
|
||||
payload = []
|
||||
for row in rows:
|
||||
analysis = analyzer.analyze_molecule(row["smiles"])
|
||||
analysis["parent_id"] = row["parent_id"]
|
||||
payload.append(analysis)
|
||||
_write_output(payload, args.output)
|
||||
|
||||
|
||||
def run_number(args: argparse.Namespace) -> None:
|
||||
fragmenter = MacrolactoneFragmenter(ring_size=args.ring_size)
|
||||
payload = fragmenter.number_molecule(args.smiles).to_dict()
|
||||
_write_json(payload, None)
|
||||
|
||||
|
||||
def run_fragment(args: argparse.Namespace) -> None:
|
||||
fragmenter = MacrolactoneFragmenter(ring_size=args.ring_size)
|
||||
|
||||
if args.smiles:
|
||||
result = fragmenter.fragment_molecule(args.smiles, parent_id=args.parent_id)
|
||||
_write_output(result.to_dict(), args.output)
|
||||
return
|
||||
|
||||
rows = _read_csv_rows(args.input, args.smiles_column, args.id_column)
|
||||
fragment_rows: list[dict] = []
|
||||
error_rows: list[dict] = []
|
||||
|
||||
for row in rows:
|
||||
try:
|
||||
result = fragmenter.fragment_molecule(row["smiles"], parent_id=row["parent_id"])
|
||||
except MacrolactoneError as exc:
|
||||
error_rows.append(
|
||||
{
|
||||
"parent_id": row["parent_id"],
|
||||
"smiles": row["smiles"],
|
||||
"error_type": type(exc).__name__,
|
||||
"error_message": str(exc),
|
||||
}
|
||||
)
|
||||
continue
|
||||
|
||||
for fragment in result.fragments:
|
||||
fragment_rows.append(
|
||||
{
|
||||
"parent_id": result.parent_id,
|
||||
"ring_size": result.ring_size,
|
||||
**fragment.to_dict(),
|
||||
}
|
||||
)
|
||||
|
||||
if args.output:
|
||||
_write_output(fragment_rows, args.output)
|
||||
else:
|
||||
_write_json({"fragments": fragment_rows, "errors": error_rows}, None)
|
||||
|
||||
if args.errors_output:
|
||||
_write_output(error_rows, args.errors_output)
|
||||
|
||||
|
||||
def _add_common_input_arguments(parser: argparse.ArgumentParser) -> None:
|
||||
group = parser.add_mutually_exclusive_group(required=True)
|
||||
group.add_argument("--smiles")
|
||||
group.add_argument("--input")
|
||||
parser.add_argument("--smiles-column", default="smiles")
|
||||
parser.add_argument("--id-column", default="id")
|
||||
parser.add_argument("--output", default=None)
|
||||
|
||||
|
||||
def _read_csv_rows(input_path: str, smiles_column: str, id_column: str) -> list[dict]:
|
||||
dataframe = pd.read_csv(input_path)
|
||||
rows = []
|
||||
for index, row in dataframe.iterrows():
|
||||
parent_id = row[id_column] if id_column in dataframe.columns else f"row_{index}"
|
||||
rows.append(
|
||||
{
|
||||
"parent_id": str(parent_id),
|
||||
"smiles": row[smiles_column],
|
||||
}
|
||||
)
|
||||
return rows
|
||||
|
||||
|
||||
def _write_output(payload: list[dict] | dict, output_path: str | None) -> None:
|
||||
if output_path is None:
|
||||
_write_json(payload, None)
|
||||
return
|
||||
|
||||
path = Path(output_path)
|
||||
if path.suffix.lower() == ".csv":
|
||||
dataframe = pd.DataFrame(payload)
|
||||
dataframe.to_csv(path, index=False)
|
||||
return
|
||||
|
||||
_write_json(payload, path)
|
||||
|
||||
|
||||
def _write_json(payload: list[dict] | dict, output_path: Path | None) -> None:
|
||||
text = json.dumps(payload, indent=2, ensure_ascii=False)
|
||||
if output_path is None:
|
||||
print(text)
|
||||
else:
|
||||
output_path.write_text(text + "\n", encoding="utf-8")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
18
src/macro_lactone_toolkit/errors.py
Normal file
18
src/macro_lactone_toolkit/errors.py
Normal file
@@ -0,0 +1,18 @@
|
||||
class MacrolactoneError(Exception):
|
||||
"""Base class for macrolactone toolkit errors."""
|
||||
|
||||
|
||||
class MacrolactoneDetectionError(MacrolactoneError):
|
||||
"""Raised when a valid macrolactone cannot be detected."""
|
||||
|
||||
|
||||
class AmbiguousMacrolactoneError(MacrolactoneDetectionError):
|
||||
"""Raised when multiple valid macrolactone candidates are detected."""
|
||||
|
||||
|
||||
class RingNumberingError(MacrolactoneError):
|
||||
"""Raised when ring numbering cannot be assigned."""
|
||||
|
||||
|
||||
class FragmentationError(MacrolactoneError):
|
||||
"""Raised when side-chain fragmentation fails."""
|
||||
114
src/macro_lactone_toolkit/fragmenter.py
Normal file
114
src/macro_lactone_toolkit/fragmenter.py
Normal file
@@ -0,0 +1,114 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from rdkit import Chem
|
||||
from rdkit.Chem import Descriptors
|
||||
|
||||
from ._core import (
|
||||
build_fragment_smiles,
|
||||
build_numbering_result,
|
||||
collect_side_chain_atoms,
|
||||
ensure_mol,
|
||||
find_macrolactone_candidates,
|
||||
is_intrinsic_lactone_neighbor,
|
||||
)
|
||||
from .analyzer import MacroLactoneAnalyzer
|
||||
from .errors import AmbiguousMacrolactoneError, FragmentationError, MacrolactoneDetectionError
|
||||
from .models import FragmentationResult, RingNumberingResult, SideChainFragment
|
||||
|
||||
|
||||
class MacrolactoneFragmenter:
|
||||
"""Number and fragment 12-20 membered macrolactones."""
|
||||
|
||||
def __init__(self, ring_size: int | None = None):
|
||||
if ring_size is not None and not 12 <= ring_size <= 20:
|
||||
raise ValueError("ring_size must be between 12 and 20")
|
||||
self.ring_size = ring_size
|
||||
self.analyzer = MacroLactoneAnalyzer()
|
||||
|
||||
def number_molecule(self, mol_input: str | Chem.Mol) -> RingNumberingResult:
|
||||
mol, _ = ensure_mol(mol_input)
|
||||
candidate = self._select_candidate(mol)
|
||||
return build_numbering_result(mol, candidate)
|
||||
|
||||
def fragment_molecule(
|
||||
self,
|
||||
mol_input: str | Chem.Mol,
|
||||
parent_id: str | None = None,
|
||||
) -> FragmentationResult:
|
||||
mol, parent_smiles = ensure_mol(mol_input)
|
||||
candidate = self._select_candidate(mol)
|
||||
numbering = build_numbering_result(mol, candidate)
|
||||
parent_id = parent_id or "molecule_0"
|
||||
|
||||
ring_atom_set = set(candidate.ring_atoms)
|
||||
fragments: list[SideChainFragment] = []
|
||||
|
||||
for position, ring_atom_idx in numbering.position_to_atom.items():
|
||||
ring_atom = mol.GetAtomWithIdx(ring_atom_idx)
|
||||
for neighbor in ring_atom.GetNeighbors():
|
||||
neighbor_idx = neighbor.GetIdx()
|
||||
if neighbor_idx in ring_atom_set:
|
||||
continue
|
||||
if is_intrinsic_lactone_neighbor(mol, candidate, ring_atom_idx, neighbor_idx):
|
||||
continue
|
||||
|
||||
side_chain_atoms = collect_side_chain_atoms(mol, neighbor_idx, ring_atom_set)
|
||||
if not side_chain_atoms:
|
||||
continue
|
||||
|
||||
try:
|
||||
fragment_smiles_labeled = build_fragment_smiles(
|
||||
mol=mol,
|
||||
side_chain_atoms=side_chain_atoms,
|
||||
side_chain_start_idx=neighbor_idx,
|
||||
ring_atom_idx=ring_atom_idx,
|
||||
dummy_isotope=position,
|
||||
)
|
||||
fragment_smiles_plain = build_fragment_smiles(
|
||||
mol=mol,
|
||||
side_chain_atoms=side_chain_atoms,
|
||||
side_chain_start_idx=neighbor_idx,
|
||||
ring_atom_idx=ring_atom_idx,
|
||||
dummy_isotope=0,
|
||||
)
|
||||
except Exception as exc: # pragma: no cover - sanitized by tests
|
||||
raise FragmentationError(str(exc)) from exc
|
||||
|
||||
plain_fragment = Chem.MolFromSmiles(fragment_smiles_plain)
|
||||
if plain_fragment is None:
|
||||
raise FragmentationError("Generated fragment_smiles_plain could not be read by RDKit.")
|
||||
|
||||
fragment_id = f"{parent_id}_frag_{len(fragments)}"
|
||||
fragments.append(
|
||||
SideChainFragment(
|
||||
parent_id=parent_id,
|
||||
fragment_id=fragment_id,
|
||||
cleavage_position=position,
|
||||
attachment_atom_idx=ring_atom_idx,
|
||||
fragment_smiles_labeled=fragment_smiles_labeled,
|
||||
fragment_smiles_plain=fragment_smiles_plain,
|
||||
atom_count=sum(1 for atom in plain_fragment.GetAtoms() if atom.GetAtomicNum() != 0),
|
||||
molecular_weight=round(Descriptors.MolWt(plain_fragment), 4),
|
||||
)
|
||||
)
|
||||
|
||||
return FragmentationResult(
|
||||
parent_id=parent_id,
|
||||
parent_smiles=parent_smiles,
|
||||
ring_size=numbering.ring_size,
|
||||
numbering=numbering,
|
||||
fragments=fragments,
|
||||
)
|
||||
|
||||
def _select_candidate(self, mol: Chem.Mol):
|
||||
candidates = find_macrolactone_candidates(mol, ring_size=self.ring_size)
|
||||
if not candidates:
|
||||
requested = f"{self.ring_size}-membered " if self.ring_size is not None else ""
|
||||
raise MacrolactoneDetectionError(f"No valid {requested}macrolactone was detected.")
|
||||
|
||||
valid_ring_sizes = sorted({candidate.ring_size for candidate in candidates})
|
||||
if len(candidates) > 1 or len(valid_ring_sizes) > 1:
|
||||
raise AmbiguousMacrolactoneError(
|
||||
"Multiple valid macrolactone candidates were detected. Pass ring_size explicitly."
|
||||
)
|
||||
return candidates[0]
|
||||
52
src/macro_lactone_toolkit/models.py
Normal file
52
src/macro_lactone_toolkit/models.py
Normal file
@@ -0,0 +1,52 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import asdict, dataclass, field
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class RingNumberingResult:
|
||||
ring_size: int
|
||||
ring_atoms: list[int]
|
||||
ordered_atoms: list[int]
|
||||
carbonyl_carbon_idx: int
|
||||
ester_oxygen_idx: int
|
||||
atom_to_position: dict[int, int]
|
||||
position_to_atom: dict[int, int]
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return asdict(self)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class SideChainFragment:
|
||||
parent_id: str
|
||||
fragment_id: str
|
||||
cleavage_position: int
|
||||
attachment_atom_idx: int
|
||||
fragment_smiles_labeled: str
|
||||
fragment_smiles_plain: str
|
||||
atom_count: int
|
||||
molecular_weight: float
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return asdict(self)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class FragmentationResult:
|
||||
parent_id: str
|
||||
parent_smiles: str
|
||||
ring_size: int
|
||||
numbering: RingNumberingResult
|
||||
fragments: list[SideChainFragment] = field(default_factory=list)
|
||||
warnings: list[str] = field(default_factory=list)
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"parent_id": self.parent_id,
|
||||
"parent_smiles": self.parent_smiles,
|
||||
"ring_size": self.ring_size,
|
||||
"numbering": self.numbering.to_dict(),
|
||||
"fragments": [fragment.to_dict() for fragment in self.fragments],
|
||||
"warnings": list(self.warnings),
|
||||
}
|
||||
9
src/macro_lactone_toolkit/splicing/__init__.py
Normal file
9
src/macro_lactone_toolkit/splicing/__init__.py
Normal file
@@ -0,0 +1,9 @@
|
||||
from .engine import splice_molecule
|
||||
from .fragment_prep import activate_fragment
|
||||
from .scaffold_prep import prepare_macrolactone_scaffold
|
||||
|
||||
__all__ = [
|
||||
"activate_fragment",
|
||||
"prepare_macrolactone_scaffold",
|
||||
"splice_molecule",
|
||||
]
|
||||
51
src/macro_lactone_toolkit/splicing/engine.py
Normal file
51
src/macro_lactone_toolkit/splicing/engine.py
Normal file
@@ -0,0 +1,51 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from rdkit import Chem
|
||||
|
||||
|
||||
def splice_molecule(scaffold: Chem.Mol, fragment: Chem.Mol, position: int) -> Chem.Mol:
|
||||
combined = Chem.CombineMols(scaffold, fragment)
|
||||
rwmol = Chem.RWMol(combined)
|
||||
|
||||
scaffold_atom_count = scaffold.GetNumAtoms()
|
||||
total_atoms = rwmol.GetNumAtoms()
|
||||
|
||||
scaffold_dummy_idx = -1
|
||||
for atom_idx in range(scaffold_atom_count):
|
||||
atom = rwmol.GetAtomWithIdx(atom_idx)
|
||||
if atom.GetAtomicNum() == 0 and atom.GetIsotope() == position:
|
||||
scaffold_dummy_idx = atom_idx
|
||||
break
|
||||
if scaffold_dummy_idx == -1:
|
||||
raise ValueError(f"Scaffold dummy atom with isotope {position} not found")
|
||||
|
||||
fragment_dummy_idx = -1
|
||||
for atom_idx in range(scaffold_atom_count, total_atoms):
|
||||
atom = rwmol.GetAtomWithIdx(atom_idx)
|
||||
if atom.GetAtomicNum() == 0:
|
||||
fragment_dummy_idx = atom_idx
|
||||
break
|
||||
if fragment_dummy_idx == -1:
|
||||
raise ValueError("Fragment does not contain a dummy atom")
|
||||
|
||||
scaffold_dummy = rwmol.GetAtomWithIdx(scaffold_dummy_idx)
|
||||
fragment_dummy = rwmol.GetAtomWithIdx(fragment_dummy_idx)
|
||||
if scaffold_dummy.GetDegree() != 1:
|
||||
raise ValueError("Scaffold dummy atom must have exactly one neighbor")
|
||||
if fragment_dummy.GetDegree() != 1:
|
||||
raise ValueError("Fragment dummy atom must have exactly one neighbor")
|
||||
|
||||
scaffold_anchor_idx = scaffold_dummy.GetNeighbors()[0].GetIdx()
|
||||
fragment_anchor_idx = fragment_dummy.GetNeighbors()[0].GetIdx()
|
||||
fragment_bond = rwmol.GetBondBetweenAtoms(fragment_dummy_idx, fragment_anchor_idx)
|
||||
if fragment_bond is None:
|
||||
raise ValueError("Fragment dummy atom is not bonded to an anchor atom")
|
||||
|
||||
rwmol.AddBond(scaffold_anchor_idx, fragment_anchor_idx, fragment_bond.GetBondType())
|
||||
|
||||
for atom_idx in sorted((scaffold_dummy_idx, fragment_dummy_idx), reverse=True):
|
||||
rwmol.RemoveAtom(atom_idx)
|
||||
|
||||
product = rwmol.GetMol()
|
||||
Chem.SanitizeMol(product)
|
||||
return product
|
||||
49
src/macro_lactone_toolkit/splicing/fragment_prep.py
Normal file
49
src/macro_lactone_toolkit/splicing/fragment_prep.py
Normal file
@@ -0,0 +1,49 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from rdkit import Chem
|
||||
|
||||
|
||||
def activate_fragment(smiles: str, strategy: str = "smart") -> Chem.Mol:
|
||||
mol = Chem.MolFromSmiles(smiles)
|
||||
if mol is None:
|
||||
raise ValueError(f"Invalid SMILES string: {smiles}")
|
||||
|
||||
target_idx = -1
|
||||
|
||||
if strategy == "smart":
|
||||
for smarts in ("[N;!H0]", "[O;H1]", "[S;H1]"):
|
||||
pattern = Chem.MolFromSmarts(smarts)
|
||||
if pattern is None:
|
||||
continue
|
||||
matches = mol.GetSubstructMatches(pattern)
|
||||
if matches:
|
||||
target_idx = matches[0][0]
|
||||
break
|
||||
if target_idx == -1:
|
||||
strategy = "random"
|
||||
|
||||
if strategy == "random":
|
||||
carbon_candidates: list[int] = []
|
||||
other_candidates: list[int] = []
|
||||
for atom in mol.GetAtoms():
|
||||
if atom.GetTotalNumHs() == 0:
|
||||
continue
|
||||
if atom.GetAtomicNum() == 6:
|
||||
carbon_candidates.append(atom.GetIdx())
|
||||
else:
|
||||
other_candidates.append(atom.GetIdx())
|
||||
candidates = carbon_candidates or other_candidates
|
||||
if not candidates:
|
||||
raise ValueError("No suitable atoms with hydrogens found to attach to.")
|
||||
target_idx = candidates[0]
|
||||
|
||||
if target_idx == -1:
|
||||
raise ValueError("Could not identify a target atom for activation.")
|
||||
|
||||
rwmol = Chem.RWMol(mol)
|
||||
dummy_idx = rwmol.AddAtom(Chem.Atom(0))
|
||||
rwmol.AddBond(target_idx, dummy_idx, Chem.BondType.SINGLE)
|
||||
|
||||
result = rwmol.GetMol()
|
||||
Chem.SanitizeMol(result)
|
||||
return result
|
||||
63
src/macro_lactone_toolkit/splicing/scaffold_prep.py
Normal file
63
src/macro_lactone_toolkit/splicing/scaffold_prep.py
Normal file
@@ -0,0 +1,63 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Iterable
|
||||
|
||||
from rdkit import Chem
|
||||
|
||||
from .._core import collect_side_chain_atoms, ensure_mol, find_macrolactone_candidates, is_intrinsic_lactone_neighbor
|
||||
from ..fragmenter import MacrolactoneFragmenter
|
||||
|
||||
|
||||
def prepare_macrolactone_scaffold(
|
||||
mol_input: str | Chem.Mol,
|
||||
positions: Iterable[int],
|
||||
ring_size: int | None = None,
|
||||
) -> tuple[Chem.Mol, dict[int, int]]:
|
||||
positions = list(positions)
|
||||
mol, _ = ensure_mol(mol_input)
|
||||
fragmenter = MacrolactoneFragmenter(ring_size=ring_size)
|
||||
numbering = fragmenter.number_molecule(mol)
|
||||
candidate = find_macrolactone_candidates(mol, ring_size=numbering.ring_size)[0]
|
||||
ring_atom_set = set(numbering.ordered_atoms)
|
||||
|
||||
atoms_to_remove: set[int] = set()
|
||||
dummy_specs: list[tuple[int, int, Chem.BondType]] = []
|
||||
|
||||
for position in positions:
|
||||
if position not in numbering.position_to_atom:
|
||||
raise ValueError(f"Position {position} not found in ring numbering.")
|
||||
ring_atom_idx = numbering.position_to_atom[position]
|
||||
ring_atom = mol.GetAtomWithIdx(ring_atom_idx)
|
||||
for neighbor in ring_atom.GetNeighbors():
|
||||
neighbor_idx = neighbor.GetIdx()
|
||||
if neighbor_idx in ring_atom_set:
|
||||
continue
|
||||
if is_intrinsic_lactone_neighbor(mol, candidate, ring_atom_idx, neighbor_idx):
|
||||
continue
|
||||
side_chain_atoms = collect_side_chain_atoms(mol, neighbor_idx, ring_atom_set)
|
||||
atoms_to_remove.update(side_chain_atoms)
|
||||
bond = mol.GetBondBetweenAtoms(ring_atom_idx, neighbor_idx)
|
||||
if bond is not None:
|
||||
dummy_specs.append((ring_atom_idx, position, bond.GetBondType()))
|
||||
|
||||
if not any(spec_position == position for _, spec_position, _ in dummy_specs):
|
||||
dummy_specs.append((ring_atom_idx, position, Chem.BondType.SINGLE))
|
||||
|
||||
rwmol = Chem.RWMol(mol)
|
||||
for ring_atom_idx, position, bond_type in dummy_specs:
|
||||
dummy_atom = Chem.Atom(0)
|
||||
dummy_atom.SetIsotope(position)
|
||||
dummy_idx = rwmol.AddAtom(dummy_atom)
|
||||
rwmol.AddBond(ring_atom_idx, dummy_idx, bond_type)
|
||||
|
||||
for atom_idx in sorted(atoms_to_remove, reverse=True):
|
||||
rwmol.RemoveAtom(atom_idx)
|
||||
|
||||
scaffold = rwmol.GetMol()
|
||||
Chem.SanitizeMol(scaffold)
|
||||
dummy_map = {
|
||||
atom.GetIsotope(): atom.GetIdx()
|
||||
for atom in scaffold.GetAtoms()
|
||||
if atom.GetAtomicNum() == 0 and atom.GetIsotope() in positions
|
||||
}
|
||||
return scaffold, dummy_map
|
||||
Reference in New Issue
Block a user