Files
macrolactone-toolkit/tests/helpers.py
lingyuzeng c0ead42384 feat(toolkit): add classification and migration
Implement the standard/non-standard/not-macrolactone classification layer
and integrate it into analyzer, fragmenter, and CLI outputs.

Port the remaining legacy package capabilities into new visualization and
workflow modules, restore batch/statistics/SDF scripts on top of the flat
CSV workflow, and update active docs to the new package API.
2026-03-18 23:56:41 +08:00

202 lines
5.5 KiB
Python

from __future__ import annotations
from dataclasses import dataclass
from typing import Mapping
from rdkit import Chem
@dataclass(frozen=True)
class BuiltMacrolactone:
    """Immutable result bundle returned by the molecule builder helpers.

    The dataclass is frozen, so fields cannot be rebound after construction;
    the ``position_to_atom`` dict object itself remains an ordinary dict.
    """

    # RDKit molecule produced by a builder (builders sanitize it first).
    mol: Chem.Mol
    # SMILES string the builder generated from ``mol``.
    smiles: str
    # Ring position number -> atom index in ``mol``. The overlapping-candidate
    # builder returns this empty.
    position_to_atom: dict[int, int]
def build_macrolactone(
    ring_size: int,
    side_chains: Mapping[int, str] | None = None,
    ring_atom_symbols: Mapping[int, str] | None = None,
) -> BuiltMacrolactone:
    """Build a single-ring macrolactone with numbered ring positions.

    Position 1 is the carbonyl carbon, position 2 is the ring oxygen bonded
    to it, and positions 3..ring_size are the remaining ring atoms (carbon
    unless overridden). The exocyclic carbonyl oxygen is double-bonded to
    position 1 and is not given a position number.

    Args:
        ring_size: Number of ring atoms; must be between 12 and 20 inclusive.
        side_chains: Optional mapping of ring position to a side-chain name
            accepted by ``_add_side_chain``.
        ring_atom_symbols: Optional mapping of ring position to an element
            symbol. Only positions 3..ring_size may be overridden, because
            positions 1 and 2 are fixed as the lactone carbon and oxygen.

    Returns:
        A ``BuiltMacrolactone`` holding the sanitized molecule, its SMILES
        and the position -> atom index map.

    Raises:
        ValueError: If ``ring_size`` is out of range, if a
            ``ring_atom_symbols`` key is not in 3..ring_size, if a
            ``side_chains`` key is not a ring position, or if a side-chain
            name is unsupported.
    """
    if not 12 <= ring_size <= 20:
        raise ValueError("ring_size must be between 12 and 20")

    side_chains = dict(side_chains or {})
    ring_atom_symbols = dict(ring_atom_symbols or {})

    # Reject overrides that would otherwise be silently ignored (positions 1
    # and 2 are hard-wired, anything past ring_size does not exist). Doing it
    # before any atoms are created keeps the failure cheap and explicit.
    overridable_positions = range(3, ring_size + 1)
    for position in ring_atom_symbols:
        if position not in overridable_positions:
            raise ValueError(
                f"Invalid ring atom position: {position} "
                f"(must be between 3 and {ring_size})"
            )

    rwmol = Chem.RWMol()

    # Atom creation order fixes the atom indices: lactone carbon, ring
    # oxygen, remaining ring atoms, then the exocyclic carbonyl oxygen.
    position_to_atom: dict[int, int] = {}
    position_to_atom[1] = rwmol.AddAtom(Chem.Atom("C"))
    position_to_atom[2] = rwmol.AddAtom(Chem.Atom("O"))
    for position in overridable_positions:
        symbol = ring_atom_symbols.get(position, "C")
        position_to_atom[position] = rwmol.AddAtom(Chem.Atom(symbol))
    carbonyl_oxygen_idx = rwmol.AddAtom(Chem.Atom("O"))

    # Single bonds 1-2, 2-3, ..., (ring_size-1)-ring_size, then close the ring.
    for position in range(1, ring_size):
        rwmol.AddBond(
            position_to_atom[position],
            position_to_atom[position + 1],
            Chem.BondType.SINGLE,
        )
    rwmol.AddBond(position_to_atom[ring_size], position_to_atom[1], Chem.BondType.SINGLE)
    rwmol.AddBond(position_to_atom[1], carbonyl_oxygen_idx, Chem.BondType.DOUBLE)

    for position, side_chain in side_chains.items():
        if position not in position_to_atom:
            raise ValueError(f"Invalid ring position: {position}")
        _add_side_chain(rwmol, position_to_atom[position], side_chain)

    mol = rwmol.GetMol()
    Chem.SanitizeMol(mol)
    return BuiltMacrolactone(
        mol=mol,
        smiles=Chem.MolToSmiles(mol, isomericSmiles=True),
        position_to_atom=position_to_atom,
    )
def build_ambiguous_smiles() -> str:
    """Return the SMILES of a 12-ring and a 14-ring macrolactone combined.

    Two default macrolactones (ring sizes 12 and 14) are built separately and
    merged into one molecule object with ``Chem.CombineMols``; no bond is
    added between them. Presumably "ambiguous" refers to the result holding
    two candidate rings -- confirm against the tests that use it.
    """
    twelve_ring, fourteen_ring = (
        build_macrolactone(size).mol for size in (12, 14)
    )
    return Chem.MolToSmiles(
        Chem.CombineMols(twelve_ring, fourteen_ring),
        isomericSmiles=True,
    )
def build_non_standard_ring_atom_macrolactone(
    ring_size: int = 16,
    hetero_position: int = 5,
    atom_symbol: str = "N",
) -> BuiltMacrolactone:
    """Build a macrolactone whose ring contains one substituted atom.

    Args:
        ring_size: Ring size forwarded to ``build_macrolactone``.
        hetero_position: Ring position to substitute; must lie in
            3..ring_size.
        atom_symbol: Element symbol placed at ``hetero_position``.

    Returns:
        The ``BuiltMacrolactone`` produced by ``build_macrolactone``.

    Raises:
        ValueError: If ``hetero_position`` is below 3 or above ``ring_size``.
    """
    position_out_of_range = hetero_position < 3 or hetero_position > ring_size
    if position_out_of_range:
        raise ValueError("hetero_position must be between 3 and ring_size")

    symbol_overrides = {hetero_position: atom_symbol}
    return build_macrolactone(ring_size=ring_size, ring_atom_symbols=symbol_overrides)
def build_overlapping_candidate_macrolactone() -> BuiltMacrolactone:
    """Build a bicyclic molecule with two 12-membered lactone rings sharing atoms.

    Ring A runs A1-A2-S1-S2-S3-S4-A5..A10 and ring B runs
    B1-B2-S1-S2-S3-S4-B5..B10, so the S1..S4 segment belongs to both rings.
    A1/B1 are carbonyl carbons (double-bonded to AO/BO) and A2/B2 are the
    ring oxygens; every other atom is carbon.

    Returns:
        A ``BuiltMacrolactone`` with the sanitized molecule and its SMILES;
        ``position_to_atom`` is left empty.
    """
    # Creation order determines atom indices, so it is spelled out explicitly.
    atom_order = (
        "A1", "A2",
        "S1", "S2", "S3", "S4",
        "A5", "A6", "A7", "A8", "A9", "A10",
        "B1", "B2",
        "B5", "B6", "B7", "B8", "B9", "B10",
        "AO", "BO",
    )
    oxygen_labels = {"A2", "B2", "AO", "BO"}

    builder = Chem.RWMol()
    index_of: dict[str, int] = {}
    for label in atom_order:
        element = "O" if label in oxygen_labels else "C"
        index_of[label] = builder.AddAtom(Chem.Atom(element))

    # Each path contributes one single bond per consecutive label pair.
    single_bond_paths = (
        # Ring A, closed back onto A1.
        ("A1", "A2", "S1", "S2", "S3", "S4",
         "A5", "A6", "A7", "A8", "A9", "A10", "A1"),
        # Ring B joins the shared segment at S1 ...
        ("B1", "B2", "S1"),
        # ... and leaves it at S4, closing back onto B1.
        ("S4", "B5", "B6", "B7", "B8", "B9", "B10", "B1"),
    )
    for path in single_bond_paths:
        for start, end in zip(path, path[1:]):
            builder.AddBond(index_of[start], index_of[end], Chem.BondType.SINGLE)

    for carbonyl_carbon, carbonyl_oxygen in (("A1", "AO"), ("B1", "BO")):
        builder.AddBond(
            index_of[carbonyl_carbon],
            index_of[carbonyl_oxygen],
            Chem.BondType.DOUBLE,
        )

    molecule = builder.GetMol()
    Chem.SanitizeMol(molecule)
    return BuiltMacrolactone(
        mol=molecule,
        smiles=Chem.MolToSmiles(molecule, isomericSmiles=True),
        position_to_atom={},
    )
def canonicalize(smiles_or_mol: str | Chem.Mol) -> str:
    """Return the isomeric SMILES RDKit writes for a SMILES string or molecule.

    Args:
        smiles_or_mol: Either an RDKit ``Chem.Mol`` (used as-is) or a SMILES
            string to be parsed with ``Chem.MolFromSmiles``.

    Returns:
        The output of ``Chem.MolToSmiles(..., isomericSmiles=True)``.

    Raises:
        ValueError: If the SMILES string cannot be parsed.
    """
    already_mol = isinstance(smiles_or_mol, Chem.Mol)
    molecule = smiles_or_mol if already_mol else Chem.MolFromSmiles(smiles_or_mol)
    if molecule is None:
        raise ValueError(f"Unable to parse SMILES: {smiles_or_mol}")
    return Chem.MolToSmiles(molecule, isomericSmiles=True)
def _add_side_chain(rwmol: Chem.RWMol, ring_atom_idx: int, side_chain: str) -> None:
    """Attach a named carbon side chain to a ring atom, mutating ``rwmol``.

    Supported names: ``"methyl"`` (one carbon), ``"ethyl"`` (two carbons) and
    ``"exocyclic_alkene"`` (two carbons, the first double-bonded to the ring
    atom).

    Args:
        rwmol: Editable molecule that receives the new atoms and bonds.
        ring_atom_idx: Atom index in ``rwmol`` the chain is attached to.
        side_chain: Name of the side chain to add.

    Raises:
        ValueError: If ``side_chain`` is not one of the supported names.
    """
    # (name, number of carbons, is the bond to the ring atom a double bond?)
    supported_layouts = (
        ("methyl", 1, False),
        ("ethyl", 2, False),
        ("exocyclic_alkene", 2, True),
    )
    for name, carbon_count, attachment_is_double in supported_layouts:
        if side_chain == name:
            break
    else:
        raise ValueError(f"Unsupported side chain: {side_chain}")

    # Create all chain atoms first, then bond them, so atom and bond indices
    # come out in a fixed order.
    chain_atoms = [rwmol.AddAtom(Chem.Atom("C")) for _ in range(carbon_count)]
    if attachment_is_double:
        attachment_bond = Chem.BondType.DOUBLE
    else:
        attachment_bond = Chem.BondType.SINGLE
    rwmol.AddBond(ring_atom_idx, chain_atoms[0], attachment_bond)
    for inner, outer in zip(chain_atoms, chain_atoms[1:]):
        rwmol.AddBond(inner, outer, Chem.BondType.SINGLE)