feat(toolkit): add classification and migration

Implement the standard/non-standard/not-macrolactone classification layer
and integrate it into analyzer, fragmenter, and CLI outputs.

Port the remaining legacy package capabilities into new visualization and
workflow modules, restore batch/statistics/SDF scripts on top of the flat
CSV workflow, and update active docs to the new package API.
This commit is contained in:
2026-03-18 23:56:41 +08:00
parent 9ccbcfcd04
commit c0ead42384
24 changed files with 1497 additions and 313 deletions

View File

@@ -16,11 +16,13 @@ class BuiltMacrolactone:
def build_macrolactone(
ring_size: int,
side_chains: Mapping[int, str] | None = None,
ring_atom_symbols: Mapping[int, str] | None = None,
) -> BuiltMacrolactone:
if not 12 <= ring_size <= 20:
raise ValueError("ring_size must be between 12 and 20")
side_chains = dict(side_chains or {})
ring_atom_symbols = dict(ring_atom_symbols or {})
rwmol = Chem.RWMol()
position_to_atom: dict[int, int] = {
@@ -28,7 +30,7 @@ def build_macrolactone(
2: rwmol.AddAtom(Chem.Atom("O")),
}
for position in range(3, ring_size + 1):
position_to_atom[position] = rwmol.AddAtom(Chem.Atom("C"))
position_to_atom[position] = rwmol.AddAtom(Chem.Atom(ring_atom_symbols.get(position, "C")))
carbonyl_oxygen_idx = rwmol.AddAtom(Chem.Atom("O"))
@@ -63,6 +65,109 @@ def build_ambiguous_smiles() -> str:
return Chem.MolToSmiles(combined, isomericSmiles=True)
def build_non_standard_ring_atom_macrolactone(
    ring_size: int = 16,
    hetero_position: int = 5,
    atom_symbol: str = "N",
) -> "BuiltMacrolactone":
    """Build a macrolactone with one non-carbon ring atom outside positions 1 and 2.

    Args:
        ring_size: Number of ring atoms; forwarded unchanged to
            ``build_macrolactone``.
        hetero_position: Ring position (3..ring_size) that receives
            ``atom_symbol`` instead of carbon.
        atom_symbol: Element symbol placed at ``hetero_position``.

    Returns:
        The ``BuiltMacrolactone`` produced by ``build_macrolactone``.

    Raises:
        ValueError: If ``hetero_position`` is outside 3..ring_size, or if
            ``atom_symbol`` is carbon.
    """
    if not 3 <= hetero_position <= ring_size:
        raise ValueError("hetero_position must be between 3 and ring_size")
    # A carbon "hetero" atom would silently produce a standard macrolactone,
    # which defeats the purpose of this fixture.
    if atom_symbol == "C":
        raise ValueError("atom_symbol must not be carbon for a non-standard ring atom")
    return build_macrolactone(
        ring_size=ring_size,
        ring_atom_symbols={hetero_position: atom_symbol},
    )
def build_overlapping_candidate_macrolactone() -> BuiltMacrolactone:
    """Build a molecule with two 12-membered lactone rings that share atoms S1-S4.

    Ring A runs A1-A2-S1..S4-A5..A10 and ring B runs B1-B2-S1..S4-B5..B10.
    Each ring has its own ester oxygen (A2 / B2) and carbonyl oxygen (AO / BO).
    The returned ``position_to_atom`` mapping is empty.
    """
    # Creation order determines the atom indices, so the label order is fixed.
    ordered_labels = (
        "A1", "A2", "S1", "S2", "S3", "S4",
        "A5", "A6", "A7", "A8", "A9", "A10",
        "B1", "B2", "B5", "B6", "B7", "B8", "B9", "B10",
        "AO", "BO",
    )
    oxygen_labels = frozenset({"A2", "B2", "AO", "BO"})

    editable = Chem.RWMol()
    index_of: dict[str, int] = {}
    for label in ordered_labels:
        element = "O" if label in oxygen_labels else "C"
        index_of[label] = editable.AddAtom(Chem.Atom(element))

    # Each path is a chain of consecutive single bonds; walking the paths in
    # this order adds the bonds in the same sequence as an explicit pair list.
    single_bond_paths = (
        ("A1", "A2", "S1", "S2", "S3", "S4", "A5", "A6", "A7", "A8", "A9", "A10", "A1"),
        ("B1", "B2", "S1"),
        ("S4", "B5", "B6", "B7", "B8", "B9", "B10", "B1"),
    )
    for path in single_bond_paths:
        for start, end in zip(path, path[1:]):
            editable.AddBond(index_of[start], index_of[end], Chem.BondType.SINGLE)

    # Carbonyl oxygens of the two ester groups.
    for carbonyl_carbon, carbonyl_oxygen in (("A1", "AO"), ("B1", "BO")):
        editable.AddBond(
            index_of[carbonyl_carbon],
            index_of[carbonyl_oxygen],
            Chem.BondType.DOUBLE,
        )

    molecule = editable.GetMol()
    Chem.SanitizeMol(molecule)
    return BuiltMacrolactone(
        mol=molecule,
        smiles=Chem.MolToSmiles(molecule, isomericSmiles=True),
        position_to_atom={},
    )
def canonicalize(smiles_or_mol: str | Chem.Mol) -> str:
if isinstance(smiles_or_mol, Chem.Mol):
mol = smiles_or_mol

View File

@@ -6,7 +6,12 @@ import sys
import pandas as pd
from .helpers import build_ambiguous_smiles, build_macrolactone
from .helpers import (
build_ambiguous_smiles,
build_macrolactone,
build_non_standard_ring_atom_macrolactone,
build_overlapping_candidate_macrolactone,
)
def run_cli(*args: str) -> subprocess.CompletedProcess[str]:
@@ -24,7 +29,10 @@ def test_cli_smoke_commands():
analyze = run_cli("analyze", "--smiles", built.smiles)
assert analyze.returncode == 0, analyze.stderr
analyze_payload = json.loads(analyze.stdout)
assert analyze_payload["valid_ring_sizes"] == [16]
assert analyze_payload["classification"] == "standard_macrolactone"
assert analyze_payload["ring_size"] == 16
assert analyze_payload["primary_reason_code"] is None
assert analyze_payload["candidate_ring_sizes"] == [16]
number = run_cli("number", "--smiles", built.smiles)
assert number.returncode == 0, number.stderr
@@ -40,6 +48,55 @@ def test_cli_smoke_commands():
assert fragment_payload["fragments"][0]["fragment_smiles_labeled"]
def test_cli_analyze_reports_non_standard_classifications():
    """``analyze --smiles`` reports the classification, reason code and ring size
    for a ring-heteroatom molecule and for an overlapping-candidate molecule."""
    hetero = build_non_standard_ring_atom_macrolactone()
    overlap = build_overlapping_candidate_macrolactone()

    # (input SMILES, expected primary reason code, expected ring size)
    expectations = (
        (hetero.smiles, "contains_non_carbon_ring_atoms_outside_positions_1_2", 16),
        (overlap.smiles, "multiple_overlapping_macrocycle_candidates", 12),
    )
    for smiles, reason_code, ring_size in expectations:
        completed = run_cli("analyze", "--smiles", smiles)
        assert completed.returncode == 0, completed.stderr
        payload = json.loads(completed.stdout)
        assert payload["classification"] == "non_standard_macrocycle"
        assert payload["primary_reason_code"] == reason_code
        assert payload["ring_size"] == ring_size
def test_cli_analyze_csv_reports_classification_fields(tmp_path):
    """CSV-mode ``analyze`` reports both input ids, both classifications, and the
    reason-code and ring-size columns."""
    standard = build_macrolactone(14)
    hetero = build_non_standard_ring_atom_macrolactone()
    source_csv = tmp_path / "molecules.csv"
    report_csv = tmp_path / "analysis.csv"

    rows = [
        {"id": "valid_1", "smiles": standard.smiles},
        {"id": "hetero_1", "smiles": hetero.smiles},
    ]
    pd.DataFrame(rows).to_csv(source_csv, index=False)

    cli_args = ("analyze", "--input", str(source_csv), "--output", str(report_csv))
    completed = run_cli(*cli_args)
    assert completed.returncode == 0, completed.stderr

    report = pd.read_csv(report_csv)
    assert set(report["parent_id"]) == {"valid_1", "hetero_1"}
    assert set(report["classification"]) == {"standard_macrolactone", "non_standard_macrocycle"}
    for column in ("primary_reason_code", "ring_size"):
        assert column in report.columns
def test_cli_fragment_csv_skips_ambiguous_and_records_errors(tmp_path):
valid = build_macrolactone(14, {4: "methyl"})
ambiguous = build_ambiguous_smiles()

View File

@@ -8,7 +8,12 @@ from macro_lactone_toolkit import (
MacrolactoneFragmenter,
)
from .helpers import build_ambiguous_smiles, build_macrolactone
from .helpers import (
build_ambiguous_smiles,
build_macrolactone,
build_non_standard_ring_atom_macrolactone,
build_overlapping_candidate_macrolactone,
)
@pytest.mark.parametrize("ring_size", [12, 14, 16, 20])
@@ -25,6 +30,77 @@ def test_analyzer_rejects_non_lactone_macrocycle():
assert analyzer.get_valid_ring_sizes("C1CCCCCCCCCCC1") == []
@pytest.mark.parametrize("ring_size", [12, 14, 16, 20])
def test_analyzer_classifies_supported_ring_sizes(ring_size: int):
    """A plain macrolactone of each tested size is classified as standard with no reasons."""
    smiles = build_macrolactone(ring_size).smiles
    verdict = MacroLactoneAnalyzer().classify_macrocycle(smiles)

    assert verdict.classification == "standard_macrolactone"
    assert verdict.ring_size == ring_size
    # A standard result carries no reason codes or messages.
    assert verdict.primary_reason_code is None
    assert verdict.primary_reason_message is None
    assert verdict.all_reason_codes == []
    assert verdict.all_reason_messages == []
    assert verdict.candidate_ring_sizes == [ring_size]
def test_analyzer_classifies_ring_heteroatom_as_non_standard():
    """A ring heteroatom outside positions 1-2 yields a non-standard classification."""
    reason = "contains_non_carbon_ring_atoms_outside_positions_1_2"
    molecule = build_non_standard_ring_atom_macrolactone()

    outcome = MacroLactoneAnalyzer().classify_macrocycle(molecule.smiles)

    assert outcome.classification == "non_standard_macrocycle"
    assert outcome.ring_size == 16
    assert outcome.primary_reason_code == reason
    assert outcome.primary_reason_message == "Ring positions 3..N contain non-carbon atoms."
    assert outcome.all_reason_codes == [reason]
    assert outcome.candidate_ring_sizes == [16]
def test_analyzer_classifies_overlapping_candidates_as_non_standard():
    """Overlapping candidate rings yield a non-standard classification with ring size 12."""
    reason = "multiple_overlapping_macrocycle_candidates"
    molecule = build_overlapping_candidate_macrolactone()

    outcome = MacroLactoneAnalyzer().classify_macrocycle(molecule.smiles)

    assert outcome.classification == "non_standard_macrocycle"
    assert outcome.ring_size == 12
    assert outcome.primary_reason_code == reason
    assert outcome.primary_reason_message == "Overlapping macrolactone candidate rings were detected."
    assert outcome.all_reason_codes == [reason]
    assert outcome.candidate_ring_sizes == [12]
def test_analyzer_classifies_non_lactone_macrocycle():
    """An all-carbon 12-membered ring is classified as not a macrolactone."""
    carbocycle_smiles = "C1CCCCCCCCCCC1"
    reason = "no_lactone_ring_in_12_to_20_range"

    outcome = MacroLactoneAnalyzer().classify_macrocycle(carbocycle_smiles)

    assert outcome.classification == "not_macrolactone"
    assert outcome.ring_size is None
    assert outcome.primary_reason_code == reason
    assert outcome.primary_reason_message == "No 12-20 membered lactone ring was detected."
    assert outcome.all_reason_codes == [reason]
    assert outcome.candidate_ring_sizes == []
def test_analyzer_explicit_ring_size_miss_returns_requested_ring_not_found():
    """Requesting ring size 16 on a 12-membered macrolactone reports the miss."""
    reason = "requested_ring_size_not_found"
    twelve_ring = build_macrolactone(12)

    outcome = MacroLactoneAnalyzer().classify_macrocycle(twelve_ring.smiles, ring_size=16)

    assert outcome.classification == "not_macrolactone"
    assert outcome.ring_size is None
    assert outcome.primary_reason_code == reason
    assert outcome.primary_reason_message == "The requested ring size was not detected as a lactone ring."
    assert outcome.all_reason_codes == [reason]
    assert outcome.candidate_ring_sizes == []
def test_fragmenter_auto_numbers_ring_with_expected_positions():
built = build_macrolactone(16, {5: "methyl"})
result = MacrolactoneFragmenter().number_molecule(built.mol)
@@ -55,10 +131,35 @@ def test_fragmenter_requires_explicit_ring_size_for_ambiguous_molecule():
def test_fragmenter_raises_for_missing_macrolactone():
    """Numbering a molecule without a macrolactone ring raises a detection error
    whose message carries the classification and primary reason code."""
    # Only one ``raises`` context is used: an extra unmatched outer context
    # would never see the exception, because the inner one consumes it.
    with pytest.raises(
        MacrolactoneDetectionError,
        match="classification=not_macrolactone primary_reason_code=no_lactone_ring_in_12_to_20_range",
    ):
        MacrolactoneFragmenter().number_molecule("CCO")
def test_fragmenter_rejects_non_standard_macrocycle_with_reason_code():
    """Numbering a ring-heteroatom macrocycle raises with its classification and reason code."""
    expected_message = (
        "classification=non_standard_macrocycle "
        "primary_reason_code=contains_non_carbon_ring_atoms_outside_positions_1_2"
    )
    molecule = build_non_standard_ring_atom_macrolactone()
    with pytest.raises(MacrolactoneDetectionError, match=expected_message):
        MacrolactoneFragmenter().number_molecule(molecule.smiles)
def test_fragmenter_rejects_non_standard_macrocycle_during_fragmentation():
    """Fragmenting an overlapping-candidate macrocycle raises with its classification and reason code."""
    expected_message = (
        "classification=non_standard_macrocycle "
        "primary_reason_code=multiple_overlapping_macrocycle_candidates"
    )
    molecule = build_overlapping_candidate_macrolactone()
    with pytest.raises(MacrolactoneDetectionError, match=expected_message):
        MacrolactoneFragmenter().fragment_molecule(molecule.smiles)
def test_explicit_ring_size_selects_requested_ring():
built = build_macrolactone(14)
result = MacrolactoneFragmenter(ring_size=14).number_molecule(built.smiles)

View File

@@ -0,0 +1,149 @@
from __future__ import annotations
import json
import subprocess
import sys
from pathlib import Path
import pandas as pd
from macro_lactone_toolkit import MacrolactoneFragmenter
from .helpers import build_ambiguous_smiles, build_macrolactone
PROJECT_ROOT = Path(__file__).resolve().parents[1]
ACTIVE_TEXT_ASSETS = [
PROJECT_ROOT / "scripts" / "README.md",
PROJECT_ROOT / "docs" / "SUMMARY.md",
PROJECT_ROOT / "docs" / "project-docs" / "QUICK_COMMANDS.md",
PROJECT_ROOT / "notebooks" / "README_analyze_ring16.md",
]
def run_script(script_name: str, *args: str) -> subprocess.CompletedProcess[str]:
    """Run a file from the project's ``scripts`` directory with the current interpreter.

    The process runs with ``PROJECT_ROOT`` as its working directory and its
    output is captured as text. A non-zero exit status does not raise; callers
    inspect ``returncode`` on the returned object.
    """
    script_path = PROJECT_ROOT / "scripts" / script_name
    command = [sys.executable, str(script_path), *args]
    return subprocess.run(
        command,
        cwd=PROJECT_ROOT,
        check=False,
        text=True,
        capture_output=True,
    )
def test_batch_process_script_writes_flat_outputs_and_summary(tmp_path):
    """``batch_process.py`` writes fragment, error and summary files, and the summary
    counts one success and one failure for a valid and an ambiguous input."""
    good = build_macrolactone(14, {4: "methyl"})
    ambiguous_smiles = build_ambiguous_smiles()

    molecules_csv = tmp_path / "molecules.csv"
    fragments_csv = tmp_path / "fragments.csv"
    errors_csv = tmp_path / "errors.csv"
    summary_json = tmp_path / "summary.json"

    rows = [
        {"id": "valid_1", "smiles": good.smiles},
        {"id": "ambiguous_1", "smiles": ambiguous_smiles},
    ]
    pd.DataFrame(rows).to_csv(molecules_csv, index=False)

    script_args = [
        "--input", str(molecules_csv),
        "--output", str(fragments_csv),
        "--errors-output", str(errors_csv),
        "--summary-output", str(summary_json),
    ]
    completed = run_script("batch_process.py", *script_args)

    assert completed.returncode == 0, completed.stderr
    for produced in (fragments_csv, errors_csv, summary_json):
        assert produced.exists()

    summary = json.loads(summary_json.read_text(encoding="utf-8"))
    assert summary["processed"] == 2
    assert summary["successful"] == 1
    assert summary["failed"] == 1
def test_analyze_fragments_script_generates_reports_and_plot(tmp_path):
    """``analyze_fragments.py`` writes the statistics, property summary, plot and text summary."""
    molecule = build_macrolactone(16, {5: "methyl", 7: "ethyl"})
    fragmented = MacrolactoneFragmenter().fragment_molecule(molecule.smiles, parent_id="analysis_1")

    # Flatten to one row per fragment: parent columns first, then fragment fields.
    rows = []
    for fragment in fragmented.fragments:
        row = {
            "parent_id": fragmented.parent_id,
            "parent_smiles": fragmented.parent_smiles,
            "ring_size": fragmented.ring_size,
        }
        row.update(fragment.to_dict())
        rows.append(row)

    fragments_csv = tmp_path / "fragments.csv"
    report_dir = tmp_path / "analysis"
    pd.DataFrame(rows).to_csv(fragments_csv, index=False)

    completed = run_script(
        "analyze_fragments.py",
        "--input", str(fragments_csv),
        "--output-dir", str(report_dir),
    )

    assert completed.returncode == 0, completed.stderr
    expected_files = (
        "position_statistics.csv",
        "fragment_property_summary.csv",
        "position_frequencies.png",
        "analysis_summary.txt",
    )
    for filename in expected_files:
        assert (report_dir / filename).exists()
def test_generate_sdf_and_statistics_script_generates_artifacts(tmp_path):
    """``generate_sdf_and_statistics.py`` writes the cleavage statistics JSON and the parent's SDF."""
    molecule = build_macrolactone(16, {5: "methyl"})
    fragmented = MacrolactoneFragmenter().fragment_molecule(molecule.smiles, parent_id="sdf_1")

    def as_flat_row(fragment):
        # Parent columns first; fragment fields are merged on top of them.
        row = {
            "parent_id": fragmented.parent_id,
            "parent_smiles": fragmented.parent_smiles,
            "ring_size": fragmented.ring_size,
        }
        row.update(fragment.to_dict())
        return row

    table = pd.DataFrame([as_flat_row(fragment) for fragment in fragmented.fragments])

    fragments_csv = tmp_path / "fragments.csv"
    artifact_dir = tmp_path / "sdf_output"
    table.to_csv(fragments_csv, index=False)

    script_args = ("--input", str(fragments_csv), "--output-dir", str(artifact_dir))
    completed = run_script("generate_sdf_and_statistics.py", *script_args)

    assert completed.returncode == 0, completed.stderr
    assert (artifact_dir / "cleavage_position_statistics.json").exists()
    assert (artifact_dir / "sdf" / "sdf_1_3d.sdf").exists()
def test_active_text_assets_do_not_reference_legacy_api():
    """None of the files in ``ACTIVE_TEXT_ASSETS`` contains a legacy import or call pattern."""
    forbidden_patterns = (
        "from src.",
        "import src.",
        "process_csv(",
        "batch_to_dataframe(",
        "visualize_molecule(",
        "save_to_json(",
    )
    for path in ACTIVE_TEXT_ASSETS:
        text = path.read_text(encoding="utf-8")
        for pattern in forbidden_patterns:
            found = pattern in text
            assert not found, f"{path} still contains legacy reference: {pattern}"

View File

@@ -0,0 +1,171 @@
from __future__ import annotations
import json
import pandas as pd
import pytest
from macro_lactone_toolkit import MacroLactoneAnalyzer, MacrolactoneFragmenter
from .helpers import (
build_ambiguous_smiles,
build_macrolactone,
build_non_standard_ring_atom_macrolactone,
)
def test_visualization_exports_numbered_svg_and_png(tmp_path):
    """The visualization helpers return an SVG string and write non-empty PNG files."""
    from macro_lactone_toolkit.visualization import (
        numbered_molecule_svg,
        save_fragment_png,
        save_numbered_molecule_png,
    )

    molecule = build_macrolactone(16, {5: "methyl"})
    fragmented = MacrolactoneFragmenter().fragment_molecule(molecule.smiles, parent_id="viz_1")
    first_fragment = fragmented.fragments[0]

    assert "<svg" in numbered_molecule_svg(molecule.smiles)

    # (PNG writer, SMILES to draw, output file name)
    png_cases = (
        (save_numbered_molecule_png, molecule.smiles, "numbered.png"),
        (save_fragment_png, first_fragment.fragment_smiles_labeled, "fragment.png"),
    )
    for save_png, drawable, filename in png_cases:
        target = tmp_path / filename
        returned = save_png(drawable, target)
        assert returned == target
        assert target.exists()
        assert target.stat().st_size > 0
def test_visualization_supports_allowed_ring_atom_type_filtering():
    """An N-containing ring renders when N is allowed and raises ``ValueError`` when only C is."""
    from macro_lactone_toolkit.visualization import numbered_molecule_svg

    hetero_smiles = build_non_standard_ring_atom_macrolactone().smiles

    rendered = numbered_molecule_svg(hetero_smiles, allowed_ring_atom_types=["C", "N"])
    assert "<svg" in rendered

    with pytest.raises(ValueError, match="allowed ring atom types"):
        numbered_molecule_svg(hetero_smiles, allowed_ring_atom_types=["C"])
def test_fragment_csv_and_results_to_dataframe(tmp_path):
    """``fragment_csv`` results convert to a flat dataframe and serialize to JSON."""
    from macro_lactone_toolkit.workflows import fragment_csv, results_to_dataframe, write_result_json

    ring_14 = build_macrolactone(14, {4: "methyl"})
    ring_16 = build_macrolactone(16, {6: "ethyl"})
    molecules_csv = tmp_path / "molecules.csv"
    rows = [
        {"id": "mol_14", "smiles": ring_14.smiles},
        {"id": "mol_16", "smiles": ring_16.smiles},
    ]
    pd.DataFrame(rows).to_csv(molecules_csv, index=False)

    results = fragment_csv(str(molecules_csv))
    dataframe = results_to_dataframe(results)

    parent_ids = {result.parent_id for result in results}
    assert parent_ids == {"mol_14", "mol_16"}

    required_columns = {
        "parent_id",
        "parent_smiles",
        "ring_size",
        "fragment_id",
        "cleavage_position",
        "attachment_atom_idx",
        "fragment_smiles_labeled",
        "fragment_smiles_plain",
        "atom_count",
        "molecular_weight",
    }
    assert required_columns.issubset(dataframe.columns)

    json_target = tmp_path / "result.json"
    assert write_result_json(results[0], json_target) == json_target

    payload = json.loads(json_target.read_text(encoding="utf-8"))
    assert payload["parent_id"] in {"mol_14", "mol_16"}
    assert payload["fragments"]
def test_fragment_csv_raises_for_invalid_or_ambiguous_rows(tmp_path):
    """``fragment_csv`` raises when the input CSV contains an ambiguous molecule."""
    from macro_lactone_toolkit.workflows import fragment_csv

    standard = build_macrolactone(14)
    rows = [
        {"id": "valid_1", "smiles": standard.smiles},
        {"id": "ambiguous_1", "smiles": build_ambiguous_smiles()},
    ]
    molecules_csv = tmp_path / "molecules.csv"
    pd.DataFrame(rows).to_csv(molecules_csv, index=False)

    expected_message = "ambiguous|Multiple valid macrolactone candidates"
    with pytest.raises(Exception, match=expected_message):
        fragment_csv(str(molecules_csv))
def test_export_numbered_macrolactone_csv_writes_status_and_images(tmp_path):
    """The numbered export marks both rows as successful and records an existing image for each."""
    from macro_lactone_toolkit.workflows import export_numbered_macrolactone_csv

    standard = build_macrolactone(14)
    hetero = build_non_standard_ring_atom_macrolactone()
    molecules_csv = tmp_path / "molecules.csv"
    export_dir = tmp_path / "numbered"
    rows = [
        {"id": "valid_1", "smiles": standard.smiles},
        {"id": "hetero_1", "smiles": hetero.smiles},
    ]
    pd.DataFrame(rows).to_csv(molecules_csv, index=False)

    # N is allowed so that the heteroatom ring is exported as well.
    csv_path = export_numbered_macrolactone_csv(
        str(molecules_csv),
        output_dir=export_dir,
        allowed_ring_atom_types=["C", "N"],
    )
    exported = pd.read_csv(csv_path)

    expected_columns = {
        "parent_id",
        "status",
        "image_path",
        "classification",
        "primary_reason_code",
        "ring_size",
    }
    assert expected_columns.issubset(exported.columns)
    assert set(exported["parent_id"]) == {"valid_1", "hetero_1"}
    assert set(exported["status"]) == {"success"}
    for image_path in exported["image_path"]:
        assert image_path
        assert (tmp_path / image_path).exists()
def test_analyzer_bulk_helpers():
    """Exercise the analyzer's bulk summary, dataframe grouping, SMARTS match and property helpers."""
    standard = build_macrolactone(12)
    hetero = build_non_standard_ring_atom_macrolactone()
    carbocycle_smiles = "C1CCCCCCCCCCC1"
    all_smiles = [standard.smiles, hetero.smiles, carbocycle_smiles]
    molecule_ids = ["valid_1", "hetero_1", "plain_1"]
    dataframe = pd.DataFrame(
        [{"id": molecule_id, "smiles": smiles} for molecule_id, smiles in zip(molecule_ids, all_smiles)]
    )

    analyzer = MacroLactoneAnalyzer()
    summary = analyzer.analyze_many(all_smiles)
    ring_size_groups, rejected = analyzer.classify_dataframe(dataframe)
    smarts_match = analyzer.match_dynamic_smarts(standard.smiles, ring_size=12)
    properties = analyzer.calculate_properties(standard.smiles)

    assert summary["total"] == 3
    counts = summary["classification_counts"]
    assert counts["standard_macrolactone"] == 1
    assert counts["non_standard_macrocycle"] == 1
    assert counts["not_macrolactone"] == 1

    assert 12 in ring_size_groups
    assert list(ring_size_groups[12]["id"]) == ["valid_1"]
    assert set(rejected["classification"]) == {"non_standard_macrocycle", "not_macrolactone"}

    assert smarts_match is not None
    assert properties is not None
    assert {"molecular_weight", "logp", "qed", "tpsa"}.issubset(properties)