Implement the standard/non-standard/not-macrolactone classification layer and integrate it into analyzer, fragmenter, and CLI outputs. Port the remaining legacy package capabilities into new visualization and workflow modules, restore batch/statistics/SDF scripts on top of the flat CSV workflow, and update active docs to the new package API.
132 lines
4.4 KiB
Python
132 lines
4.4 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
import subprocess
|
|
import sys
|
|
|
|
import pandas as pd
|
|
|
|
from .helpers import (
|
|
build_ambiguous_smiles,
|
|
build_macrolactone,
|
|
build_non_standard_ring_atom_macrolactone,
|
|
build_overlapping_candidate_macrolactone,
|
|
)
|
|
|
|
|
|
def run_cli(*args: str) -> subprocess.CompletedProcess[str]:
    """Run the toolkit CLI module in a child interpreter.

    Args:
        *args: Command-line tokens appended after
            ``<python> -m macro_lactone_toolkit.cli``.

    Returns:
        The completed process, with stdout and stderr captured as text.
        A non-zero exit status does not raise (``check=False``); callers
        inspect ``returncode`` themselves.
    """
    # Use the interpreter running the tests so the child sees the same environment.
    command = [sys.executable, "-m", "macro_lactone_toolkit.cli"]
    command.extend(args)
    return subprocess.run(command, capture_output=True, text=True, check=False)
|
|
|
|
|
|
def test_cli_smoke_commands():
    """Smoke-test the single-molecule ``analyze``, ``number`` and ``fragment`` commands.

    Each command is run on the same built 16-membered molecule, must exit
    with status 0, and must print a JSON document on stdout.
    """
    molecule = build_macrolactone(16, {5: "methyl"})

    def invoke_json(*cli_args: str):
        # Run one CLI command, require success, and decode its JSON stdout.
        proc = run_cli(*cli_args)
        assert proc.returncode == 0, proc.stderr
        return json.loads(proc.stdout)

    # --- analyze ---
    analysis = invoke_json("analyze", "--smiles", molecule.smiles)
    assert analysis["classification"] == "standard_macrolactone"
    assert analysis["ring_size"] == 16
    assert analysis["primary_reason_code"] is None
    assert analysis["candidate_ring_sizes"] == [16]

    # --- number ---
    numbering = invoke_json("number", "--smiles", molecule.smiles)
    assert numbering["ring_size"] == 16
    # Position keys are read as strings from the decoded JSON.
    assert numbering["position_to_atom"]["1"] >= 0

    # --- fragment ---
    fragmentation = invoke_json(
        "fragment", "--smiles", molecule.smiles, "--parent-id", "cli_1"
    )
    assert fragmentation["parent_id"] == "cli_1"
    assert fragmentation["ring_size"] == 16
    assert fragmentation["fragments"][0]["fragment_smiles_labeled"]
|
|
|
|
|
|
def test_cli_analyze_reports_non_standard_classifications():
    """Check that ``analyze`` reports the non-standard classification and reason code.

    Two built molecules are analyzed: one from the non-standard-ring-atom
    builder and one from the overlapping-candidate builder. Both commands
    must exit with status 0 and report ``non_standard_macrocycle`` with the
    matching ``primary_reason_code``.
    """
    ring_hetero_mol = build_non_standard_ring_atom_macrolactone()
    overlapping_mol = build_overlapping_candidate_macrolactone()

    def analyze_json(smiles: str):
        # Run ``analyze`` on one SMILES, require success, decode the JSON stdout.
        proc = run_cli("analyze", "--smiles", smiles)
        assert proc.returncode == 0, proc.stderr
        return json.loads(proc.stdout)

    hetero_report = analyze_json(ring_hetero_mol.smiles)
    assert hetero_report["classification"] == "non_standard_macrocycle"
    assert (
        hetero_report["primary_reason_code"]
        == "contains_non_carbon_ring_atoms_outside_positions_1_2"
    )
    assert hetero_report["ring_size"] == 16

    overlap_report = analyze_json(overlapping_mol.smiles)
    assert overlap_report["classification"] == "non_standard_macrocycle"
    assert (
        overlap_report["primary_reason_code"]
        == "multiple_overlapping_macrocycle_candidates"
    )
    assert overlap_report["ring_size"] == 12
|
|
|
|
|
|
def test_cli_analyze_csv_reports_classification_fields(tmp_path):
    """Check that batch ``analyze`` writes the classification columns to its output CSV.

    Args:
        tmp_path: pytest-provided temporary directory holding the input
            and output CSV files.
    """
    standard_mol = build_macrolactone(14)
    ring_hetero_mol = build_non_standard_ring_atom_macrolactone()
    csv_in = tmp_path / "molecules.csv"
    csv_out = tmp_path / "analysis.csv"

    # Two-row input: one molecule from each builder.
    rows = [
        {"id": "valid_1", "smiles": standard_mol.smiles},
        {"id": "hetero_1", "smiles": ring_hetero_mol.smiles},
    ]
    input_frame = pd.DataFrame(rows)
    input_frame.to_csv(csv_in, index=False)

    cli_args = ["analyze", "--input", str(csv_in), "--output", str(csv_out)]
    proc = run_cli(*cli_args)

    assert proc.returncode == 0, proc.stderr

    report = pd.read_csv(csv_out)
    # The input ``id`` values are expected in the output ``parent_id`` column.
    assert set(report["parent_id"]) == {"valid_1", "hetero_1"}
    assert set(report["classification"]) == {"standard_macrolactone", "non_standard_macrocycle"}
    assert "primary_reason_code" in report.columns
    assert "ring_size" in report.columns
|
|
|
|
|
|
def test_cli_fragment_csv_skips_ambiguous_and_records_errors(tmp_path):
    """Check that batch ``fragment`` skips an ambiguous molecule and logs it as an error.

    The command must exit with status 0, write fragments only for the valid
    molecule, and record the ambiguous one in the errors CSV.

    Args:
        tmp_path: pytest-provided temporary directory holding the input,
            fragment-output and error-output CSV files.
    """
    standard_mol = build_macrolactone(14, {4: "methyl"})
    ambiguous_smiles = build_ambiguous_smiles()
    csv_in = tmp_path / "molecules.csv"
    fragments_csv = tmp_path / "fragments.csv"
    errors_csv = tmp_path / "errors.csv"

    # Two-row input: one valid molecule, one ambiguous SMILES string.
    rows = [
        {"id": "valid_1", "smiles": standard_mol.smiles},
        {"id": "ambiguous_1", "smiles": ambiguous_smiles},
    ]
    input_frame = pd.DataFrame(rows)
    input_frame.to_csv(csv_in, index=False)

    cli_args = [
        "fragment",
        "--input",
        str(csv_in),
        "--output",
        str(fragments_csv),
        "--errors-output",
        str(errors_csv),
    ]
    proc = run_cli(*cli_args)

    # The ambiguous row must not fail the whole batch.
    assert proc.returncode == 0, proc.stderr

    fragment_table = pd.read_csv(fragments_csv)
    error_table = pd.read_csv(errors_csv)

    assert set(fragment_table["parent_id"]) == {"valid_1"}
    assert error_table.loc[0, "parent_id"] == "ambiguous_1"
    assert error_table.loc[0, "error_type"] == "AmbiguousMacrolactoneError"
|