feat(toolkit): add classification and migration

Implement the standard/non-standard/not-macrolactone classification layer
and integrate it into analyzer, fragmenter, and CLI outputs.

Port the remaining legacy package capabilities into new visualization and
workflow modules, restore batch/statistics/SDF scripts on top of the flat
CSV workflow, and update active docs to the new package API.
This commit is contained in:
2026-03-18 23:56:41 +08:00
parent 9ccbcfcd04
commit c0ead42384
24 changed files with 1497 additions and 313 deletions

View File

@@ -0,0 +1,149 @@
from __future__ import annotations
import json
import subprocess
import sys
from pathlib import Path
import pandas as pd
from macro_lactone_toolkit import MacrolactoneFragmenter
from .helpers import build_ambiguous_smiles, build_macrolactone
# Repository root, resolved relative to this test module's location.
PROJECT_ROOT = Path(__file__).resolve().parents[1]

# Docs/READMEs that are still "live" and must track the current package API.
ACTIVE_TEXT_ASSETS = [
    PROJECT_ROOT.joinpath(*parts)
    for parts in (
        ("scripts", "README.md"),
        ("docs", "SUMMARY.md"),
        ("docs", "project-docs", "QUICK_COMMANDS.md"),
        ("notebooks", "README_analyze_ring16.md"),
    )
]
def run_script(script_name: str, *args: str) -> subprocess.CompletedProcess[str]:
    """Run a project script under the current interpreter and capture its output.

    The script is located under ``scripts/`` relative to the repository root,
    executed with ``cwd=PROJECT_ROOT``, and never raises on a non-zero exit
    (``check=False``) so tests can assert on the return code themselves.
    """
    command = [sys.executable, str(PROJECT_ROOT / "scripts" / script_name)]
    command.extend(args)
    return subprocess.run(
        command,
        capture_output=True,
        text=True,
        check=False,
        cwd=PROJECT_ROOT,
    )
def test_batch_process_script_writes_flat_outputs_and_summary(tmp_path):
    """batch_process.py writes fragment CSV, error CSV, and a JSON summary."""
    good = build_macrolactone(14, {4: "methyl"})
    bad_smiles = build_ambiguous_smiles()
    paths = {
        key: tmp_path / filename
        for key, filename in (
            ("input", "molecules.csv"),
            ("output", "fragments.csv"),
            ("errors", "errors.csv"),
            ("summary", "summary.json"),
        )
    }
    rows = [
        {"id": "valid_1", "smiles": good.smiles},
        {"id": "ambiguous_1", "smiles": bad_smiles},
    ]
    pd.DataFrame(rows).to_csv(paths["input"], index=False)
    completed = run_script(
        "batch_process.py",
        "--input",
        str(paths["input"]),
        "--output",
        str(paths["output"]),
        "--errors-output",
        str(paths["errors"]),
        "--summary-output",
        str(paths["summary"]),
    )
    assert completed.returncode == 0, completed.stderr
    # All three artifacts must exist even when some molecules fail.
    for key in ("output", "errors", "summary"):
        assert paths[key].exists()
    summary = json.loads(paths["summary"].read_text(encoding="utf-8"))
    assert summary["processed"] == 2
    assert summary["successful"] == 1
    assert summary["failed"] == 1
def test_analyze_fragments_script_generates_reports_and_plot(tmp_path):
    """analyze_fragments.py produces CSV reports, a PNG plot, and a text summary."""
    molecule = build_macrolactone(16, {5: "methyl", 7: "ethyl"})
    result = MacrolactoneFragmenter().fragment_molecule(molecule.smiles, parent_id="analysis_1")
    # Flatten parent metadata alongside each fragment's own fields.
    parent_columns = {
        "parent_id": result.parent_id,
        "parent_smiles": result.parent_smiles,
        "ring_size": result.ring_size,
    }
    frame = pd.DataFrame([{**parent_columns, **frag.to_dict()} for frag in result.fragments])
    fragments_csv = tmp_path / "fragments.csv"
    analysis_dir = tmp_path / "analysis"
    frame.to_csv(fragments_csv, index=False)
    completed = run_script(
        "analyze_fragments.py",
        "--input",
        str(fragments_csv),
        "--output-dir",
        str(analysis_dir),
    )
    assert completed.returncode == 0, completed.stderr
    expected_artifacts = (
        "position_statistics.csv",
        "fragment_property_summary.csv",
        "position_frequencies.png",
        "analysis_summary.txt",
    )
    for artifact in expected_artifacts:
        assert (analysis_dir / artifact).exists()
def test_generate_sdf_and_statistics_script_generates_artifacts(tmp_path):
    """generate_sdf_and_statistics.py emits cleavage stats JSON and a 3D SDF."""
    molecule = build_macrolactone(16, {5: "methyl"})
    result = MacrolactoneFragmenter().fragment_molecule(molecule.smiles, parent_id="sdf_1")
    # Flatten parent metadata alongside each fragment's own fields.
    parent_columns = {
        "parent_id": result.parent_id,
        "parent_smiles": result.parent_smiles,
        "ring_size": result.ring_size,
    }
    frame = pd.DataFrame([{**parent_columns, **frag.to_dict()} for frag in result.fragments])
    fragments_csv = tmp_path / "fragments.csv"
    sdf_dir = tmp_path / "sdf_output"
    frame.to_csv(fragments_csv, index=False)
    completed = run_script(
        "generate_sdf_and_statistics.py",
        "--input",
        str(fragments_csv),
        "--output-dir",
        str(sdf_dir),
    )
    assert completed.returncode == 0, completed.stderr
    assert (sdf_dir / "cleavage_position_statistics.json").exists()
    assert (sdf_dir / "sdf" / "sdf_1_3d.sdf").exists()
def test_active_text_assets_do_not_reference_legacy_api():
    """Live docs must not mention the retired ``src.``-era API surface."""
    legacy_markers = (
        "from src.",
        "import src.",
        "process_csv(",
        "batch_to_dataframe(",
        "visualize_molecule(",
        "save_to_json(",
    )
    for path in ACTIVE_TEXT_ASSETS:
        text = path.read_text(encoding="utf-8")
        for pattern in legacy_markers:
            assert pattern not in text, f"{path} still contains legacy reference: {pattern}"