feat(toolkit): add classification and migration

Implement the standard/non-standard/not-macrolactone classification layer
and integrate it into analyzer, fragmenter, and CLI outputs.

Port the remaining legacy package capabilities into new visualization and
workflow modules, restore batch/statistics/SDF scripts on top of the flat
CSV workflow, and update active docs to the new package API.
This commit is contained in:
2026-03-18 23:56:41 +08:00
parent 9ccbcfcd04
commit c0ead42384
24 changed files with 1497 additions and 313 deletions

View File

@@ -0,0 +1,149 @@
from __future__ import annotations
import json
import subprocess
import sys
from pathlib import Path
import pandas as pd
from macro_lactone_toolkit import MacrolactoneFragmenter
from .helpers import build_ambiguous_smiles, build_macrolactone
# Repository root, resolved relative to this test module's location.
PROJECT_ROOT = Path(__file__).resolve().parents[1]

# Docs/READMEs that are still "live" and must track the current package API.
ACTIVE_TEXT_ASSETS = [
    PROJECT_ROOT.joinpath(*parts)
    for parts in (
        ("scripts", "README.md"),
        ("docs", "SUMMARY.md"),
        ("docs", "project-docs", "QUICK_COMMANDS.md"),
        ("notebooks", "README_analyze_ring16.md"),
    )
]
def run_script(script_name: str, *args: str) -> subprocess.CompletedProcess[str]:
    """Run a project script under the current interpreter and capture its output.

    The script is located under ``scripts/`` relative to the repository root,
    executed with ``cwd=PROJECT_ROOT``, and never raises on a non-zero exit
    (``check=False``) so tests can assert on the return code themselves.
    """
    command = [sys.executable, str(PROJECT_ROOT / "scripts" / script_name)]
    command.extend(args)
    return subprocess.run(
        command,
        capture_output=True,
        text=True,
        check=False,
        cwd=PROJECT_ROOT,
    )
def test_batch_process_script_writes_flat_outputs_and_summary(tmp_path):
    """batch_process.py writes fragment CSV, error CSV, and a JSON summary."""
    good = build_macrolactone(14, {4: "methyl"})
    bad_smiles = build_ambiguous_smiles()
    paths = {
        key: tmp_path / filename
        for key, filename in (
            ("input", "molecules.csv"),
            ("output", "fragments.csv"),
            ("errors", "errors.csv"),
            ("summary", "summary.json"),
        )
    }
    rows = [
        {"id": "valid_1", "smiles": good.smiles},
        {"id": "ambiguous_1", "smiles": bad_smiles},
    ]
    pd.DataFrame(rows).to_csv(paths["input"], index=False)
    completed = run_script(
        "batch_process.py",
        "--input",
        str(paths["input"]),
        "--output",
        str(paths["output"]),
        "--errors-output",
        str(paths["errors"]),
        "--summary-output",
        str(paths["summary"]),
    )
    assert completed.returncode == 0, completed.stderr
    # All three artifacts must exist even when some molecules fail.
    for key in ("output", "errors", "summary"):
        assert paths[key].exists()
    summary = json.loads(paths["summary"].read_text(encoding="utf-8"))
    assert summary["processed"] == 2
    assert summary["successful"] == 1
    assert summary["failed"] == 1
def test_analyze_fragments_script_generates_reports_and_plot(tmp_path):
    """analyze_fragments.py produces CSV reports, a PNG plot, and a text summary."""
    molecule = build_macrolactone(16, {5: "methyl", 7: "ethyl"})
    result = MacrolactoneFragmenter().fragment_molecule(molecule.smiles, parent_id="analysis_1")
    # Flatten parent metadata alongside each fragment's own fields.
    parent_columns = {
        "parent_id": result.parent_id,
        "parent_smiles": result.parent_smiles,
        "ring_size": result.ring_size,
    }
    frame = pd.DataFrame([{**parent_columns, **frag.to_dict()} for frag in result.fragments])
    fragments_csv = tmp_path / "fragments.csv"
    analysis_dir = tmp_path / "analysis"
    frame.to_csv(fragments_csv, index=False)
    completed = run_script(
        "analyze_fragments.py",
        "--input",
        str(fragments_csv),
        "--output-dir",
        str(analysis_dir),
    )
    assert completed.returncode == 0, completed.stderr
    expected_artifacts = (
        "position_statistics.csv",
        "fragment_property_summary.csv",
        "position_frequencies.png",
        "analysis_summary.txt",
    )
    for artifact in expected_artifacts:
        assert (analysis_dir / artifact).exists()
def test_generate_sdf_and_statistics_script_generates_artifacts(tmp_path):
    """generate_sdf_and_statistics.py emits cleavage stats JSON and a 3D SDF."""
    molecule = build_macrolactone(16, {5: "methyl"})
    result = MacrolactoneFragmenter().fragment_molecule(molecule.smiles, parent_id="sdf_1")
    # Flatten parent metadata alongside each fragment's own fields.
    parent_columns = {
        "parent_id": result.parent_id,
        "parent_smiles": result.parent_smiles,
        "ring_size": result.ring_size,
    }
    frame = pd.DataFrame([{**parent_columns, **frag.to_dict()} for frag in result.fragments])
    fragments_csv = tmp_path / "fragments.csv"
    sdf_dir = tmp_path / "sdf_output"
    frame.to_csv(fragments_csv, index=False)
    completed = run_script(
        "generate_sdf_and_statistics.py",
        "--input",
        str(fragments_csv),
        "--output-dir",
        str(sdf_dir),
    )
    assert completed.returncode == 0, completed.stderr
    assert (sdf_dir / "cleavage_position_statistics.json").exists()
    assert (sdf_dir / "sdf" / "sdf_1_3d.sdf").exists()
def test_active_text_assets_do_not_reference_legacy_api():
    """Live docs must not mention the retired ``src.``-era API surface."""
    legacy_markers = (
        "from src.",
        "import src.",
        "process_csv(",
        "batch_to_dataframe(",
        "visualize_molecule(",
        "save_to_json(",
    )
    for path in ACTIVE_TEXT_ASSETS:
        text = path.read_text(encoding="utf-8")
        for pattern in legacy_markers:
            assert pattern not in text, f"{path} still contains legacy reference: {pattern}"