feat(toolkit): add classification and migration
Implement the standard/non-standard/not-macrolactone classification layer and integrate it into analyzer, fragmenter, and CLI outputs. Port the remaining legacy package capabilities into new visualization and workflow modules, restore batch/statistics/SDF scripts on top of the flat CSV workflow, and update active docs to the new package API.
This commit is contained in:
149
tests/test_scripts_and_docs.py
Normal file
149
tests/test_scripts_and_docs.py
Normal file
@@ -0,0 +1,149 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from macro_lactone_toolkit import MacrolactoneFragmenter
|
||||
|
||||
from .helpers import build_ambiguous_smiles, build_macrolactone
|
||||
|
||||
|
||||
# Repository root: this test file lives in <root>/tests/, so one level up.
PROJECT_ROOT = Path(__file__).resolve().parents[1]

# Actively maintained text assets that must only reference the new package API.
_ASSET_PARTS = [
    ("scripts", "README.md"),
    ("docs", "SUMMARY.md"),
    ("docs", "project-docs", "QUICK_COMMANDS.md"),
    ("notebooks", "README_analyze_ring16.md"),
]
ACTIVE_TEXT_ASSETS = [PROJECT_ROOT.joinpath(*parts) for parts in _ASSET_PARTS]
|
||||
|
||||
|
||||
def run_script(script_name: str, *args: str) -> subprocess.CompletedProcess[str]:
    """Run a repository script under the current interpreter and capture its output.

    The script is resolved inside the project's ``scripts`` directory and executed
    with ``cwd`` set to the project root so relative paths behave as in real use.
    ``check=False`` lets each test assert on the return code itself.
    """
    command = [sys.executable, str(PROJECT_ROOT / "scripts" / script_name)]
    command.extend(args)
    return subprocess.run(
        command,
        capture_output=True,
        text=True,
        check=False,
        cwd=PROJECT_ROOT,
    )
|
||||
|
||||
|
||||
def test_batch_process_script_writes_flat_outputs_and_summary(tmp_path):
    """batch_process.py must emit fragment/error CSVs plus a JSON run summary."""
    valid = build_macrolactone(14, {4: "methyl"})
    ambiguous = build_ambiguous_smiles()

    input_path = tmp_path / "molecules.csv"
    output_path = tmp_path / "fragments.csv"
    errors_path = tmp_path / "errors.csv"
    summary_path = tmp_path / "summary.json"

    # One molecule that fragments cleanly, one that should land in errors.csv.
    rows = [
        {"id": "valid_1", "smiles": valid.smiles},
        {"id": "ambiguous_1", "smiles": ambiguous},
    ]
    pd.DataFrame(rows).to_csv(input_path, index=False)

    completed = run_script(
        "batch_process.py",
        "--input", str(input_path),
        "--output", str(output_path),
        "--errors-output", str(errors_path),
        "--summary-output", str(summary_path),
    )

    assert completed.returncode == 0, completed.stderr
    for artifact in (output_path, errors_path, summary_path):
        assert artifact.exists()

    summary = json.loads(summary_path.read_text(encoding="utf-8"))
    assert summary["processed"] == 2
    assert summary["successful"] == 1
    assert summary["failed"] == 1
|
||||
|
||||
|
||||
def test_analyze_fragments_script_generates_reports_and_plot(tmp_path):
    """analyze_fragments.py must emit statistics CSVs, a PNG plot, and a text summary."""
    built = build_macrolactone(16, {5: "methyl", 7: "ethyl"})
    result = MacrolactoneFragmenter().fragment_molecule(built.smiles, parent_id="analysis_1")

    # Flatten each fragment into a row alongside its parent metadata, matching
    # the flat CSV layout the script consumes.
    parent_columns = {
        "parent_id": result.parent_id,
        "parent_smiles": result.parent_smiles,
        "ring_size": result.ring_size,
    }
    fragments = pd.DataFrame(
        [{**parent_columns, **fragment.to_dict()} for fragment in result.fragments]
    )

    input_path = tmp_path / "fragments.csv"
    output_dir = tmp_path / "analysis"
    fragments.to_csv(input_path, index=False)

    completed = run_script(
        "analyze_fragments.py",
        "--input", str(input_path),
        "--output-dir", str(output_dir),
    )

    assert completed.returncode == 0, completed.stderr
    for artifact_name in (
        "position_statistics.csv",
        "fragment_property_summary.csv",
        "position_frequencies.png",
        "analysis_summary.txt",
    ):
        assert (output_dir / artifact_name).exists()
|
||||
|
||||
|
||||
def test_generate_sdf_and_statistics_script_generates_artifacts(tmp_path):
    """generate_sdf_and_statistics.py must emit the stats JSON and a 3D SDF file."""
    built = build_macrolactone(16, {5: "methyl"})
    result = MacrolactoneFragmenter().fragment_molecule(built.smiles, parent_id="sdf_1")

    # Same flat-row layout the batch pipeline writes: parent metadata + fragment dict.
    parent_columns = {
        "parent_id": result.parent_id,
        "parent_smiles": result.parent_smiles,
        "ring_size": result.ring_size,
    }
    fragments = pd.DataFrame(
        [{**parent_columns, **fragment.to_dict()} for fragment in result.fragments]
    )

    input_path = tmp_path / "fragments.csv"
    output_dir = tmp_path / "sdf_output"
    fragments.to_csv(input_path, index=False)

    completed = run_script(
        "generate_sdf_and_statistics.py",
        "--input", str(input_path),
        "--output-dir", str(output_dir),
    )

    assert completed.returncode == 0, completed.stderr
    assert (output_dir / "cleavage_position_statistics.json").exists()
    assert (output_dir / "sdf" / "sdf_1_3d.sdf").exists()
|
||||
|
||||
|
||||
def test_active_text_assets_do_not_reference_legacy_api():
    """None of the maintained docs may mention the retired src-package API."""
    forbidden_patterns = (
        "from src.",
        "import src.",
        "process_csv(",
        "batch_to_dataframe(",
        "visualize_molecule(",
        "save_to_json(",
    )

    for path in ACTIVE_TEXT_ASSETS:
        text = path.read_text(encoding="utf-8")
        for pattern in forbidden_patterns:
            assert pattern not in text, f"{path} still contains legacy reference: {pattern}"
|
||||
Reference in New Issue
Block a user