Implement the standard/non-standard/not-macrolactone classification layer and integrate it into analyzer, fragmenter, and CLI outputs. Port the remaining legacy package capabilities into new visualization and workflow modules, restore batch/statistics/SDF scripts on top of the flat CSV workflow, and update active docs to the new package API.
150 lines
4.5 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
|
|
from macro_lactone_toolkit import MacrolactoneFragmenter
|
|
|
|
from .helpers import build_ambiguous_smiles, build_macrolactone
|
|
|
|
|
|
# Repository root: this test module lives one directory below it.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
# Actively maintained documentation files that must reference only the new
# package API (checked by test_active_text_assets_do_not_reference_legacy_api).
ACTIVE_TEXT_ASSETS = [
    PROJECT_ROOT / "scripts" / "README.md",
    PROJECT_ROOT / "docs" / "SUMMARY.md",
    PROJECT_ROOT / "docs" / "project-docs" / "QUICK_COMMANDS.md",
    PROJECT_ROOT / "notebooks" / "README_analyze_ring16.md",
]
|
|
|
|
|
|
def run_script(script_name: str, *args: str) -> subprocess.CompletedProcess[str]:
    """Run a project script in a subprocess and capture its textual output.

    The script is executed with the current interpreter from the repository
    root. A non-zero exit status is returned rather than raised
    (``check=False``) so individual tests can assert on it explicitly.
    """
    command = [sys.executable, str(PROJECT_ROOT / "scripts" / script_name)]
    command.extend(args)
    return subprocess.run(
        command,
        capture_output=True,
        text=True,
        check=False,
        cwd=PROJECT_ROOT,
    )
|
|
|
|
|
|
def test_batch_process_script_writes_flat_outputs_and_summary(tmp_path):
    """batch_process.py emits fragment/error CSVs plus a JSON run summary."""
    valid = build_macrolactone(14, {4: "methyl"})
    ambiguous = build_ambiguous_smiles()

    input_path = tmp_path / "molecules.csv"
    output_path = tmp_path / "fragments.csv"
    errors_path = tmp_path / "errors.csv"
    summary_path = tmp_path / "summary.json"

    # One parseable macrolactone and one ambiguous structure: expect exactly
    # one success and one failure downstream.
    rows = [
        {"id": "valid_1", "smiles": valid.smiles},
        {"id": "ambiguous_1", "smiles": ambiguous},
    ]
    pd.DataFrame(rows).to_csv(input_path, index=False)

    completed = run_script(
        "batch_process.py",
        "--input", str(input_path),
        "--output", str(output_path),
        "--errors-output", str(errors_path),
        "--summary-output", str(summary_path),
    )

    assert completed.returncode == 0, completed.stderr
    for artifact in (output_path, errors_path, summary_path):
        assert artifact.exists()

    summary = json.loads(summary_path.read_text(encoding="utf-8"))
    assert summary["processed"] == 2
    assert summary["successful"] == 1
    assert summary["failed"] == 1
|
|
|
|
|
|
def test_analyze_fragments_script_generates_reports_and_plot(tmp_path):
    """analyze_fragments.py writes statistics CSVs, a plot, and a text summary."""
    built = build_macrolactone(16, {5: "methyl", 7: "ethyl"})
    result = MacrolactoneFragmenter().fragment_molecule(built.smiles, parent_id="analysis_1")

    # Flatten the fragmentation result into one CSV row per fragment.
    records = []
    for fragment in result.fragments:
        record = {
            "parent_id": result.parent_id,
            "parent_smiles": result.parent_smiles,
            "ring_size": result.ring_size,
        }
        record.update(fragment.to_dict())
        records.append(record)
    fragments = pd.DataFrame(records)

    input_path = tmp_path / "fragments.csv"
    output_dir = tmp_path / "analysis"
    fragments.to_csv(input_path, index=False)

    completed = run_script(
        "analyze_fragments.py",
        "--input", str(input_path),
        "--output-dir", str(output_dir),
    )

    assert completed.returncode == 0, completed.stderr
    expected_artifacts = (
        "position_statistics.csv",
        "fragment_property_summary.csv",
        "position_frequencies.png",
        "analysis_summary.txt",
    )
    for name in expected_artifacts:
        assert (output_dir / name).exists()
|
|
|
|
|
|
def test_generate_sdf_and_statistics_script_generates_artifacts(tmp_path):
    """generate_sdf_and_statistics.py writes cleavage stats and a 3D SDF file."""
    built = build_macrolactone(16, {5: "methyl"})
    result = MacrolactoneFragmenter().fragment_molecule(built.smiles, parent_id="sdf_1")

    # Flatten the fragmentation result into one CSV row per fragment.
    records = []
    for fragment in result.fragments:
        record = {
            "parent_id": result.parent_id,
            "parent_smiles": result.parent_smiles,
            "ring_size": result.ring_size,
        }
        record.update(fragment.to_dict())
        records.append(record)
    fragments = pd.DataFrame(records)

    input_path = tmp_path / "fragments.csv"
    output_dir = tmp_path / "sdf_output"
    fragments.to_csv(input_path, index=False)

    completed = run_script(
        "generate_sdf_and_statistics.py",
        "--input", str(input_path),
        "--output-dir", str(output_dir),
    )

    assert completed.returncode == 0, completed.stderr
    assert (output_dir / "cleavage_position_statistics.json").exists()
    assert (output_dir / "sdf" / "sdf_1_3d.sdf").exists()
|
|
|
|
|
|
def test_active_text_assets_do_not_reference_legacy_api():
    """No actively maintained doc may mention the removed legacy package API."""
    forbidden_patterns = (
        "from src.",
        "import src.",
        "process_csv(",
        "batch_to_dataframe(",
        "visualize_molecule(",
        "save_to_json(",
    )

    for path in ACTIVE_TEXT_ASSETS:
        contents = path.read_text(encoding="utf-8")
        for pattern in forbidden_patterns:
            # Plain substring match: these tokens should not appear anywhere.
            assert pattern not in contents, f"{path} still contains legacy reference: {pattern}"
|