Files
macrolactone-toolkit/tests/test_scripts_and_docs.py
lingyuzeng 3e07402f4e feat(numbering): publish canonical numbering API
Add a public numbering module and route fragmenting, validation,
and scaffold preparation through the canonical numbering entry.

Rewrite the repository entry docs around the fixed numbering
contract, add MkDocs landing pages, and document the mirror
mapping used for medicinal-chemistry comparisons.

Also refresh the validation analysis reports to explain the
canonical-versus-mirrored numbering relationship.
2026-03-20 15:14:31 +08:00

308 lines
11 KiB
Python

from __future__ import annotations
import json
import sqlite3
import subprocess
import sys
from pathlib import Path
import pandas as pd
from macro_lactone_toolkit import MacrolactoneFragmenter
from .helpers import build_ambiguous_smiles, build_macrolactone
# Repository root: this test file lives in tests/, one level below the checkout.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
# Actively maintained documentation files; the legacy-API scan at the bottom of
# this module asserts that none of them mention the retired interface.
ACTIVE_TEXT_ASSETS = [
PROJECT_ROOT / "scripts" / "README.md",
PROJECT_ROOT / "docs" / "SUMMARY.md",
PROJECT_ROOT / "docs" / "project-docs" / "QUICK_COMMANDS.md",
PROJECT_ROOT / "notebooks" / "README_analyze_ring16.md",
]
def run_script(script_name: str, *args: str) -> subprocess.CompletedProcess[str]:
    """Run a repository script in a subprocess and capture its output.

    The script is resolved under ``scripts/`` relative to the project root,
    executed with the current interpreter, and never raises on a non-zero
    exit status (``check=False``) so callers can assert on the result.
    """
    script_path = PROJECT_ROOT / "scripts" / script_name
    command = [sys.executable, str(script_path), *args]
    return subprocess.run(
        command,
        capture_output=True,
        text=True,
        check=False,
        cwd=PROJECT_ROOT,
    )
def test_batch_process_script_writes_flat_outputs_and_summary(tmp_path):
    """batch_process.py writes fragment/error CSVs plus a JSON run summary."""
    input_path = tmp_path / "molecules.csv"
    output_path = tmp_path / "fragments.csv"
    errors_path = tmp_path / "errors.csv"
    summary_path = tmp_path / "summary.json"

    # One fragmentable macrolactone plus one molecule expected to fail,
    # so both the success and error paths are exercised.
    records = [
        {"id": "valid_1", "smiles": build_macrolactone(14, {4: "methyl"}).smiles},
        {"id": "ambiguous_1", "smiles": build_ambiguous_smiles()},
    ]
    pd.DataFrame(records).to_csv(input_path, index=False)

    completed = run_script(
        "batch_process.py",
        "--input", str(input_path),
        "--output", str(output_path),
        "--errors-output", str(errors_path),
        "--summary-output", str(summary_path),
    )
    assert completed.returncode == 0, completed.stderr

    for artifact in (output_path, errors_path, summary_path):
        assert artifact.exists()

    summary = json.loads(summary_path.read_text(encoding="utf-8"))
    assert summary["processed"] == 2
    assert summary["successful"] == 1
    assert summary["failed"] == 1
def test_analyze_fragments_script_generates_reports_and_plot(tmp_path):
    """analyze_fragments.py emits statistics CSVs, a plot, and a text summary."""
    molecule = build_macrolactone(16, {5: "methyl", 7: "ethyl"})
    fragmented = MacrolactoneFragmenter().fragment_molecule(
        molecule.smiles, parent_id="analysis_1"
    )

    # Flatten each fragment into a row carrying its parent-level metadata.
    rows = []
    for fragment in fragmented.fragments:
        row = {
            "parent_id": fragmented.parent_id,
            "parent_smiles": fragmented.parent_smiles,
            "ring_size": fragmented.ring_size,
        }
        row.update(fragment.to_dict())
        rows.append(row)

    input_path = tmp_path / "fragments.csv"
    output_dir = tmp_path / "analysis"
    pd.DataFrame(rows).to_csv(input_path, index=False)

    completed = run_script(
        "analyze_fragments.py",
        "--input", str(input_path),
        "--output-dir", str(output_dir),
    )
    assert completed.returncode == 0, completed.stderr

    for artifact in (
        "position_statistics.csv",
        "fragment_property_summary.csv",
        "position_frequencies.png",
        "analysis_summary.txt",
    ):
        assert (output_dir / artifact).exists()
def test_generate_sdf_and_statistics_script_generates_artifacts(tmp_path):
    """generate_sdf_and_statistics.py writes cleavage stats and a 3D SDF file."""
    molecule = build_macrolactone(16, {5: "methyl"})
    fragmented = MacrolactoneFragmenter().fragment_molecule(
        molecule.smiles, parent_id="sdf_1"
    )

    # One row per fragment, each annotated with its parent's metadata.
    records = []
    for fragment in fragmented.fragments:
        record = {
            "parent_id": fragmented.parent_id,
            "parent_smiles": fragmented.parent_smiles,
            "ring_size": fragmented.ring_size,
        }
        record.update(fragment.to_dict())
        records.append(record)

    input_path = tmp_path / "fragments.csv"
    output_dir = tmp_path / "sdf_output"
    pd.DataFrame(records).to_csv(input_path, index=False)

    completed = run_script(
        "generate_sdf_and_statistics.py",
        "--input", str(input_path),
        "--output-dir", str(output_dir),
    )
    assert completed.returncode == 0, completed.stderr
    assert (output_dir / "cleavage_position_statistics.json").exists()
    # The SDF filename is derived from the parent_id given above.
    assert (output_dir / "sdf" / "sdf_1_3d.sdf").exists()
def test_analyze_validation_fragment_library_script_generates_reports(tmp_path):
    """End-to-end check of analyze_validation_fragment_library.py.

    Seeds a small fragment-library CSV and a parent-molecule SQLite database,
    runs the script for ring size 16, then verifies every expected report,
    plot, and summary artifact plus the per-position diversity numbers.
    """
    input_path = tmp_path / "fragment_library.csv"
    db_path = tmp_path / "fragments.db"
    output_dir = tmp_path / "fragment_library_analysis"

    def fragment_row(row_id, fragment_id, parent_ml_id, position, labeled, plain):
        # All seeded rows share the same static columns; only identity,
        # cleavage position, and SMILES vary.
        return {
            "id": row_id,
            "source_type": "validation_extract",
            "source_fragment_id": fragment_id,
            "source_parent_ml_id": parent_ml_id,
            "source_parent_chembl_id": None,
            "cleavage_position": position,
            "fragment_smiles_labeled": labeled,
            "fragment_smiles_plain": plain,
            "has_dummy_atom": True,
            "dummy_atom_count": 1,
            "splice_ready": True,
            "original_bond_type": "SINGLE",
            "created_at": "2026-03-19 00:00:00",
        }

    library_rows = [
        fragment_row(1, "ML16A_frag_0", "ML16A", 3, "[3*]C", "*C"),
        fragment_row(2, "ML16A_frag_1", "ML16A", 3, "[3*]CC", "*CC"),
        fragment_row(3, "ML16B_frag_0", "ML16B", 4, "[4*]O", "*O"),
        fragment_row(4, "ML14A_frag_0", "ML14A", 3, "[3*]CCC", "*CCC"),
    ]
    pd.DataFrame(library_rows).to_csv(input_path, index=False)

    with sqlite3.connect(db_path) as connection:
        connection.execute(
            """
            CREATE TABLE parent_molecules (
                id INTEGER PRIMARY KEY,
                ml_id TEXT NOT NULL,
                chembl_id TEXT,
                molecule_name TEXT,
                smiles TEXT NOT NULL,
                classification TEXT NOT NULL,
                ring_size INTEGER,
                primary_reason_code TEXT,
                primary_reason_message TEXT,
                processing_status TEXT NOT NULL,
                error_message TEXT,
                num_sidechains INTEGER,
                cleavage_positions TEXT,
                numbered_image_path TEXT,
                created_at TEXT NOT NULL,
                processed_at TEXT
            )
            """
        )
        connection.executemany(
            """
            INSERT INTO parent_molecules (
                id, ml_id, chembl_id, molecule_name, smiles, classification, ring_size,
                primary_reason_code, primary_reason_message, processing_status, error_message,
                num_sidechains, cleavage_positions, numbered_image_path, created_at, processed_at
            )
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            [
                (1, "ML16A", None, None, "C1CCCCCCCCCCCCCC(=O)O1", "standard_macrolactone", 16, None, None, "success", None, 2, "[3]", None, "2026-03-19 00:00:00", None),
                (2, "ML16B", None, None, "C1CCCCCCCCCCCCCC(=O)O1", "standard_macrolactone", 16, None, None, "success", None, 1, "[4]", None, "2026-03-19 00:00:00", None),
                (3, "ML14A", None, None, "C1CCCCCCCCCCCC(=O)O1", "standard_macrolactone", 14, None, None, "success", None, 1, "[3]", None, "2026-03-19 00:00:00", None),
            ],
        )
        connection.commit()

    completed = run_script(
        "analyze_validation_fragment_library.py",
        "--input", str(input_path),
        "--db", str(db_path),
        "--output-dir", str(output_dir),
        "--ring-size", "16",
    )
    assert completed.returncode == 0, completed.stderr

    expected_artifacts = (
        "fragment_atom_count_distribution.png",
        "fragment_atom_count_summary.csv",
        "fragment_atom_count_filter_candidates.csv",
        "ring16_position_count_comparison.csv",
        "ring16_position_count_comparison.png",
        "ring16_position_atom_count_distribution.png",
        "ring16_position_atom_count_boxplot_gt3.png",
        "ring16_position_diversity.csv",
        "ring16_position_diversity_gt3.csv",
        "ring16_position_diversity_gt3.png",
        "ring16_position_ring_sensitivity.csv",
        "ring16_position_ring_sensitivity.png",
        "ring16_medchem_hotspot_comparison.csv",
        "ring16_medchem_hotspot_comparison.png",
        "fragment_library_analysis_report.md",
        "fragment_library_analysis_report_zh.md",
        "analysis_summary.txt",
    )
    for artifact in expected_artifacts:
        assert (output_dir / artifact).exists()

    # Positions 3 and 4 were seeded for ring 16 with two and one fragments.
    diversity = pd.read_csv(output_dir / "ring16_position_diversity.csv")
    assert set(diversity["cleavage_position"]) == {3, 4}
    assert set(diversity["total_fragments"]) == {1, 2}

    # No seeded fragment exceeds three atoms, so the >3 table must be empty.
    diversity_gt3 = pd.read_csv(output_dir / "ring16_position_diversity_gt3.csv")
    assert diversity_gt3.empty

    report_zh = (output_dir / "fragment_library_analysis_report_zh.md").read_text(encoding="utf-8")
    for snippet in (
        "桥连或双锚点侧链不会进入当前片段库",
        "cyclic single-anchor side chains",
        "6 → 13",
        "7 → 12",
        "15 → 4",
        "16 → 3",
    ):
        assert snippet in report_zh
def test_active_text_assets_do_not_reference_legacy_api():
    """Live documentation must not mention the retired src.* API surface."""
    legacy_markers = (
        "from src.",
        "import src.",
        "process_csv(",
        "batch_to_dataframe(",
        "visualize_molecule(",
        "save_to_json(",
    )
    for asset in ACTIVE_TEXT_ASSETS:
        content = asset.read_text(encoding="utf-8")
        for marker in legacy_markers:
            assert marker not in content, f"{asset} still contains legacy reference: {marker}"