feat(validation): archive key result assets
Keep key validation outputs and analysis tables tracked directly, package analysis plot PNGs into a small tar.gz backup, and add analysis scripts plus tests so the stored results remain reproducible without flooding git with large image trees.
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
@@ -133,6 +134,159 @@ def test_generate_sdf_and_statistics_script_generates_artifacts(tmp_path):
|
||||
assert (output_dir / "sdf" / "sdf_1_3d.sdf").exists()
|
||||
|
||||
|
||||
def test_analyze_validation_fragment_library_script_generates_reports(tmp_path):
    """Run ``analyze_validation_fragment_library.py`` end to end on a tiny fixture.

    The test writes a four-row fragment-library CSV and a three-row
    ``parent_molecules`` SQLite table under ``tmp_path``, invokes the script
    with ``--ring-size 16``, and then checks that:

    * the script exits with return code 0,
    * every expected plot / table / report file exists in the output directory,
    * the two ring-16 diversity tables have the expected content, and
    * the Chinese report contains two specific phrases.
    """
    # Local import: sqlite3's own connection context manager only commits or
    # rolls back; ``closing`` is what actually releases the database file.
    from contextlib import closing

    input_path = tmp_path / "fragment_library.csv"
    db_path = tmp_path / "fragments.db"
    output_dir = tmp_path / "fragment_library_analysis"

    # (source_fragment_id, source_parent_ml_id, cleavage_position,
    #  fragment_smiles_labeled, fragment_smiles_plain)
    fragment_specs = [
        ("ML16A_frag_0", "ML16A", 3, "[3*]C", "*C"),
        ("ML16A_frag_1", "ML16A", 3, "[3*]CC", "*CC"),
        ("ML16B_frag_0", "ML16B", 4, "[4*]O", "*O"),
        ("ML14A_frag_0", "ML14A", 3, "[3*]CCC", "*CCC"),
    ]
    # Key order is kept identical to the CSV column order of the fixture.
    fragment_rows = [
        {
            "id": row_id,
            "source_type": "validation_extract",
            "source_fragment_id": fragment_id,
            "source_parent_ml_id": parent_ml_id,
            "source_parent_chembl_id": None,
            "cleavage_position": cleavage_position,
            "fragment_smiles_labeled": smiles_labeled,
            "fragment_smiles_plain": smiles_plain,
            "has_dummy_atom": True,
            "dummy_atom_count": 1,
            "splice_ready": True,
            "original_bond_type": "SINGLE",
            "created_at": "2026-03-19 00:00:00",
        }
        for row_id, (
            fragment_id,
            parent_ml_id,
            cleavage_position,
            smiles_labeled,
            smiles_plain,
        ) in enumerate(fragment_specs, start=1)
    ]
    pd.DataFrame(fragment_rows).to_csv(input_path, index=False)

    # Close the connection explicitly before the subprocess opens the same file.
    with closing(sqlite3.connect(db_path)) as connection:
        connection.execute(
            """
            CREATE TABLE parent_molecules (
                id INTEGER PRIMARY KEY,
                ml_id TEXT NOT NULL,
                chembl_id TEXT,
                molecule_name TEXT,
                smiles TEXT NOT NULL,
                classification TEXT NOT NULL,
                ring_size INTEGER,
                primary_reason_code TEXT,
                primary_reason_message TEXT,
                processing_status TEXT NOT NULL,
                error_message TEXT,
                num_sidechains INTEGER,
                cleavage_positions TEXT,
                numbered_image_path TEXT,
                created_at TEXT NOT NULL,
                processed_at TEXT
            )
            """
        )
        connection.executemany(
            """
            INSERT INTO parent_molecules (
                id, ml_id, chembl_id, molecule_name, smiles, classification, ring_size,
                primary_reason_code, primary_reason_message, processing_status, error_message,
                num_sidechains, cleavage_positions, numbered_image_path, created_at, processed_at
            )
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            [
                (1, "ML16A", None, None, "C1CCCCCCCCCCCCCC(=O)O1", "standard_macrolactone", 16, None, None, "success", None, 2, "[3]", None, "2026-03-19 00:00:00", None),
                (2, "ML16B", None, None, "C1CCCCCCCCCCCCCC(=O)O1", "standard_macrolactone", 16, None, None, "success", None, 1, "[4]", None, "2026-03-19 00:00:00", None),
                (3, "ML14A", None, None, "C1CCCCCCCCCCCC(=O)O1", "standard_macrolactone", 14, None, None, "success", None, 1, "[3]", None, "2026-03-19 00:00:00", None),
            ],
        )
        connection.commit()

    completed = run_script(
        "analyze_validation_fragment_library.py",
        "--input",
        str(input_path),
        "--db",
        str(db_path),
        "--output-dir",
        str(output_dir),
        "--ring-size",
        "16",
    )

    assert completed.returncode == 0, completed.stderr

    expected_artifacts = [
        "fragment_atom_count_distribution.png",
        "fragment_atom_count_summary.csv",
        "fragment_atom_count_filter_candidates.csv",
        "ring16_position_count_comparison.csv",
        "ring16_position_count_comparison.png",
        "ring16_position_atom_count_distribution.png",
        "ring16_position_atom_count_boxplot_gt3.png",
        "ring16_position_diversity.csv",
        "ring16_position_diversity_gt3.csv",
        "ring16_position_diversity_gt3.png",
        "ring16_position_ring_sensitivity.csv",
        "ring16_position_ring_sensitivity.png",
        "ring16_medchem_hotspot_comparison.csv",
        "ring16_medchem_hotspot_comparison.png",
        "fragment_library_analysis_report.md",
        "fragment_library_analysis_report_zh.md",
        "analysis_summary.txt",
    ]
    # Collect every missing file so one failure reports the whole gap.
    missing_artifacts = [
        name for name in expected_artifacts if not (output_dir / name).exists()
    ]
    assert not missing_artifacts, f"missing output artifacts: {missing_artifacts}"

    # NOTE(review): presumably only the ML16A / ML16B fragments survive the
    # --ring-size 16 filter (two at position 3, one at position 4) — confirm
    # against the script.
    diversity = pd.read_csv(output_dir / "ring16_position_diversity.csv")
    assert set(diversity["cleavage_position"]) == {3, 4}
    assert set(diversity["total_fragments"]) == {1, 2}

    # NOTE(review): "gt3" presumably means "more than 3 non-dummy atoms", which
    # no ring-16 fixture fragment has — confirm against the script.
    diversity_gt3 = pd.read_csv(output_dir / "ring16_position_diversity_gt3.csv")
    assert diversity_gt3.empty

    report_zh = (output_dir / "fragment_library_analysis_report_zh.md").read_text(encoding="utf-8")
    assert "桥连或双锚点侧链不会进入当前片段库" in report_zh
    assert "cyclic single-anchor side chains" in report_zh
|
||||
|
||||
|
||||
def test_active_text_assets_do_not_reference_legacy_api():
|
||||
forbidden_patterns = [
|
||||
"from src.",
|
||||
|
||||
40
tests/validation/test_fragment_library_analysis.py
Normal file
40
tests/validation/test_fragment_library_analysis.py
Normal file
@@ -0,0 +1,40 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from macro_lactone_toolkit.validation.fragment_library_analysis import (
|
||||
build_position_diversity_table,
|
||||
count_non_dummy_atoms,
|
||||
)
|
||||
|
||||
|
||||
def test_count_non_dummy_atoms_excludes_dummy_atoms() -> None:
    """Check ``count_non_dummy_atoms`` on three SMILES that each contain one ``*``."""
    # Plain fragment SMILES -> expected count returned by the function.
    expected_counts = {
        "*O": 1,
        "*C": 1,
        "*C(C)C": 3,
    }
    for fragment_smiles, expected in expected_counts.items():
        assert count_non_dummy_atoms(fragment_smiles) == expected
|
||||
|
||||
|
||||
def test_build_position_diversity_table_combines_frequency_and_structure_metrics() -> None:
    """Check the per-position rows returned by ``build_position_diversity_table``.

    The input has four fragments at cleavage position 3 (three distinct SMILES)
    and a single fragment at position 4.
    """
    records = [
        (3, "*C"),
        (3, "*CC"),
        (3, "*CC"),
        (3, "*O"),
        (4, "*C"),
    ]
    fragments = pd.DataFrame(
        records,
        columns=["cleavage_position", "fragment_smiles_plain"],
    )

    table = build_position_diversity_table(fragments)
    by_position = table.set_index("cleavage_position")

    # Position 3: count-based metrics are pinned exactly, the remaining metrics
    # only need to be strictly positive.
    assert by_position.loc[3, "total_fragments"] == 4
    assert by_position.loc[3, "unique_fragments"] == 3
    assert by_position.loc[3, "normalized_unique_ratio"] == pytest.approx(0.75)
    for metric in (
        "shannon_entropy",
        "normalized_shannon_entropy",
        "mean_pairwise_tanimoto_distance",
    ):
        assert by_position.loc[3, metric] > 0.0

    # Position 4: a single fragment.
    assert by_position.loc[4, "total_fragments"] == 1
    assert by_position.loc[4, "unique_fragments"] == 1
    assert by_position.loc[4, "mean_pairwise_tanimoto_distance"] == 0.0
|
||||
Reference in New Issue
Block a user