feat(validation): archive key result assets

Keep key validation outputs and analysis tables tracked directly,
package analysis plot PNGs into a small tar.gz backup, and add
analysis scripts plus tests so the stored results remain
reproducible without flooding git with large image trees.
This commit is contained in:
2026-03-19 21:29:54 +08:00
parent f6bf9e85a3
commit 8071a141ee
20 changed files with 36723 additions and 0 deletions

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
import json
import sqlite3
import subprocess
import sys
from pathlib import Path
@@ -133,6 +134,159 @@ def test_generate_sdf_and_statistics_script_generates_artifacts(tmp_path):
assert (output_dir / "sdf" / "sdf_1_3d.sdf").exists()
def test_analyze_validation_fragment_library_script_generates_reports(tmp_path):
    """Run ``analyze_validation_fragment_library.py`` end to end on a tiny fixture.

    The fixture is a four-row fragment library CSV plus a SQLite database
    holding the three parent molecules (two with ``ring_size`` 16, one with
    ``ring_size`` 14). The script is invoked with ``--ring-size 16`` and the
    test checks that it exits cleanly, writes every expected artifact, and
    that the diversity tables and the Chinese report carry the expected
    content.
    """
    input_path = tmp_path / "fragment_library.csv"
    db_path = tmp_path / "fragments.db"
    output_dir = tmp_path / "fragment_library_analysis"

    created_at = "2026-03-19 00:00:00"

    # (source_fragment_id, parent ml_id, cleavage_position, labeled, plain)
    fragment_specs = [
        ("ML16A_frag_0", "ML16A", 3, "[3*]C", "*C"),
        ("ML16A_frag_1", "ML16A", 3, "[3*]CC", "*CC"),
        ("ML16B_frag_0", "ML16B", 4, "[4*]O", "*O"),
        ("ML14A_frag_0", "ML14A", 3, "[3*]CCC", "*CCC"),
    ]
    fragment_rows = [
        {
            "id": row_id,
            "source_type": "validation_extract",
            "source_fragment_id": fragment_id,
            "source_parent_ml_id": parent_ml_id,
            "source_parent_chembl_id": None,
            "cleavage_position": position,
            "fragment_smiles_labeled": labeled_smiles,
            "fragment_smiles_plain": plain_smiles,
            "has_dummy_atom": True,
            "dummy_atom_count": 1,
            "splice_ready": True,
            "original_bond_type": "SINGLE",
            "created_at": created_at,
        }
        for row_id, (
            fragment_id,
            parent_ml_id,
            position,
            labeled_smiles,
            plain_smiles,
        ) in enumerate(fragment_specs, start=1)
    ]
    pd.DataFrame(fragment_rows).to_csv(input_path, index=False)

    ring16_smiles = "C1CCCCCCCCCCCCCC(=O)O1"
    ring14_smiles = "C1CCCCCCCCCCCC(=O)O1"
    parent_rows = [
        (
            1, "ML16A", None, None, ring16_smiles, "standard_macrolactone", 16,
            None, None, "success", None, 2, "[3]", None, created_at, None,
        ),
        (
            2, "ML16B", None, None, ring16_smiles, "standard_macrolactone", 16,
            None, None, "success", None, 1, "[4]", None, created_at, None,
        ),
        (
            3, "ML14A", None, None, ring14_smiles, "standard_macrolactone", 14,
            None, None, "success", None, 1, "[3]", None, created_at, None,
        ),
    ]

    # ``with connection:`` only scopes the transaction (commit on success,
    # rollback on error); it does not close the handle. Close it explicitly so
    # the database file is released before the script subprocess opens it.
    connection = sqlite3.connect(db_path)
    try:
        with connection:
            connection.execute(
                """
                CREATE TABLE parent_molecules (
                    id INTEGER PRIMARY KEY,
                    ml_id TEXT NOT NULL,
                    chembl_id TEXT,
                    molecule_name TEXT,
                    smiles TEXT NOT NULL,
                    classification TEXT NOT NULL,
                    ring_size INTEGER,
                    primary_reason_code TEXT,
                    primary_reason_message TEXT,
                    processing_status TEXT NOT NULL,
                    error_message TEXT,
                    num_sidechains INTEGER,
                    cleavage_positions TEXT,
                    numbered_image_path TEXT,
                    created_at TEXT NOT NULL,
                    processed_at TEXT
                )
                """
            )
            connection.executemany(
                """
                INSERT INTO parent_molecules (
                    id, ml_id, chembl_id, molecule_name, smiles, classification, ring_size,
                    primary_reason_code, primary_reason_message, processing_status, error_message,
                    num_sidechains, cleavage_positions, numbered_image_path, created_at, processed_at
                )
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                parent_rows,
            )
    finally:
        connection.close()

    completed = run_script(
        "analyze_validation_fragment_library.py",
        "--input",
        str(input_path),
        "--db",
        str(db_path),
        "--output-dir",
        str(output_dir),
        "--ring-size",
        "16",
    )
    # Surface the script's stderr when it exits with a failure code.
    assert completed.returncode == 0, completed.stderr

    expected_artifacts = [
        "fragment_atom_count_distribution.png",
        "fragment_atom_count_summary.csv",
        "fragment_atom_count_filter_candidates.csv",
        "ring16_position_count_comparison.csv",
        "ring16_position_count_comparison.png",
        "ring16_position_atom_count_distribution.png",
        "ring16_position_atom_count_boxplot_gt3.png",
        "ring16_position_diversity.csv",
        "ring16_position_diversity_gt3.csv",
        "ring16_position_diversity_gt3.png",
        "ring16_position_ring_sensitivity.csv",
        "ring16_position_ring_sensitivity.png",
        "ring16_medchem_hotspot_comparison.csv",
        "ring16_medchem_hotspot_comparison.png",
        "fragment_library_analysis_report.md",
        "fragment_library_analysis_report_zh.md",
        "analysis_summary.txt",
    ]
    for artifact_name in expected_artifacts:
        assert (output_dir / artifact_name).exists()

    # Only the ML16A/ML16B fragments are expected here: two at position 3 and
    # one at position 4. Presumably the ML14A fragment is dropped by the
    # --ring-size 16 filter -- confirm against the script.
    diversity = pd.read_csv(output_dir / "ring16_position_diversity.csv")
    assert set(diversity["cleavage_position"]) == {3, 4}
    assert set(diversity["total_fragments"]) == {1, 2}

    # NOTE(review): "gt3" presumably means fragments with more than three
    # atoms; no fixture fragment is that large, so the table must be empty.
    diversity_gt3 = pd.read_csv(output_dir / "ring16_position_diversity_gt3.csv")
    assert diversity_gt3.empty

    report_zh = (output_dir / "fragment_library_analysis_report_zh.md").read_text(encoding="utf-8")
    assert "桥连或双锚点侧链不会进入当前片段库" in report_zh
    assert "cyclic single-anchor side chains" in report_zh
def test_active_text_assets_do_not_reference_legacy_api():
forbidden_patterns = [
"from src.",

View File

@@ -0,0 +1,40 @@
from __future__ import annotations
import pandas as pd
import pytest
from macro_lactone_toolkit.validation.fragment_library_analysis import (
build_position_diversity_table,
count_non_dummy_atoms,
)
def test_count_non_dummy_atoms_excludes_dummy_atoms() -> None:
    """Check that ``count_non_dummy_atoms`` leaves the ``*`` dummy atom out of the count."""
    # Fragment SMILES mapped to the atom count expected once the dummy is ignored.
    expected_counts = {
        "*O": 1,
        "*C": 1,
        "*C(C)C": 3,
    }
    for fragment_smiles, expected in expected_counts.items():
        assert count_non_dummy_atoms(fragment_smiles) == expected
def test_build_position_diversity_table_combines_frequency_and_structure_metrics() -> None:
    """Check the per-position diversity summary on a five-fragment fixture.

    Position 3 holds four fragments, three of them distinct, so its count,
    ratio, entropy and Tanimoto-distance metrics must all be non-trivial.
    Position 4 holds a single fragment, so its pairwise distance must be zero.
    """
    fragments = pd.DataFrame(
        {
            "cleavage_position": [3, 3, 3, 3, 4],
            "fragment_smiles_plain": ["*C", "*CC", "*CC", "*O", "*C"],
        }
    )

    table = build_position_diversity_table(fragments)
    summary = table.set_index("cleavage_position")

    # Position 3: four fragments, three unique -> ratio 3/4.
    assert summary.loc[3, "total_fragments"] == 4
    assert summary.loc[3, "unique_fragments"] == 3
    assert summary.loc[3, "normalized_unique_ratio"] == pytest.approx(0.75)
    strictly_positive_metrics = (
        "shannon_entropy",
        "normalized_shannon_entropy",
        "mean_pairwise_tanimoto_distance",
    )
    for metric_name in strictly_positive_metrics:
        assert summary.loc[3, metric_name] > 0.0

    # Position 4: a lone fragment has no pairs to compare.
    assert summary.loc[4, "total_fragments"] == 1
    assert summary.loc[4, "unique_fragments"] == 1
    assert summary.loc[4, "mean_pairwise_tanimoto_distance"] == 0.0