docs(validation): add macrolactone fragmentation report

This commit is contained in:
hotwa
2026-03-24 22:35:05 +08:00
parent 3e07402f4e
commit 8478abbbaa
3 changed files with 650 additions and 3 deletions

View File

@@ -3,6 +3,7 @@ from __future__ import annotations
import argparse
from math import ceil
from pathlib import Path
import sqlite3
import matplotlib
@@ -448,6 +449,48 @@ def format_position_mapping(positions: list[int], ring_size: int) -> str:
return ", ".join(f"{position}{mirror_ring_position(position, ring_size)}" for position in positions)
def build_zero_fragment_parent_table(db_path: str | Path, ring_size: int) -> pd.DataFrame:
with sqlite3.connect(db_path) as connection:
tables = {
row[0]
for row in connection.execute(
"SELECT name FROM sqlite_master WHERE type='table'"
)
}
if "fragment_library_entries" in tables:
query = """
SELECT ml_id, num_sidechains, cleavage_positions
FROM parent_molecules
WHERE classification = 'standard_macrolactone'
AND ring_size = ?
AND processing_status = 'success'
AND NOT EXISTS (
SELECT 1
FROM fragment_library_entries fle
WHERE fle.source_parent_ml_id = parent_molecules.ml_id
AND fle.source_type = 'validation_extract'
AND fle.splice_ready = 1
)
ORDER BY ml_id
"""
else:
# The lightweight test database only includes parent_molecules.
query = """
SELECT ml_id, num_sidechains, cleavage_positions
FROM parent_molecules
WHERE classification = 'standard_macrolactone'
AND ring_size = ?
AND processing_status = 'success'
AND num_sidechains = 0
ORDER BY ml_id
"""
return pd.read_sql_query(
query,
connection,
params=[ring_size],
)
def build_markdown_report(
output_dir: Path,
analysis_df: pd.DataFrame,
@@ -655,6 +698,8 @@ def build_markdown_report_zh(
ring_df: pd.DataFrame,
filtered_ring_df: pd.DataFrame,
filter_candidates: pd.DataFrame,
standard_success_parent_count: int,
zero_fragment_ring_parents: pd.DataFrame,
diversity_gt3: pd.DataFrame,
position_counts: pd.DataFrame,
ring_sensitivity_table: pd.DataFrame,
@@ -688,8 +733,20 @@ def build_markdown_report_zh(
"",
f"- 当前验证后的可拼接碎片库包含 **{len(analysis_df):,}** 条片段记录,来源于 **{analysis_df['source_parent_ml_id'].nunique():,}** 个母体分子。",
f"- 其中 {ring_size} 元环子集包含 **{len(ring_df):,}** 条片段记录,来源于 **{ring_df['source_parent_ml_id'].nunique():,}** 个母体分子。",
f"- 这里的“可拼接碎片”指验证库中 `source_type='validation_extract'` 且 `splice_ready=1` 的单锚点侧链片段。桥环、稠环或任何具有多个环连接点的侧链都已经在生成阶段被排除,不会进入这份库。",
f"- 16 元环子集是按母体元数据里的 `ring_size=16` 直接过滤出来的,不是从片段反推 ring size。",
f"- 用于设计相关位点分析的严格子集定义为:片段重原子数 **>= {design_min_atoms}**。",
"",
"## 16 元环母体计数口径说明",
"",
f"- 在数据库里,`standard_macrolactone + ring_size={ring_size} + processing_status=success` 一共有 **{standard_success_parent_count:,}** 个母体。",
f"- 其中 **{ring_df['source_parent_ml_id'].nunique():,}** 个母体至少产出过 1 条可拼接片段,所以进入了当前报告的 16 元环片段统计。",
f"- 剩余 **{len(zero_fragment_ring_parents):,}** 个母体没有任何可拼接片段;它们的共同特征是 `num_sidechains=0`、`cleavage_positions=[]`。",
"",
"```text",
zero_fragment_ring_parents.to_string(index=False) if not zero_fragment_ring_parents.empty else "No zero-fragment parents.",
"```",
"",
"## 全库碎片大小结论",
"",
f"- 默认清洗阈值建议使用 `<= {design_min_atoms - 2}` 重原子删除。该阈值会删除 **{int(conservative_filter.removed_rows):,}** 条记录({conservative_filter.removed_row_fraction:.1%}),但仅删除 **{int(conservative_filter.removed_unique_fragments):,}** 个唯一片段({conservative_filter.removed_unique_fraction:.1%})。",
@@ -730,7 +787,7 @@ def build_markdown_report_zh(
"",
"## 桥环 / 稠环干扰的敏感性分析",
"",
"桥连或双锚点侧链不会进入当前片段库,因为断裂逻辑只保留与主环存在 **1 个连接点** 的侧链组件。也就是说,真正的 bridge / fused multi-anchor components 已被代码层面排除。",
"桥连或双锚点侧链不会进入当前片段库,因为断裂逻辑只保留与主环存在 **1 个连接点** 的侧链组件。也就是说,真正的 bridge / fused multi-anchor components 已被代码层面排除。上面那 6 个 16 元环母体并不是这类“被误收进来的桥环碎片”,而是根本没有任何可拼接外侧链,所以不会产生 fragment 行。",
"",
"但是,需要额外区分另一类情况:**cyclic single-anchor side chains**。这类片段虽然只在一个位置连到主环,因此会被保留下来,但片段自身可能包含糖环、杂环或其他环状骨架,仍然会显著影响位点多样性排名。",
"",
@@ -905,6 +962,21 @@ def main(argv: list[str] | None = None) -> None:
].copy()
ring_df = analysis_df[analysis_df["ring_size"] == args.ring_size].copy()
filtered_ring_df = ring_df[ring_df["fragment_atom_count"] >= args.design_min_atoms].copy()
zero_fragment_ring_parents = build_zero_fragment_parent_table(args.db, args.ring_size)
with sqlite3.connect(args.db) as connection:
standard_success_parent_count = int(
pd.read_sql_query(
"""
SELECT COUNT(*) AS count
FROM parent_molecules
WHERE classification = 'standard_macrolactone'
AND ring_size = ?
AND processing_status = 'success'
""",
connection,
params=[args.ring_size],
).iloc[0]["count"]
)
if analysis_df.empty:
raise ValueError("No splice-ready standard macrolactone fragments available for analysis.")
@@ -1003,6 +1075,8 @@ def main(argv: list[str] | None = None) -> None:
ring_df,
filtered_ring_df,
filter_candidates,
standard_success_parent_count,
zero_fragment_ring_parents,
diversity_gt3,
position_counts,
ring_sensitivity,