docs(validation): add macrolactone fragmentation report
This commit is contained in:
@@ -3,6 +3,7 @@ from __future__ import annotations
|
||||
import argparse
|
||||
from math import ceil
|
||||
from pathlib import Path
|
||||
import sqlite3
|
||||
|
||||
import matplotlib
|
||||
|
||||
@@ -448,6 +449,48 @@ def format_position_mapping(positions: list[int], ring_size: int) -> str:
|
||||
return ", ".join(f"{position} → {mirror_ring_position(position, ring_size)}" for position in positions)
|
||||
|
||||
|
||||
def build_zero_fragment_parent_table(db_path: str | Path, ring_size: int) -> pd.DataFrame:
    """List successfully processed standard macrolactone parents without fragments.

    Rows are read from ``parent_molecules`` where ``classification`` is
    ``'standard_macrolactone'``, ``ring_size`` equals *ring_size* and
    ``processing_status`` is ``'success'``.  Which parents count as
    "zero-fragment" depends on the tables present in the database:

    * If ``fragment_library_entries`` exists, a parent qualifies when it has
      no entry with ``source_type = 'validation_extract'`` and
      ``splice_ready = 1``.
    * Otherwise a parent qualifies when ``num_sidechains = 0``.

    Args:
        db_path: Path of the SQLite database file.
        ring_size: Ring size the parents are filtered on.

    Returns:
        A DataFrame with the columns ``ml_id``, ``num_sidechains`` and
        ``cleavage_positions``, ordered by ``ml_id``.
    """
    # Imported locally so the function does not rely on a module-level import.
    from contextlib import closing

    parent_filter = """
        SELECT ml_id, num_sidechains, cleavage_positions
        FROM parent_molecules
        WHERE classification = 'standard_macrolactone'
          AND ring_size = ?
          AND processing_status = 'success'
    """
    without_splice_ready_fragment = """
          AND NOT EXISTS (
              SELECT 1
              FROM fragment_library_entries fle
              WHERE fle.source_parent_ml_id = parent_molecules.ml_id
                AND fle.source_type = 'validation_extract'
                AND fle.splice_ready = 1
          )
    """
    # The lightweight test database only includes parent_molecules.
    without_sidechains = """
          AND num_sidechains = 0
    """

    # sqlite3's own context manager only commits or rolls back; closing()
    # makes sure the connection itself is released when the block exits.
    with closing(sqlite3.connect(db_path)) as connection:
        table_names = {
            row[0]
            for row in connection.execute(
                "SELECT name FROM sqlite_master WHERE type='table'"
            )
        }
        if "fragment_library_entries" in table_names:
            extra_predicate = without_splice_ready_fragment
        else:
            extra_predicate = without_sidechains

        query = f"{parent_filter}{extra_predicate}        ORDER BY ml_id\n"
        return pd.read_sql_query(
            query,
            connection,
            params=[ring_size],
        )
|
||||
|
||||
|
||||
def build_markdown_report(
|
||||
output_dir: Path,
|
||||
analysis_df: pd.DataFrame,
|
||||
@@ -655,6 +698,8 @@ def build_markdown_report_zh(
|
||||
ring_df: pd.DataFrame,
|
||||
filtered_ring_df: pd.DataFrame,
|
||||
filter_candidates: pd.DataFrame,
|
||||
standard_success_parent_count: int,
|
||||
zero_fragment_ring_parents: pd.DataFrame,
|
||||
diversity_gt3: pd.DataFrame,
|
||||
position_counts: pd.DataFrame,
|
||||
ring_sensitivity_table: pd.DataFrame,
|
||||
@@ -688,8 +733,20 @@ def build_markdown_report_zh(
|
||||
"",
|
||||
f"- 当前验证后的可拼接碎片库包含 **{len(analysis_df):,}** 条片段记录,来源于 **{analysis_df['source_parent_ml_id'].nunique():,}** 个母体分子。",
|
||||
f"- 其中 {ring_size} 元环子集包含 **{len(ring_df):,}** 条片段记录,来源于 **{ring_df['source_parent_ml_id'].nunique():,}** 个母体分子。",
|
||||
f"- 这里的“可拼接碎片”指验证库中 `source_type='validation_extract'` 且 `splice_ready=1` 的单锚点侧链片段。桥环、稠环或任何具有多个环连接点的侧链都已经在生成阶段被排除,不会进入这份库。",
|
||||
f"- 16 元环子集是按母体元数据里的 `ring_size=16` 直接过滤出来的,不是从片段反推 ring size。",
|
||||
f"- 用于设计相关位点分析的严格子集定义为:片段重原子数 **>= {design_min_atoms}**。",
|
||||
"",
|
||||
"## 16 元环母体计数口径说明",
|
||||
"",
|
||||
f"- 在数据库里,`standard_macrolactone + ring_size={ring_size} + processing_status=success` 一共有 **{standard_success_parent_count:,}** 个母体。",
|
||||
f"- 其中 **{ring_df['source_parent_ml_id'].nunique():,}** 个母体至少产出过 1 条可拼接片段,所以进入了当前报告的 16 元环片段统计。",
|
||||
f"- 剩余 **{len(zero_fragment_ring_parents):,}** 个母体没有任何可拼接片段;它们的共同特征是 `num_sidechains=0`、`cleavage_positions=[]`。",
|
||||
"",
|
||||
"```text",
|
||||
zero_fragment_ring_parents.to_string(index=False) if not zero_fragment_ring_parents.empty else "No zero-fragment parents.",
|
||||
"```",
|
||||
"",
|
||||
"## 全库碎片大小结论",
|
||||
"",
|
||||
f"- 默认清洗阈值建议使用 `<= {design_min_atoms - 2}` 重原子删除。该阈值会删除 **{int(conservative_filter.removed_rows):,}** 条记录({conservative_filter.removed_row_fraction:.1%}),但仅删除 **{int(conservative_filter.removed_unique_fragments):,}** 个唯一片段({conservative_filter.removed_unique_fraction:.1%})。",
|
||||
@@ -730,7 +787,7 @@ def build_markdown_report_zh(
|
||||
"",
|
||||
"## 桥环 / 稠环干扰的敏感性分析",
|
||||
"",
|
||||
"桥连或双锚点侧链不会进入当前片段库,因为断裂逻辑只保留与主环存在 **1 个连接点** 的侧链组件。也就是说,真正的 bridge / fused multi-anchor components 已被代码层面排除。",
|
||||
"桥连或双锚点侧链不会进入当前片段库,因为断裂逻辑只保留与主环存在 **1 个连接点** 的侧链组件。也就是说,真正的 bridge / fused multi-anchor components 已被代码层面排除。上面那 6 个 16 元环母体并不是这类“被误收进来的桥环碎片”,而是根本没有任何可拼接外侧链,所以不会产生 fragment 行。",
|
||||
"",
|
||||
"但是,需要额外区分另一类情况:**cyclic single-anchor side chains**。这类片段虽然只在一个位置连到主环,因此会被保留下来,但片段自身可能包含糖环、杂环或其他环状骨架,仍然会显著影响位点多样性排名。",
|
||||
"",
|
||||
@@ -905,6 +962,21 @@ def main(argv: list[str] | None = None) -> None:
|
||||
].copy()
|
||||
ring_df = analysis_df[analysis_df["ring_size"] == args.ring_size].copy()
|
||||
filtered_ring_df = ring_df[ring_df["fragment_atom_count"] >= args.design_min_atoms].copy()
|
||||
zero_fragment_ring_parents = build_zero_fragment_parent_table(args.db, args.ring_size)
|
||||
with sqlite3.connect(args.db) as connection:
|
||||
standard_success_parent_count = int(
|
||||
pd.read_sql_query(
|
||||
"""
|
||||
SELECT COUNT(*) AS count
|
||||
FROM parent_molecules
|
||||
WHERE classification = 'standard_macrolactone'
|
||||
AND ring_size = ?
|
||||
AND processing_status = 'success'
|
||||
""",
|
||||
connection,
|
||||
params=[args.ring_size],
|
||||
).iloc[0]["count"]
|
||||
)
|
||||
|
||||
if analysis_df.empty:
|
||||
raise ValueError("No splice-ready standard macrolactone fragments available for analysis.")
|
||||
@@ -1003,6 +1075,8 @@ def main(argv: list[str] | None = None) -> None:
|
||||
ring_df,
|
||||
filtered_ring_df,
|
||||
filter_candidates,
|
||||
standard_success_parent_count,
|
||||
zero_fragment_ring_parents,
|
||||
diversity_gt3,
|
||||
position_counts,
|
||||
ring_sensitivity,
|
||||
|
||||
Reference in New Issue
Block a user