feat(shotter): 实现 Shotter v1 活性评估与单 FNA 流程,新增 API/CLI/绘图与报告

- 新增 scripts/bttoxin_shoter.py:从 BPPRC 正样本 CSV 构建 name/亚家族/家族特异性索引,
  解析 BtToxin_Digger All_Toxins.txt,计算 per-hit 权重并以 noisy-OR 合成菌株×目标目/物种分数,
  输出 TSV/JSON;含 HMM 加成与配对毒素规则(Vip1/Vip2,Vpa/Vpb),other/unknown 桶。
- 新增端到端工具链:
  - scripts/run_single_fna_pipeline.py:Digger → Shotter → Plot → 打包
  - scripts/plot_shotter.py:绘制热图并生成论文式/摘要式报告
  - scripts/bttoxin_api.py 与 bttoxin/api.py:纯 Python API;bttoxin/cli.py 暴露 bttoxin-run
  - pyproject.toml:项目打包与 CLI 入口
- docs(README): 增加输入文件格式与结果解读,补充单目录写入方案
- chore(gitignore): 忽略 runs/ 与 tests/output
- ci: 移除 .woodpecker/test.yml
This commit is contained in:
2025-12-01 10:11:26 +08:00
parent 3fc41a2924
commit 5883e13c56
11 changed files with 2195 additions and 16 deletions

543
scripts/plot_shotter.py Normal file
View File

@@ -0,0 +1,543 @@
#!/usr/bin/env python3
"""
Plotting utilities for Bttoxin_Shoter outputs.
Generates heatmaps for:
- Strain × Target Orders scores (from strain_target_scores.tsv)
- Optional Per-hit contributions for one strain (from toxin_support.tsv)
- Optional Strain × Target Species scores (from strain_target_species_scores.tsv)
Can also write a Markdown report with:
- summary mode: brief overview
- paper mode: expanded, paper-like sections (Abstract, Methods, Results, Discussion, Params, Appendix)
Dependencies:
- pandas, matplotlib; seaborn is optional (for nicer heatmaps). If seaborn is
missing, a pure-matplotlib fallback will be used.
"""
from __future__ import annotations
import argparse
from pathlib import Path
from typing import List, Optional
import pandas as pd
try:
import seaborn as sns # type: ignore
_HAS_SNS = True
except Exception:
_HAS_SNS = False
import matplotlib.pyplot as plt
def _parse_figsize(s: str | None, default=(12, 6)):
if not s:
return default
try:
w, h = s.lower().split("x")
return (float(w), float(h))
except Exception:
return default
def _merge_unresolved_columns(df: pd.DataFrame, merge_unresolved: bool) -> pd.DataFrame:
if not merge_unresolved:
return df
cols = df.columns.tolist()
has_other = "other" in cols
has_unknown = "unknown" in cols
if has_other or has_unknown:
df = df.copy()
df["unresolved"] = df.get("other", 0.0) + df.get("unknown", 0.0)
drop_cols = [c for c in ("other", "unknown") if c in df.columns]
df = df.drop(columns=drop_cols)
return df
def plot_strain_scores(strain_scores_path: Path, out_path: Path, cmap: str, vmin: float, vmax: float, figsize: str | None, merge_unresolved: bool):
df = pd.read_csv(strain_scores_path, sep="\t")
base_cols = {"Strain", "TopOrder", "TopScore"}
orders = [c for c in df.columns if c not in base_cols]
mat = df.set_index("Strain")[orders]
mat = _merge_unresolved_columns(mat, merge_unresolved)
orders = list(mat.columns)
# dynamic figure size if not provided
if figsize:
size = _parse_figsize(figsize)
else:
size = (max(8, 0.6 * len(orders) + 3), max(4, 0.35 * len(mat) + 2))
plt.figure(figsize=size)
if _HAS_SNS:
ax = sns.heatmap(mat, cmap=cmap, vmin=vmin, vmax=vmax, cbar_kws={"label": "score"})
else:
ax = plt.gca()
im = ax.imshow(mat.values, aspect="auto", cmap=cmap, vmin=vmin, vmax=vmax)
plt.colorbar(im, ax=ax, label="score")
ax.set_xticks(range(len(orders)))
ax.set_xticklabels(orders, rotation=45, ha="right")
ax.set_yticks(range(len(mat.index)))
ax.set_yticklabels(mat.index)
ax.set_title("Strain vs Target Orders (Shotter)")
plt.tight_layout()
out_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(out_path, dpi=200)
plt.close()
def plot_per_hit_for_strain(toxin_support_path: Path, strain: str, out_path: Path, cmap: str, vmin: float, vmax: float, figsize: str | None, merge_unresolved: bool):
hits = pd.read_csv(toxin_support_path, sep="\t")
sub = hits[hits["Strain"].astype(str).eq(strain)].copy()
if sub.empty:
print(f"[plot_shotter] No hits for strain: {strain}")
return
base_cols = {
"Strain","Protein_id","Hit_id","Identity","Aln_length","Hit_length","Coverage",
"HMM","Family","NameKey","PartnerFulfilled","Weight","TopOrder","TopScore"
}
orders = [c for c in sub.columns if c not in base_cols]
mat = sub.set_index("Hit_id")[orders]
mat = _merge_unresolved_columns(mat, merge_unresolved)
orders = list(mat.columns)
# dynamic figure size if not provided
if figsize:
size = _parse_figsize(figsize)
else:
size = (max(8, 0.6 * len(orders) + 3), max(4, 0.35 * len(mat) + 2))
plt.figure(figsize=size)
if _HAS_SNS:
ax = sns.heatmap(mat, cmap=cmap, vmin=vmin, vmax=vmax, cbar_kws={"label": "contrib"})
else:
ax = plt.gca()
im = ax.imshow(mat.values, aspect="auto", cmap=cmap, vmin=vmin, vmax=vmax)
plt.colorbar(im, ax=ax, label="contrib")
ax.set_xticks(range(len(orders)))
ax.set_xticklabels(orders, rotation=45, ha="right")
ax.set_yticks(range(len(mat.index)))
ax.set_yticklabels(mat.index)
ax.set_title(f"Per-hit contributions for {strain}")
plt.tight_layout()
out_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(out_path, dpi=200)
plt.close()
def plot_species_scores(species_scores_path: Path, out_path: Path, cmap: str, vmin: float, vmax: float, figsize: str | None):
df = pd.read_csv(species_scores_path, sep="\t")
species_cols = [c for c in df.columns if c not in ("Strain", "TopSpecies", "TopSpeciesScore")]
if not species_cols:
print("[plot_shotter] No species columns found; skip species heatmap")
return
mat = df.set_index("Strain")[species_cols]
if figsize:
size = _parse_figsize(figsize)
else:
size = (max(8, 0.6 * len(species_cols) + 3), max(4, 0.35 * len(mat) + 2))
plt.figure(figsize=size)
if _HAS_SNS:
ax = sns.heatmap(mat, cmap=cmap, vmin=vmin, vmax=vmax, cbar_kws={"label": "score"})
else:
ax = plt.gca()
im = ax.imshow(mat.values, aspect="auto", cmap=cmap, vmin=vmin, vmax=vmax)
plt.colorbar(im, ax=ax, label="score")
ax.set_xticks(range(len(species_cols)))
ax.set_xticklabels(species_cols, rotation=45, ha="right")
ax.set_yticks(range(len(mat.index)))
ax.set_yticklabels(mat.index)
ax.set_title("Strain vs Target Species (Shotter)")
plt.tight_layout()
out_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(out_path, dpi=200)
plt.close()
def _percent(x: float) -> str:
try:
return f"{100.0*float(x):.1f}%"
except Exception:
return "NA"
def _fmtf(x, nd=3):
try:
return f"{float(x):.{nd}f}"
except Exception:
return "NA"
def _labels(lang: str):
zh = {
"title": "Bttoxin Shotter 论文式报告",
"summary": "# Bttoxin Shotter Summary",
"params": "## 参数",
"abstract": "## 摘要",
"methods": "## 数据与方法",
"results": "## 结果",
"discussion": "## 讨论与局限",
"appendix": "## 附录",
"top_targets": "## 各菌株 TopOrder/TopScore",
"heat_orders": "## 热图:菌株 × 目标目",
"heat_species": "## 热图:菌株 × 目标物种",
"perhit": "## 命中 TopOrder/TopScore前50",
"notes": "## 说明",
"unresolved_ratio": "未解析贡献other+unknown整体占比",
"order_distribution": "整体目标目分布按合成分数求和排名前5",
"hits_stats": "命中统计",
"n_strains": "菌株数",
"n_hits": "命中条数",
"avg_weight": "平均权重",
"table_header_strain": "Strain | TopOrder | TopScore",
"table_header_hit": "Hit_id | Family | TopOrder | TopScore | Weight",
"table_sep": "--- | --- | ---",
"table_sep_hit": "--- | --- | --- | --- | ---",
"methods_text": (
"- 输入BtToxin_Digger 的 All_Toxins.txtBPPRC 特异性 CSVactivity=Yes\n"
"- 特异性索引:按 name→亚家族→家族聚合target_order/target_species 以加权和归一构成分布。\n"
"- 相似性权重 w_hit若 identity≥0.78 且 coverage≥0.8 则 base=1若 0.45≤identity<0.78 则线性插值;否则 0。"
"最终 w=min(1, base×coverage + 0.1×I(HMM))). 若需配对而未满足w×=0.2。\n"
"- 单命中贡献c(order)=w_hit×P(order|·)。\n"
"- 菌株层合成noisy-ORscore(order)=1-∏(1-c_i(order))。\n"
"- 物种维度:同上(若 CSV 含 target_species\n"
"- 分类含义other=家族可解析但无证据unknown=家族不可解析。\n"
),
"discussion_text": (
"- unresolved 偏高常见于:命名异常、家族稀缺、索引覆盖不足。\n"
"- 可用更严格阈值或 --require_index_hit 减少伪阳性;也可 --merge_unresolved 合并展示。\n"
"- 名称差异Digger 与 BPPRC 均含 Cry 之外家族Tpp/Spp/Vip 等),采用 name→亚家族→家族回退映射。\n"
),
"notes_items": [
"TopOrder/TopScore最高目标目及其分数命中或菌株",
"contrib单命中对某目标目的贡献 w_hit×P(order|·)。",
"unresolved=other+unknown仅为可视化合并开关不改变计算。",
],
}
en = {
"title": "Bttoxin Shotter Paper-style Report",
"summary": "# Bttoxin Shotter Summary",
"params": "## Parameters",
"abstract": "## Abstract",
"methods": "## Data and Methods",
"results": "## Results",
"discussion": "## Discussion and Limitations",
"appendix": "## Appendix",
"top_targets": "## Top targets per strain",
"heat_orders": "## Heatmap: Strain × Orders",
"heat_species": "## Heatmap: Strain × Species",
"perhit": "## Per-hit TopOrder/TopScore (Top 50)",
"notes": "## Notes",
"unresolved_ratio": "Overall unresolved (other+unknown) fraction",
"order_distribution": "Overall order distribution (top 5 by summed score):",
"hits_stats": "Hit statistics",
"n_strains": "#Strains",
"n_hits": "#Hits",
"avg_weight": "Avg weight",
"table_header_strain": "Strain | TopOrder | TopScore",
"table_header_hit": "Hit_id | Family | TopOrder | TopScore | Weight",
"table_sep": "--- | --- | ---",
"table_sep_hit": "--- | --- | --- | --- | ---",
"methods_text": (
"- Inputs: All_Toxins.txt from BtToxin_Digger; BPPRC specificity CSV (activity=Yes).\n"
"- Specificity index: aggregate name→subfamily→family; build P(order/species|·) by weighted sums and normalization.\n"
"- Similarity weight w_hit: base=1 if identity≥0.78 & coverage≥0.8; linear if 0.45≤identity<0.78; else 0.\n"
" Final w=min(1, base×coverage + 0.1×I(HMM))). Partner missing → w×=0.2.\n"
"- Per-hit contribution: c(order)=w_hit×P(order|·).\n"
"- Strain level: noisy-OR, score(order)=1-∏(1-c_i(order)).\n"
"- Species dimension: same if target_species present.\n"
"- Buckets: other=parseable family but no evidence; unknown=unparseable family.\n"
),
"discussion_text": (
"- High unresolved is common with unusual names, rare families, or limited index coverage.\n"
"- Use stricter thresholds or --require_index_hit to reduce false positives; --merge_unresolved for visualization only.\n"
"- Naming: both Digger and BPPRC include non-Cry families (Tpp/Spp/Vip etc.), mapped via name→subfamily→family backoff.\n"
),
"notes_items": [
"TopOrder/TopScore: max target order and score (per hit or strain).",
"contrib: per-hit contribution w_hit×P(order|·).",
"unresolved=other+unknown merge is cosmetic; computation unchanged.",
],
}
return zh if lang == "zh" else en
def write_summary_md(
out_path: Path,
strain_scores_path: Path,
toxin_support_path: Optional[Path],
species_scores_path: Optional[Path],
strain_heatmap_path: Optional[Path],
per_hit_heatmap_path: Optional[Path],
species_heatmap_path: Optional[Path],
merge_unresolved: bool,
args_namespace,
):
labels = _labels(getattr(args_namespace, "lang", "zh"))
lines: List[str] = []
lines.append(labels["summary"])
lines.append("")
# Parameters
lines.append(labels["params"])
lines.append(f"- allow_unknown_families: {getattr(args_namespace, 'allow_unknown_families', 'NA')}")
lines.append(f"- require_index_hit: {getattr(args_namespace, 'require_index_hit', 'NA')}")
lines.append(f"- min_identity: {getattr(args_namespace, 'min_identity', 'NA')}")
lines.append(f"- min_coverage: {getattr(args_namespace, 'min_coverage', 'NA')}")
lines.append(f"- merge_unresolved: {merge_unresolved}")
lines.append("")
# Top per strain table
try:
df = pd.read_csv(strain_scores_path, sep="\t")
cols = [c for c in df.columns if c not in ("Strain", "TopOrder", "TopScore")]
unresolved_cols = [c for c in ("other", "unknown") if c in cols]
unresolved_total = df[unresolved_cols].sum(axis=1).sum() if unresolved_cols else 0.0
total_sum = df[cols].sum(axis=1).sum() if cols else 0.0
frac_unresolved = (unresolved_total / total_sum) if total_sum > 0 else 0.0
lines.append(labels["top_targets"])
lines.append("")
lines.append(labels["table_header_strain"])
lines.append(labels["table_sep"])
for _, r in df.iterrows():
lines.append(f"{r['Strain']} | {r.get('TopOrder','')} | {float(r.get('TopScore',0.0)):.3f}")
lines.append("")
lines.append(f"{labels['unresolved_ratio']}: {_percent(frac_unresolved)}")
lines.append("")
except Exception as e:
lines.append(f"[warn] Failed to read strain scores: {e}")
if strain_heatmap_path and strain_heatmap_path.exists():
lines.append(labels["heat_orders"])
lines.append(f"![]({strain_heatmap_path.name})")
lines.append("")
if species_scores_path and species_scores_path.exists():
lines.append(labels["heat_species"])
if species_heatmap_path and species_heatmap_path.exists():
lines.append(f"![]({species_heatmap_path.name})")
else:
lines.append("Species heatmap not generated.")
lines.append("")
# Per-hit summary
if toxin_support_path and toxin_support_path.exists():
try:
hits = pd.read_csv(toxin_support_path, sep="\t")
lines.append(labels["perhit"])
lines.append("")
lines.append(labels["table_header_hit"])
lines.append(labels["table_sep_hit"])
for _, r in hits.sort_values("TopScore", ascending=False).head(50).iterrows():
lines.append(
f"{r['Hit_id']} | {r.get('Family','')} | {r.get('TopOrder','')} | {float(r.get('TopScore',0.0)):.3f} | {float(r.get('Weight',0.0)):.3f}"
)
lines.append("")
except Exception as e:
lines.append(f"[warn] Failed to read toxin support: {e}")
# FAQs
lines.append(labels["notes"])
for it in _labels(getattr(args_namespace, "lang", "zh"))["notes_items"]:
lines.append(f"- {it}")
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_text("\n".join(lines), encoding="utf-8")
def write_report_md(
out_path: Path,
mode: str,
lang: str,
strain_scores_path: Path,
toxin_support_path: Optional[Path],
species_scores_path: Optional[Path],
strain_heatmap_path: Optional[Path],
per_hit_heatmap_path: Optional[Path],
species_heatmap_path: Optional[Path],
merge_unresolved: bool,
args_namespace,
):
if mode == "summary":
return write_summary_md(
out_path,
strain_scores_path,
toxin_support_path,
species_scores_path,
strain_heatmap_path,
per_hit_heatmap_path,
species_heatmap_path,
merge_unresolved,
args_namespace,
)
L = _labels(lang)
lines: List[str] = []
lines.append(L["title"]) # title
lines.append("")
# Abstract: brief auto-summary
try:
df = pd.read_csv(strain_scores_path, sep="\t")
base_cols = {"Strain", "TopOrder", "TopScore"}
order_cols = [c for c in df.columns if c not in base_cols]
orders_only = df[order_cols]
unresolved_cols = [c for c in ("other", "unknown") if c in order_cols]
frac_unres = 0.0
if order_cols:
tot = orders_only.sum(axis=1).sum()
unres = df[unresolved_cols].sum(axis=1).sum() if unresolved_cols else 0.0
frac_unres = (unres / tot) if tot > 0 else 0.0
top_global = []
if order_cols:
sums = orders_only.sum(axis=0).sort_values(ascending=False)
for k, v in sums.head(5).items():
if k in ("other", "unknown") and merge_unresolved:
continue
top_global.append(f"{k}:{_fmtf(v)}")
n_strains = len(df)
except Exception:
n_strains, frac_unres, top_global = 0, 0.0, []
n_hits, avg_w = 0, 0.0
try:
if toxin_support_path and toxin_support_path.exists():
hits = pd.read_csv(toxin_support_path, sep="\t")
n_hits = len(hits)
if "Weight" in hits.columns and len(hits) > 0:
avg_w = float(hits["Weight"].mean())
except Exception:
pass
lines.append(L["abstract"])
lines.append(
f"- {L['n_strains']}: {n_strains}; {L['n_hits']}: {n_hits}; {L['avg_weight']}: {_fmtf(avg_w)}."
)
if top_global:
lines.append(f"- {L['order_distribution']} " + ", ".join(top_global))
lines.append(f"- {L['unresolved_ratio']}: {_percent(frac_unres)}")
lines.append("")
# Methods
lines.append(L["methods"])
lines.append(L["methods_text"])
# Results
lines.append(L["results"])
try:
df = pd.read_csv(strain_scores_path, sep="\t")
lines.append(L["top_targets"])
lines.append("")
lines.append(L["table_header_strain"])
lines.append(L["table_sep"])
for _, r in df.iterrows():
lines.append(f"{r['Strain']} | {r.get('TopOrder','')} | {_fmtf(r.get('TopScore',0.0))}")
lines.append("")
except Exception as e:
lines.append(f"[warn] Failed to read strain scores: {e}")
if strain_heatmap_path and strain_heatmap_path.exists():
lines.append(L["heat_orders"])
lines.append(f"![]({strain_heatmap_path.name})")
lines.append("")
if species_heatmap_path and species_heatmap_path.exists():
lines.append(L["heat_species"])
lines.append(f"![]({species_heatmap_path.name})")
lines.append("")
# Discussion
lines.append(L["discussion"])
lines.append(L["discussion_text"])
# Params
lines.append(L["params"])
lines.append(
f"- allow_unknown_families: {getattr(args_namespace, 'allow_unknown_families', 'NA')}\n"
f"- require_index_hit: {getattr(args_namespace, 'require_index_hit', 'NA')}\n"
f"- min_identity: {getattr(args_namespace, 'min_identity', 'NA')}\n"
f"- min_coverage: {getattr(args_namespace, 'min_coverage', 'NA')}\n"
f"- merge_unresolved: {merge_unresolved}"
)
lines.append("")
# Appendix
if toxin_support_path and toxin_support_path.exists():
try:
hits = pd.read_csv(toxin_support_path, sep="\t")
lines.append(L["appendix"])
lines.append(L["perhit"])
lines.append("")
lines.append(L["table_header_hit"])
lines.append(L["table_sep_hit"])
for _, r in hits.sort_values("TopScore", ascending=False).head(50).iterrows():
lines.append(
f"{r['Hit_id']} | {r.get('Family','')} | {r.get('TopOrder','')} | {_fmtf(r.get('TopScore',0.0))} | {_fmtf(r.get('Weight',0.0))}"
)
lines.append("")
except Exception as e:
lines.append(f"[warn] Failed to read toxin support: {e}")
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_text("\n".join(lines), encoding="utf-8")
def main():
ap = argparse.ArgumentParser(description="Plot heatmaps from Bttoxin_Shoter outputs")
ap.add_argument("--strain_scores", type=Path, default=Path("shotter_outputs/strain_target_scores.tsv"))
ap.add_argument("--toxin_support", type=Path, default=None)
ap.add_argument("--species_scores", type=Path, default=None)
ap.add_argument("--out_dir", type=Path, default=Path("shotter_outputs"))
ap.add_argument("--cmap", type=str, default="viridis")
ap.add_argument("--vmin", type=float, default=0.0)
ap.add_argument("--vmax", type=float, default=1.0)
ap.add_argument("--figsize", type=str, default=None, help="e.g. 12x6")
ap.add_argument("--per_hit_strain", type=str, default=None, help="If provided, also draw per-hit heatmap for this strain (requires --toxin_support)")
ap.add_argument("--merge_unresolved", action="store_true", default=False, help="Merge columns 'other' and 'unknown' into 'unresolved' for plots")
ap.add_argument("--summary_md", type=Path, default=None, help="Write a Markdown report to this path")
ap.add_argument("--report_mode", type=str, choices=["summary", "paper"], default="paper", help="Report template style")
ap.add_argument("--lang", type=str, choices=["zh", "en"], default="zh", help="Report language")
args = ap.parse_args()
args.out_dir.mkdir(parents=True, exist_ok=True)
# Strain × Orders heatmap
out1 = args.out_dir / "strain_target_scores.png"
plot_strain_scores(args.strain_scores, out1, args.cmap, args.vmin, args.vmax, args.figsize, args.merge_unresolved)
print(f"Saved: {out1}")
# Optional per-hit heatmap
if args.per_hit_strain and args.toxin_support:
out2 = args.out_dir / f"per_hit_{args.per_hit_strain}.png"
plot_per_hit_for_strain(args.toxin_support, args.per_hit_strain, out2, args.cmap, args.vmin, args.vmax, args.figsize, args.merge_unresolved)
print(f"Saved: {out2}")
# Optional species heatmap
species_png: Optional[Path] = None
if args.species_scores and args.species_scores.exists():
species_png = args.out_dir / "strain_target_species_scores.png"
plot_species_scores(args.species_scores, species_png, args.cmap, args.vmin, args.vmax, args.figsize)
print(f"Saved: {species_png}")
# Report
default_name = "shotter_report_paper.md" if args.report_mode == "paper" else "shotter_summary.md"
summary_md = args.summary_md or (args.out_dir / default_name)
write_report_md(
out_path=summary_md,
mode=args.report_mode,
lang=args.lang,
strain_scores_path=args.strain_scores,
toxin_support_path=args.toxin_support,
species_scores_path=args.species_scores,
strain_heatmap_path=out1,
per_hit_heatmap_path=(args.out_dir / f"per_hit_{args.per_hit_strain}.png") if (args.per_hit_strain and args.toxin_support) else None,
species_heatmap_path=species_png,
merge_unresolved=args.merge_unresolved,
args_namespace=args,
)
print(f"Saved: {summary_md}")
if __name__ == "__main__":
main()