- 新增 scripts/bttoxin_shoter.py:从 BPPRC 正样本 CSV 构建 name/亚家族/家族特异性索引, 解析 BtToxin_Digger All_Toxins.txt,计算 per-hit 权重并以 noisy-OR 合成菌株×目标目/物种分数, 输出 TSV/JSON;含 HMM 加成与配对毒素规则(Vip1/Vip2,Vpa/Vpb),other/unknown 桶。 - 新增端到端工具链: - scripts/run_single_fna_pipeline.py:Digger → Shotter → Plot → 打包 - scripts/plot_shotter.py:绘制热图并生成论文式/摘要式报告 - scripts/bttoxin_api.py 与 bttoxin/api.py:纯 Python API;bttoxin/cli.py 暴露 bttoxin-run - pyproject.toml:项目打包与 CLI 入口 - docs(README): 增加输入文件格式与结果解读,补充单目录写入方案 - chore(gitignore): 忽略 runs/ 与 tests/output - ci: 移除 .woodpecker/test.yml
237 lines
8.0 KiB
Python
237 lines
8.0 KiB
Python
#!/usr/bin/env python3
|
|
"""Run a single-fna BtToxin_Digger -> Shotter -> Plots pipeline.
|
|
|
|
- Input: one .fna file (nucleotide scaffold)
|
|
- Steps:
|
|
1) Stage this single file, run BtToxin_Digger via DockerContainerManager
|
|
2) Run Shotter scoring on Digger's All_Toxins.txt
|
|
3) Render heatmaps + paper-style report
|
|
4) Organize outputs under one root folder:
|
|
<out_root>/
|
|
├─ digger/ (container outputs)
|
|
├─ shotter/ (Shotter TSV/JSON + plots + report)
|
|
└─ pipeline_results.tar.gz (bundle)
|
|
|
|
Notes
|
|
- Digger is executed in a container (root in container); files may be owned by root on host.
|
|
We write everything into <out_root>/digger to keep permissions/locality predictable.
|
|
- This script exposes CLI flags for Shotter filters to allow strict/loose runs.
|
|
|
|
Example
|
|
python scripts/run_single_fna_pipeline.py \
|
|
--fna tests/test_data/C15.fna \
|
|
--toxicity_csv Data/toxicity-data.csv \
|
|
--out_root runs/C15_run \
|
|
--min_identity 0.50 --min_coverage 0.60 \
|
|
--disallow_unknown_families --require_index_hit --lang zh
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import os
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
import tarfile
|
|
from pathlib import Path
|
|
from typing import Dict, Any
|
|
|
|
# import DockerContainerManager from backend
|
|
sys.path.append(str(Path(__file__).resolve().parents[1] / "backend"))
|
|
from app.utils.docker_client import DockerContainerManager # type: ignore
|
|
|
|
|
|
def _shell(cmd: list[str]) -> subprocess.CompletedProcess:
    """Execute *cmd* as a child process and hand back its completed result.

    The command is given as an argument list (no shell is involved) and its
    output streams are not captured here. A non-zero exit status does not
    raise; callers inspect ``returncode`` themselves.
    """
    completed = subprocess.run(cmd, text=True)
    return completed
|
|
|
|
|
|
def _read_first_strain(strain_scores_tsv: Path) -> str:
    """Return the ``Strain`` value of the first data row of a score TSV.

    The first line of the file is treated as a tab-separated header. The
    column named ``Strain`` is located there, and its value is taken from the
    first following non-blank line that has enough fields.

    This is a best-effort lookup: an unreadable file, a header without a
    ``Strain`` column, or a file without usable data rows yields ``""``
    instead of raising.

    Args:
        strain_scores_tsv: Path of the TSV file to inspect.

    Returns:
        The strain field exactly as written in the file, or ``""`` when it
        cannot be determined.
    """
    try:
        with strain_scores_tsv.open("r", encoding="utf-8") as f:
            # Remove only the line terminator before splitting. A blanket
            # strip() would also eat a leading tab (an empty first header
            # cell, e.g. an unnamed index column) and shift every index.
            header_line = f.readline().rstrip("\r\n")
            header = [cell.strip() for cell in header_line.split("\t")]
            idx_strain = header.index("Strain")
            # First non-empty line after the header is the row of interest.
            for line in f:
                if not line.strip():
                    continue
                parts = line.rstrip("\n").split("\t")
                if len(parts) > idx_strain:
                    return parts[idx_strain]
    except (OSError, ValueError):
        # OSError: file missing or unreadable.
        # ValueError: no "Strain" column, or undecodable bytes
        # (UnicodeDecodeError is a ValueError subclass).
        pass
    return ""
|
|
|
|
|
|
def run_single_fna_pipeline(
    fna_path: Path,
    out_root: Path,
    toxicity_csv: Path = Path("Data/toxicity-data.csv"),
    image: str = "quay.io/biocontainers/bttoxin_digger:1.0.10--hdfd78af_0",
    platform: str = "linux/amd64",
    min_identity: float = 0.0,
    min_coverage: float = 0.0,
    allow_unknown_families: bool = True,
    require_index_hit: bool = False,
    lang: str = "zh",
) -> Dict[str, Any]:
    """Run Digger -> Shotter -> plots for one ``.fna`` file and bundle the results.

    Steps:
      1. Copy the input into ``<out_root>/stage`` and run BtToxin_Digger on
         that directory through ``DockerContainerManager.run_bttoxin_digger``.
      2. Run ``bttoxin_shoter.py`` (next to this script) on Digger's
         ``Results/Toxins/All_Toxins.txt``.
      3. Run ``plot_shotter.py`` (next to this script) on the Shotter tables.
         This step is optional: a failure is reported but does not abort.
      4. Pack ``digger/`` and ``shotter/`` into
         ``<out_root>/pipeline_results.tar.gz``.

    Args:
        fna_path: Input nucleotide FASTA file.
        out_root: Root directory for all outputs; created if missing.
        toxicity_csv: CSV passed to Shotter as ``--toxicity_csv``.
        image: Container image handed to ``DockerContainerManager``.
        platform: Container platform handed to ``DockerContainerManager``.
        min_identity: Forwarded as ``--min_identity`` only when greater than 0.
        min_coverage: Forwarded as ``--min_coverage`` only when greater than 0.
        allow_unknown_families: When false, ``--disallow_unknown_families``
            is passed to Shotter.
        require_index_hit: When true, ``--require_index_hit`` is passed to
            Shotter.
        lang: Forwarded to the plot script as ``--lang``.

    Returns:
        On success: ``{"ok": True, "digger_dir", "shotter_dir", "bundle",
        "all_toxins", "strain", "plot_ok"}``, where ``plot_ok`` tells whether
        the optional plot/report step exited with status 0.
        On failure: ``{"ok": False, "stage", "error"}``, plus ``"logs"`` for
        a Digger run failure and ``"exit_code"`` for a Shotter failure.

    Raises:
        Exceptions raised while creating directories, copying the input
        (for example when ``fna_path`` does not exist) or writing the bundle
        are not caught here and propagate to the caller.
    """
    fna_path = fna_path.resolve()
    out_root = out_root.resolve()
    out_root.mkdir(parents=True, exist_ok=True)

    digger_dir = out_root / "digger"
    shotter_dir = out_root / "shotter"
    logs_dir = out_root / "logs"
    stage_dir = out_root / "stage"
    for d in (digger_dir, shotter_dir, logs_dir, stage_dir):
        d.mkdir(parents=True, exist_ok=True)

    # Stage single input file
    staged_fna = stage_dir / fna_path.name
    shutil.copy2(fna_path, staged_fna)

    # 1) Run BtToxin_Digger via DockerContainerManager
    mgr = DockerContainerManager(image=image, platform=platform)
    result = mgr.run_bttoxin_digger(
        input_dir=stage_dir,
        output_dir=digger_dir,
        log_dir=logs_dir,
        sequence_type="nucl",
        scaf_suffix=fna_path.suffix or ".fna",
        threads=4,
    )
    if not result.get("success"):
        log_file = logs_dir / "digger_execution.log"
        return {
            "ok": False,
            "stage": "digger",
            "error": result.get("error") or f"Digger failed (exit={result.get('exit_code')})",
            # errors="replace": a stray non-UTF-8 byte in the log must not
            # turn the failure report itself into an exception.
            "logs": log_file.read_text(encoding="utf-8", errors="replace") if log_file.exists() else "",
        }

    toxins_dir = digger_dir / "Results" / "Toxins"
    all_toxins = toxins_dir / "All_Toxins.txt"
    if not all_toxins.exists():
        return {"ok": False, "stage": "digger", "error": f"Missing All_Toxins.txt at {all_toxins}"}

    # 2) Run Shotter scoring (shotter_dir already exists, see above)
    py = sys.executable
    scripts_dir = Path(__file__).resolve().parent
    shoter_cmd: list[str] = [
        py,
        str(scripts_dir / "bttoxin_shoter.py"),
        "--toxicity_csv",
        str(toxicity_csv),
        "--all_toxins",
        str(all_toxins),
        "--output_dir",
        str(shotter_dir),
    ]
    # The truthiness check keeps None/0 from being forwarded as a filter.
    if min_identity and min_identity > 0:
        shoter_cmd += ["--min_identity", str(min_identity)]
    if min_coverage and min_coverage > 0:
        shoter_cmd += ["--min_coverage", str(min_coverage)]
    if not allow_unknown_families:
        shoter_cmd += ["--disallow_unknown_families"]
    if require_index_hit:
        shoter_cmd += ["--require_index_hit"]

    r1 = _shell(shoter_cmd)
    if r1.returncode != 0:
        return {
            "ok": False,
            "stage": "shotter",
            "error": f"Shotter failed: {' '.join(shoter_cmd)}",
            "exit_code": r1.returncode,
        }

    strain_scores = shotter_dir / "strain_target_scores.tsv"
    toxin_support = shotter_dir / "toxin_support.tsv"
    species_scores = shotter_dir / "strain_target_species_scores.tsv"

    # 3) Plot & report
    strain_for_plot = _read_first_strain(strain_scores)
    plot_cmd: list[str] = [
        py,
        str(scripts_dir / "plot_shotter.py"),
        "--strain_scores",
        str(strain_scores),
        "--toxin_support",
        str(toxin_support),
        "--species_scores",
        str(species_scores),
        "--out_dir",
        str(shotter_dir),
        "--merge_unresolved",
        "--report_mode",
        "paper",
        "--lang",
        lang,
    ]
    if strain_for_plot:
        plot_cmd += ["--per_hit_strain", strain_for_plot]

    r2 = _shell(plot_cmd)
    plot_ok = r2.returncode == 0
    if not plot_ok:
        # Plotting/report is optional, so keep going -- but say so instead of
        # silently producing a bundle without plots.
        print(
            f"[pipeline] WARNING: plot/report step failed (exit={r2.returncode}); continuing",
            file=sys.stderr,
        )

    # 4) Bundle
    bundle = out_root / "pipeline_results.tar.gz"
    with tarfile.open(bundle, "w:gz") as tar:
        tar.add(digger_dir, arcname="digger")
        tar.add(shotter_dir, arcname="shotter")

    return {
        "ok": True,
        "digger_dir": str(digger_dir),
        "shotter_dir": str(shotter_dir),
        "bundle": str(bundle),
        "all_toxins": str(all_toxins),
        "strain": strain_for_plot,
        "plot_ok": plot_ok,
    }
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """Parse command-line flags, run the pipeline and print a short summary.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, i.e. the normal command-line behaviour.

    Returns:
        ``0`` when the pipeline reports success, ``1`` otherwise. Invalid
        arguments are handled by argparse, which raises ``SystemExit``.
    """
    ap = argparse.ArgumentParser(description="Run single-fna Digger -> Shotter pipeline")
    ap.add_argument("--fna", type=Path, required=True, help="Path to a single .fna file")
    ap.add_argument("--toxicity_csv", type=Path, default=Path("Data/toxicity-data.csv"))
    # None means "not given". Comparing against a default path string breaks
    # on platforms with a different path separator and would also override a
    # user who explicitly asks for that very directory.
    ap.add_argument("--out_root", type=Path, default=None)
    ap.add_argument("--image", type=str, default="quay.io/biocontainers/bttoxin_digger:1.0.10--hdfd78af_0")
    ap.add_argument("--platform", type=str, default="linux/amd64")
    ap.add_argument("--min_identity", type=float, default=0.0)
    ap.add_argument("--min_coverage", type=float, default=0.0)
    ap.add_argument("--disallow_unknown_families", action="store_true", default=False)
    ap.add_argument("--require_index_hit", action="store_true", default=False)
    ap.add_argument("--lang", type=str, choices=["zh", "en"], default="zh")
    args = ap.parse_args(argv)

    # derive per-run default out_root using file stem
    if args.out_root is None:
        args.out_root = Path("runs") / f"{args.fna.stem}_run"

    res = run_single_fna_pipeline(
        fna_path=args.fna,
        out_root=args.out_root,
        toxicity_csv=args.toxicity_csv,
        image=args.image,
        platform=args.platform,
        min_identity=args.min_identity,
        min_coverage=args.min_coverage,
        # The CLI flag is the negation of the function parameter.
        allow_unknown_families=not args.disallow_unknown_families,
        require_index_hit=args.require_index_hit,
        lang=args.lang,
    )

    if not res.get("ok"):
        print(f"[pipeline] FAILED at stage={res.get('stage')}: {res.get('error')}")
        logs = res.get("logs")
        if logs:
            # Only the first 2000 characters of the log are echoed.
            print(logs[:2000])
        return 1

    print("[pipeline] ✓ Done")
    print(f" Digger: {res['digger_dir']}")
    print(f" Shotter: {res['shotter_dir']}")
    print(f" Bundle: {res['bundle']}")
    print(f" Strain: {res.get('strain','')}")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Executed as a script: main()'s integer result becomes the exit status.
    sys.exit(main())
|