Files
bttoxin-pipeline/bttoxin/api.py
hotwa 5883e13c56 feat(shotter): 实现 Shotter v1 活性评估与单 FNA 流程,新增 API/CLI/绘图与报告
- 新增 scripts/bttoxin_shoter.py:从 BPPRC 正样本 CSV 构建 name/亚家族/家族特异性索引,
  解析 BtToxin_Digger All_Toxins.txt,计算 per-hit 权重并以 noisy-OR 合成菌株×目标目/物种分数,
  输出 TSV/JSON;含 HMM 加成与配对毒素规则(Vip1/Vip2,Vpa/Vpb),other/unknown 桶。
- 新增端到端工具链:
  - scripts/run_single_fna_pipeline.py:Digger → Shotter → Plot → 打包
  - scripts/plot_shotter.py:绘制热图并生成论文式/摘要式报告
  - scripts/bttoxin_api.py 与 bttoxin/api.py:纯 Python API;bttoxin/cli.py 暴露 bttoxin-run
  - pyproject.toml:项目打包与 CLI 入口
- docs(README): 增加输入文件格式与结果解读,补充单目录写入方案
- chore(gitignore): 忽略 runs/ 与 tests/output
- ci: 移除 .woodpecker/test.yml
2025-12-01 10:11:26 +08:00

300 lines
12 KiB
Python

#!/usr/bin/env python3
from __future__ import annotations
import logging
import os
import shutil
import tarfile
from pathlib import Path
from types import SimpleNamespace
from typing import Dict, Any, Optional
import sys as _sys
# Make the sibling ``backend/`` and ``scripts/`` directories importable.
# They are located relative to this file (one level above its directory) and
# appended to ``sys.path`` only when not already listed, so repeated imports of
# this module do not grow the path.
_REPO_ROOT = Path(__file__).resolve().parents[1]
_BACKEND_DIR = _REPO_ROOT.joinpath("backend")
_SCRIPTS_DIR = _REPO_ROOT.joinpath("scripts")
for _p in map(str, (_BACKEND_DIR, _SCRIPTS_DIR)):
    if _p in _sys.path:
        continue
    _sys.path.append(_p)
# Import DockerContainerManager from backend
from app.utils.docker_client import DockerContainerManager # type: ignore
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
def _lazy_import_shoter():
    """Import and return the ``bttoxin_shoter`` module at call time.

    The import statement lives inside the function, so merely importing this
    file does not require ``bttoxin_shoter`` to be importable.

    Returns:
        The imported ``bttoxin_shoter`` module object.

    Raises:
        ImportError: If importing the module fails for any reason. The
            original exception is attached as ``__cause__``.
    """
    try:
        import bttoxin_shoter as shoter  # type: ignore
    except Exception as e:
        # Importing a script can fail with errors other than ImportError
        # (anything raised by its top-level code); all of them are reported
        # uniformly, with the root cause chained for debugging.
        raise ImportError(
            f"Failed to import bttoxin_shoter from {_SCRIPTS_DIR}. Ensure repo is present in the image.\n{e}"
        ) from e
    return shoter
def _lazy_import_plotter():
    """Import and return the ``plot_shotter`` module at call time.

    The import statement lives inside the function, so merely importing this
    file does not require ``plot_shotter`` to be importable.

    Returns:
        The imported ``plot_shotter`` module object.

    Raises:
        ImportError: If importing the module fails for any reason. The
            original exception is attached as ``__cause__``.
    """
    try:
        import plot_shotter as plotter  # type: ignore
    except Exception as e:
        # Importing a script can fail with errors other than ImportError
        # (anything raised by its top-level code); all of them are reported
        # uniformly, with the root cause chained for debugging.
        raise ImportError(
            f"Failed to import plot_shotter from {_SCRIPTS_DIR}. Ensure repo is present in the image.\n{e}"
        ) from e
    return plotter
class BtToxinRunner:
    """Wrap BtToxin_Digger docker invocation for a single FNA.

    Each sample gets its own directory ``<base_workdir>/<sample>/`` with the
    sub-directories ``input/``, ``output/digger/`` and ``logs/``. The sample
    name is the input file name without its last suffix.
    """

    def __init__(
        self,
        image: str = "quay.io/biocontainers/bttoxin_digger:1.0.10--hdfd78af_0",
        platform: str = "linux/amd64",
        base_workdir: Path | str | None = None,
    ) -> None:
        """Create the working directory and the docker manager.

        Args:
            image: Docker image reference handed to ``DockerContainerManager``.
            platform: Platform string handed to ``DockerContainerManager``.
            base_workdir: Root directory for per-sample run folders. Defaults
                to ``<repo root>/runs/bttoxin``. Created if missing.
        """
        self.image = image
        self.platform = platform
        if base_workdir is None:
            base_workdir = _REPO_ROOT / "runs" / "bttoxin"
        # Coerce so that plain strings work as well as Path objects.
        self.base_workdir = Path(base_workdir)
        self.base_workdir.mkdir(parents=True, exist_ok=True)
        self.mgr = DockerContainerManager(image=self.image, platform=self.platform)

    @staticmethod
    def _stage_input(fna_path: Path, target: Path) -> None:
        """Place ``fna_path`` at ``target`` via hard link, falling back to a copy."""
        source = fna_path.resolve()
        # If the target already is the source file (same path, or a hard link
        # left by an earlier run) there is nothing to do. Unlinking it first
        # would delete the input when the caller passed the staged file itself.
        if target.exists() and not target.is_symlink() and target.samefile(source):
            logger.info("FNA already staged: %s", target)
            return
        # Clear whatever occupies the target name, including a dangling
        # symlink (for which exists() is False).
        if target.exists() or target.is_symlink():
            target.unlink()
        try:
            os.link(source, target)
        except OSError:
            # Hard links fail e.g. across filesystems; copy instead.
            shutil.copy2(source, target)
            logger.info("Copied FNA: %s -> %s", fna_path, target)
        else:
            logger.info("Hard-linked FNA: %s -> %s", fna_path, target)

    def _prepare_layout(self, fna_path: Path) -> tuple[Path, Path, Path, Path, str]:
        """Create the per-sample directory layout and stage the input file.

        Args:
            fna_path: Path to the input sequence file.

        Returns:
            ``(input_dir, digger_out, log_dir, run_root, sample_name)``.

        Raises:
            FileNotFoundError: If ``fna_path`` does not exist.
        """
        if not fna_path.exists():
            raise FileNotFoundError(f"FNA file not found: {fna_path}")
        sample_name = fna_path.stem
        run_root = self.base_workdir / sample_name
        input_dir = run_root / "input"
        digger_out = run_root / "output" / "digger"
        log_dir = run_root / "logs"
        for d in (input_dir, digger_out, log_dir):
            d.mkdir(parents=True, exist_ok=True)
        self._stage_input(fna_path, input_dir / fna_path.name)
        return input_dir, digger_out, log_dir, run_root, sample_name

    def run_single_fna(self, fna_path: Path | str, sequence_type: str = "nucl", threads: int = 4) -> Dict[str, Any]:
        """Run BtToxin_Digger on one input file and describe the outputs.

        Args:
            fna_path: Path to the input sequence file.
            sequence_type: Forwarded to ``run_bttoxin_digger``.
            threads: Forwarded to ``run_bttoxin_digger``.

        Returns:
            A dict with the keys ``success``, ``sample``, ``run_root``,
            ``input_dir``, ``digger_out``, ``log_dir``, ``toxins_dir``,
            ``files`` (expected output paths under ``Results/Toxins``) and
            ``raw_result`` (the manager's return value). ``success`` is true
            only when the manager reports success *and* ``All_Toxins.txt``
            exists.

        Raises:
            FileNotFoundError: If ``fna_path`` does not exist.
        """
        fna_path = Path(fna_path)
        input_dir, digger_out, log_dir, run_root, sample_name = self._prepare_layout(fna_path)
        logger.info("Start BtToxin_Digger: %s (sample=%s)", fna_path, sample_name)
        result = self.mgr.run_bttoxin_digger(
            input_dir=input_dir,
            output_dir=digger_out,
            log_dir=log_dir,
            sequence_type=sequence_type,
            # Use the file's own suffix; ".fna" only when it has none.
            scaf_suffix=fna_path.suffix or ".fna",
            threads=threads,
        )
        toxins_dir = digger_out / "Results" / "Toxins"
        files = {
            "list": toxins_dir / f"{sample_name}.list",
            "gbk": toxins_dir / f"{sample_name}.gbk",
            "all_genes": toxins_dir / "Bt_all_genes.table",
            "all_toxins": toxins_dir / "All_Toxins.txt",
        }
        ok = bool(result.get("success")) and files["all_toxins"].exists()
        return {
            "success": ok,
            "sample": sample_name,
            "run_root": run_root,
            "input_dir": input_dir,
            "digger_out": digger_out,
            "log_dir": log_dir,
            "toxins_dir": toxins_dir,
            "files": files,
            "raw_result": result,
        }
class ShotterAPI:
    """Pure Python Shotter scoring and saving (no subprocess)."""

    def score(
        self,
        toxicity_csv: Path | str,
        all_toxins: Path | str,
        out_dir: Path | str,
        min_identity: float = 0.0,
        min_coverage: float = 0.0,
        allow_unknown_families: bool = True,
        require_index_hit: bool = False,
    ) -> Dict[str, Any]:
        """Score every strain found in ``all_toxins`` and write result tables.

        Args:
            toxicity_csv: CSV used to build ``SpecificityIndex``.
            all_toxins: File parsed by ``parse_all_toxins``.
            out_dir: Output directory; created if missing.
            min_identity: When > 0, keep only rows whose ``identity01`` is
                at least this value.
            min_coverage: When > 0, keep only rows whose ``coverage`` is at
                least this value.
            allow_unknown_families: When false, drop rows whose
                ``family_key`` is ``"unknown"``.
            require_index_hit: When true, keep only rows for which the index
                returns a non-empty result for ``Hit_id_norm`` or, failing
                that, for ``family_key``.

        Returns:
            A dict with ``orders`` and ``species`` (column name lists) and
            the output paths ``strain_scores``, ``toxin_support``,
            ``strain_scores_json``, ``species_scores`` and
            ``species_scores_json``. The two species paths are returned even
            when the species files were not written by this call (they are
            written only if there are species columns and species scores), so
            callers should check that they exist.

        Raises:
            ImportError: If ``bttoxin_shoter`` cannot be imported.
        """
        shoter = _lazy_import_shoter()
        # Accept plain strings as well as Path objects, like the pipeline
        # entry points in this module do.
        toxicity_csv = Path(toxicity_csv)
        all_toxins = Path(all_toxins)
        out_dir = Path(out_dir)
        out_dir.mkdir(parents=True, exist_ok=True)

        index = shoter.SpecificityIndex.from_csv(toxicity_csv)
        df = shoter.parse_all_toxins(all_toxins)

        # Optional row filters, applied in a fixed order.
        if min_identity > 0:
            df = df[df["identity01"].astype(float) >= float(min_identity)]
        if min_coverage > 0:
            df = df[df["coverage"].astype(float) >= float(min_coverage)]
        if not allow_unknown_families:
            df = df[df["family_key"].astype(str) != "unknown"]
        if require_index_hit:

            def _has_index_orders(row) -> bool:
                # Look up by hit name first, then fall back to the family key.
                name_key = str(row.get("Hit_id_norm", ""))
                fam = str(row.get("family_key", ""))
                d = index.orders_for_name_or_backoff(name_key)
                if not d:
                    d = index.orders_for_name_or_backoff(fam)
                return bool(d)

            df = df[df.apply(_has_index_orders, axis=1)]

        strains = sorted(df["Strain"].astype(str).unique().tolist())
        all_hits: list[shoter.ToxinHit] = []
        all_strain_scores: list[shoter.StrainScores] = []
        all_species_scores: list[shoter.StrainSpeciesScores] = []
        for strain in strains:
            sdf = df[df["Strain"].astype(str).eq(strain)].copy()
            per_hit, sscore, sspecies = shoter.score_strain(strain, sdf, index)
            all_hits.extend(per_hit)
            all_strain_scores.append(sscore)
            if sspecies is not None:
                all_species_scores.append(sspecies)

        # "other" and "unknown" are always present, so the list is never empty.
        order_columns = sorted({*index.all_orders, "other", "unknown"})
        species_columns = sorted(index.all_species)

        toxin_support_path = out_dir / "toxin_support.tsv"
        strain_scores_path = out_dir / "strain_target_scores.tsv"
        strain_scores_json_path = out_dir / "strain_scores.json"
        species_scores_path = out_dir / "strain_target_species_scores.tsv"
        species_scores_json_path = out_dir / "strain_species_scores.json"

        shoter.ToxinHit.save_list_tsv(toxin_support_path, all_hits, order_columns)
        shoter.StrainScores.save_list_tsv(strain_scores_path, all_strain_scores, order_columns)
        shoter.StrainScores.save_list_json(strain_scores_json_path, all_strain_scores)
        if species_columns and all_species_scores:
            shoter.StrainSpeciesScores.save_list_tsv(species_scores_path, all_species_scores, species_columns)
            shoter.StrainSpeciesScores.save_list_json(species_scores_json_path, all_species_scores)

        return {
            "orders": order_columns,
            "species": species_columns,
            "strain_scores": strain_scores_path,
            "toxin_support": toxin_support_path,
            "strain_scores_json": strain_scores_json_path,
            "species_scores": species_scores_path,
            "species_scores_json": species_scores_json_path,
        }
class PlotAPI:
    """Plot heatmaps and write Markdown report (no subprocess)."""

    def render(
        self,
        shotter_dir: Path | str,
        lang: str = "zh",
        merge_unresolved: bool = True,
        per_hit_strain: Optional[str] = None,
        cmap: str = "viridis",
        vmin: float = 0.0,
        vmax: float = 1.0,
    ) -> Dict[str, Any]:
        """Render heatmaps and the Markdown report into ``shotter_dir``.

        Inputs are read from fixed file names inside ``shotter_dir``
        (``strain_target_scores.tsv``, ``toxin_support.tsv``,
        ``strain_target_species_scores.tsv``); outputs are written next to
        them.

        Args:
            shotter_dir: Directory holding the Shotter result tables.
            lang: Language code forwarded to the report writer.
            merge_unresolved: Forwarded to the plotting and report functions.
            per_hit_strain: Strain for the per-hit heatmap. The heatmap is
                skipped when this is empty/None or ``toxin_support.tsv`` is
                missing.
            cmap: Colormap name forwarded to the plotting functions.
            vmin: Lower colour-scale bound forwarded to the plotting functions.
            vmax: Upper colour-scale bound forwarded to the plotting functions.

        Returns:
            A dict with ``strain_orders_png``, ``per_hit_png`` (``None`` when
            skipped), ``species_png`` (``None`` when the species table is
            missing) and ``report_md``.

        Raises:
            ImportError: If ``plot_shotter`` cannot be imported.
        """
        plotter = _lazy_import_plotter()
        # Accept plain strings as well as Path objects.
        shotter_dir = Path(shotter_dir)

        strain_scores = shotter_dir / "strain_target_scores.tsv"
        toxin_support = shotter_dir / "toxin_support.tsv"
        species_scores = shotter_dir / "strain_target_species_scores.tsv"

        # The positional ``None`` below fills a parameter defined by
        # plot_shotter; its meaning is not visible from this module.
        out1 = shotter_dir / "strain_target_scores.png"
        plotter.plot_strain_scores(strain_scores, out1, cmap, vmin, vmax, None, merge_unresolved)

        out2 = None
        if per_hit_strain and toxin_support.exists():
            out2 = shotter_dir / f"per_hit_{per_hit_strain}.png"
            plotter.plot_per_hit_for_strain(toxin_support, per_hit_strain, out2, cmap, vmin, vmax, None, merge_unresolved)

        species_png = None
        if species_scores.exists():
            species_png = shotter_dir / "strain_target_species_scores.png"
            plotter.plot_species_scores(species_scores, species_png, cmap, vmin, vmax, None)

        # NOTE(review): the scoring filter settings are not known here, so the
        # report receives None for them - confirm the report handles that.
        args_ns = SimpleNamespace(
            allow_unknown_families=None,
            require_index_hit=None,
            min_identity=None,
            min_coverage=None,
            lang=lang,
        )
        report_path = shotter_dir / "shotter_report_paper.md"
        plotter.write_report_md(
            out_path=report_path,
            mode="paper",
            lang=lang,
            strain_scores_path=strain_scores,
            toxin_support_path=toxin_support if toxin_support.exists() else None,
            species_scores_path=species_scores if species_scores.exists() else None,
            strain_heatmap_path=out1,
            per_hit_heatmap_path=out2,
            species_heatmap_path=species_png,
            merge_unresolved=merge_unresolved,
            args_namespace=args_ns,
        )
        return {
            "strain_orders_png": out1,
            "per_hit_png": out2,
            "species_png": species_png,
            "report_md": report_path,
        }
class BtSingleFnaPipeline:
    """End-to-end single-FNA pipeline: Digger → Shotter → Plot → Bundle."""

    def __init__(
        self,
        image: str = "quay.io/biocontainers/bttoxin_digger:1.0.10--hdfd78af_0",
        platform: str = "linux/amd64",
        base_workdir: Optional[Path] = None,
    ) -> None:
        """Create the three stage objects.

        Args:
            image: Docker image reference forwarded to ``BtToxinRunner``.
            platform: Platform string forwarded to ``BtToxinRunner``.
            base_workdir: Run directory root forwarded to ``BtToxinRunner``.
        """
        self.digger = BtToxinRunner(image=image, platform=platform, base_workdir=base_workdir)
        self.shotter = ShotterAPI()
        self.plotter = PlotAPI()

    @staticmethod
    def _first_strain(strain_scores_tsv: Path) -> Optional[str]:
        """Return the ``Strain`` value of the first row of the TSV, or None.

        Best-effort: any failure (pandas missing, unreadable file, missing
        column) is logged and yields ``None`` instead of raising.
        """
        try:
            import pandas as pd

            df = pd.read_csv(strain_scores_tsv, sep="\t")
            if len(df):
                return str(df.iloc[0]["Strain"])
        except Exception:
            # Only the optional per-hit heatmap depends on this value, so a
            # failure here must not abort the pipeline - but it should be
            # visible in the logs rather than silently ignored.
            logger.warning(
                "Could not read a strain name from %s; skipping per-hit heatmap",
                strain_scores_tsv,
                exc_info=True,
            )
        return None

    def run(
        self,
        fna: Path | str,
        toxicity_csv: Path | str = Path("Data/toxicity-data.csv"),
        min_identity: float = 0.0,
        min_coverage: float = 0.0,
        allow_unknown_families: bool = True,
        require_index_hit: bool = False,
        lang: str = "zh",
        threads: int = 4,
    ) -> Dict[str, Any]:
        """Run Digger, scoring, plotting and bundling for one input file.

        Args:
            fna: Path to the input sequence file.
            toxicity_csv: CSV forwarded to ``ShotterAPI.score``. The default
                is a relative path.
            min_identity: Forwarded to ``ShotterAPI.score``.
            min_coverage: Forwarded to ``ShotterAPI.score``.
            allow_unknown_families: Forwarded to ``ShotterAPI.score``.
            require_index_hit: Forwarded to ``ShotterAPI.score``.
            lang: Report language forwarded to ``PlotAPI.render``.
            threads: Forwarded to ``BtToxinRunner.run_single_fna``.

        Returns:
            On Digger failure: ``{"ok": False, "stage": "digger", "detail":
            <digger result>}``. Otherwise a dict with ``ok`` (True) and the
            string values ``run_root``, ``digger_dir``, ``shotter_dir``,
            ``bundle`` (path of ``pipeline_results.tar.gz``) and ``strain``
            (the strain used for the per-hit heatmap, ``""`` if none).
            Errors raised by the scoring, plotting or bundling stages
            propagate to the caller.
        """
        dig = self.digger.run_single_fna(fna_path=fna, sequence_type="nucl", threads=threads)
        if not dig.get("success"):
            return {"ok": False, "stage": "digger", "detail": dig}

        run_root: Path = dig["run_root"]
        digger_dir = run_root / "output" / "digger"
        shotter_dir = run_root / "output" / "shotter"

        shot = self.shotter.score(
            toxicity_csv=Path(toxicity_csv),
            all_toxins=Path(dig["files"]["all_toxins"]),
            out_dir=shotter_dir,
            min_identity=min_identity,
            min_coverage=min_coverage,
            allow_unknown_families=allow_unknown_families,
            require_index_hit=require_index_hit,
        )

        # The first strain in the score table drives the per-hit heatmap.
        strain_for_plot = self._first_strain(shot["strain_scores"])

        self.plotter.render(
            shotter_dir=shotter_dir,
            lang=lang,
            merge_unresolved=True,
            per_hit_strain=strain_for_plot,
        )

        # Bundle both output trees under short top-level names.
        bundle = run_root / "pipeline_results.tar.gz"
        with tarfile.open(bundle, "w:gz") as tar:
            tar.add(digger_dir, arcname="digger")
            tar.add(shotter_dir, arcname="shotter")

        return {
            "ok": True,
            "run_root": str(run_root),
            "digger_dir": str(digger_dir),
            "shotter_dir": str(shotter_dir),
            "bundle": str(bundle),
            "strain": strain_for_plot or "",
        }