Files
bttoxin-pipeline/scripts/run_single_fna_pipeline.py

349 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Run a single-file BtToxin_Digger -> Shotter -> Plots pipeline.
Supports both genome (.fna/.fa) and protein (.faa) files.
- Input: one .fna/.fa file (nucleotide scaffold) OR one .faa file (protein)
- Steps:
1) Stage this single file, run BtToxin_Digger via PixiRunner (pixi environment)
2) Run Shotter scoring on Digger's All_Toxins.txt via pixi run -e pipeline
3) Render heatmaps + paper-style report via pixi run -e pipeline
4) Organize outputs under one root folder:
<out_root>/
├─ digger/ (pixi digger env outputs)
├─ shotter/ (Shotter TSV/JSON + plots + report)
└─ pipeline_results.tar.gz (bundle)
Notes
- Digger is executed in the pixi 'digger' environment with bioconda dependencies.
- Shotter and plotting are executed in the pixi 'pipeline' environment with Python dependencies.
- This script exposes CLI flags for Shotter filters to allow strict/loose runs.
- By default, external_dbs/bt_toxin is used as the external database (if it exists).
Example (Genome file):
python scripts/run_single_fna_pipeline.py \\
--input tests/test_data/HAN055.fna \\
--toxicity_csv Data/toxicity-data.csv \\
--out_root runs/HAN055_run \\
--min_identity 0.50 --min_coverage 0.60 \\
--disallow_unknown_families --require_index_hit --lang zh
Example (Protein file):
python scripts/run_single_fna_pipeline.py \\
--input tests/test_data/proteins.faa \\
--toxicity_csv Data/toxicity-data.csv \\
--out_root runs/proteins_run \\
--min_identity 0.50 --min_coverage 0.60
# Use a custom database path
python scripts/run_single_fna_pipeline.py \\
--input tests/test_data/HAN055.fna \\
--bttoxin_db_dir /path/to/custom/bt_toxin
# Run via the pixi task
pixi run pipeline --input tests/test_data/HAN055.fna
"""
from __future__ import annotations
import argparse
import os
import shutil
import subprocess
import sys
import tarfile
from pathlib import Path
from typing import Dict, Any, List
# Import PixiRunner and command builders from scripts
# Add scripts directory to path for pixi_runner import
sys.path.insert(0, str(Path(__file__).parent))
from pixi_runner import PixiRunner, build_shotter_command, build_plot_command
# Supported file extensions
GENOME_EXTENSIONS = {".fna", ".fa", ".fasta"}
PROTEIN_EXTENSIONS = {".faa"}
ALL_EXTENSIONS = GENOME_EXTENSIONS | PROTEIN_EXTENSIONS


def detect_sequence_type(file_path: Path) -> tuple[str, str]:
    """Classify an input file as nucleotide or protein by its extension.

    Returns:
        (sequence_type, suffix): sequence_type is "nucl" or "prot", and
        suffix is the lowercased file extension (e.g. ".fna", ".faa").

    Raises:
        ValueError: if the extension is neither a supported genome nor a
            supported protein suffix.
    """
    ext = file_path.suffix.lower()
    if ext in GENOME_EXTENSIONS:
        return "nucl", ext
    if ext in PROTEIN_EXTENSIONS:
        return "prot", ext
    raise ValueError(
        f"Unsupported file extension: {ext}. "
        f"Supported: genome {GENOME_EXTENSIONS}, protein {PROTEIN_EXTENSIONS}"
    )
def _shell(cmd: list[str]) -> subprocess.CompletedProcess:
return subprocess.run(cmd, text=True)
def _read_first_strain(strain_scores_tsv: Path) -> str:
try:
with strain_scores_tsv.open("r", encoding="utf-8") as f:
header = f.readline().strip().split("\t")
idx_strain = header.index("Strain")
# next non-empty line
for line in f:
if not line.strip():
continue
parts = line.rstrip("\n").split("\t")
if len(parts) > idx_strain:
return parts[idx_strain]
except Exception:
pass
return ""
def run_single_file_pipeline(
    input_path: Path,
    out_root: Path,
    toxicity_csv: Path = Path("Data/toxicity-data.csv"),
    min_identity: float = 0.0,
    min_coverage: float = 0.0,
    allow_unknown_families: bool = True,
    require_index_hit: bool = False,
    lang: str = "zh",
    bttoxin_db_dir: Path | None = None,
    threads: int = 4,
    sequence_type: str | None = None,
    file_suffix: str | None = None,
) -> Dict[str, Any]:
    """Run the complete pipeline for a single genome or protein file (pixi envs).

    Stages the input, runs BtToxin_Digger (pixi 'digger' environment), scores
    Digger's All_Toxins.txt with Shotter, renders plots/report (pixi
    'pipeline' environment), and bundles digger/ + shotter/ into a tarball
    under *out_root*.

    Args:
        input_path: Input file path (.fna/.fa/.fasta or .faa).
        out_root: Root directory for all pipeline outputs.
        toxicity_csv: Path to the toxicity data CSV consumed by Shotter.
        min_identity: Minimum identity threshold for Shotter filtering.
        min_coverage: Minimum coverage threshold for Shotter filtering.
        allow_unknown_families: Whether unknown toxin families are allowed.
        require_index_hit: Whether Shotter requires an index hit.
        lang: Report language ("zh" or "en").
        bttoxin_db_dir: External bt_toxin database directory. When None,
            external_dbs/bt_toxin under the project root is auto-detected.
        threads: Thread count passed to Digger.
        sequence_type: "nucl" or "prot"; auto-detected from the extension
            when None.
        file_suffix: Input file extension; auto-detected when None.

    Returns:
        On success: {"ok": True, "digger_dir": ..., "shotter_dir": ...,
        "bundle": ..., "all_toxins": ..., "strain": ..., "sequence_type": ...}.
        On failure: {"ok": False, "stage": <failed stage>, "error": ...}
        (the digger failure dict also carries a "logs" excerpt).
    """
    input_path = input_path.resolve()
    out_root = out_root.resolve()
    out_root.mkdir(parents=True, exist_ok=True)
    # Auto-detect the sequence type/suffix from the extension when not supplied.
    if sequence_type is None or file_suffix is None:
        detected_type, detected_suffix = detect_sequence_type(input_path)
        sequence_type = sequence_type or detected_type
        file_suffix = file_suffix or detected_suffix
    seq_type_label = "protein" if sequence_type == "prot" else "genome"
    print(f"[pipeline] Input file: {input_path.name}")
    print(f"[pipeline] Detected type: {seq_type_label} ({sequence_type}), suffix: {file_suffix}")
    # Auto-detect the external bt_toxin database under the project root.
    if bttoxin_db_dir is None:
        default_db = Path(__file__).resolve().parents[1] / "external_dbs" / "bt_toxin"
        # Require both the directory and its db/ subfolder before using it.
        if default_db.exists() and (default_db / "db").exists():
            bttoxin_db_dir = default_db
            print(f"[pipeline] 使用外部数据库: {bttoxin_db_dir}")
        else:
            print("[pipeline] 未找到外部数据库,将使用 pixi 环境内置数据库")
    digger_dir = out_root / "digger"
    shotter_dir = out_root / "shotter"
    logs_dir = out_root / "logs"
    stage_dir = out_root / "stage"
    for d in (digger_dir, shotter_dir, logs_dir, stage_dir):
        d.mkdir(parents=True, exist_ok=True)
    # Stage single input file so Digger sees a directory with exactly one input.
    staged_file = stage_dir / input_path.name
    shutil.copy2(input_path, staged_file)
    # 1) Run BtToxin_Digger via PixiRunner (pixi digger environment)
    runner = PixiRunner(env_name="digger")
    # Build Digger parameters; the suffix argument name depends on sequence type.
    digger_params: Dict[str, Any] = {
        "input_dir": stage_dir,
        "output_dir": digger_dir,
        "log_dir": logs_dir,
        "sequence_type": sequence_type,
        "threads": threads,
        "bttoxin_db_dir": bttoxin_db_dir,
    }
    # Pass the suffix under the parameter that matches the sequence type.
    if sequence_type == "prot":
        digger_params["prot_suffix"] = file_suffix
    else:
        digger_params["scaf_suffix"] = file_suffix
    result = runner.run_bttoxin_digger(**digger_params)
    if not result.get("success"):
        return {
            "ok": False,
            "stage": "digger",
            "error": result.get("error") or f"Digger failed (exit={result.get('exit_code')})",
            # Attach the execution log (if present) to help debugging.
            "logs": (logs_dir / "digger_execution.log").read_text(encoding="utf-8") if (logs_dir / "digger_execution.log").exists() else "",
        }
    toxins_dir = digger_dir / "Results" / "Toxins"
    all_toxins = toxins_dir / "All_Toxins.txt"
    if not all_toxins.exists():
        return {"ok": False, "stage": "digger", "error": f"Missing All_Toxins.txt at {all_toxins}"}
    # 2) Run Shotter scoring via pixi run -e pipeline
    shotter_dir.mkdir(parents=True, exist_ok=True)
    scripts_dir = Path(__file__).resolve().parents[0]
    pixi_project_dir = Path(__file__).resolve().parents[1]
    shoter_cmd = build_shotter_command(
        pixi_project_dir=pixi_project_dir,
        script_path=scripts_dir / "bttoxin_shoter.py",
        toxicity_csv=toxicity_csv,
        all_toxins=all_toxins,
        output_dir=shotter_dir,
        min_identity=min_identity,
        min_coverage=min_coverage,
        allow_unknown_families=allow_unknown_families,
        require_index_hit=require_index_hit,
    )
    r1 = _shell(shoter_cmd)
    if r1.returncode != 0:
        return {"ok": False, "stage": "shotter", "error": f"Shotter failed: {' '.join(shoter_cmd)}"}
    strain_scores = shotter_dir / "strain_target_scores.tsv"
    toxin_support = shotter_dir / "toxin_support.tsv"
    species_scores = shotter_dir / "strain_target_species_scores.tsv"
    # 3) Plot & report via pixi run -e pipeline
    strain_for_plot = _read_first_strain(strain_scores)
    plot_cmd = build_plot_command(
        pixi_project_dir=pixi_project_dir,
        script_path=scripts_dir / "plot_shotter.py",
        strain_scores=strain_scores,
        toxin_support=toxin_support,
        species_scores=species_scores,
        out_dir=shotter_dir,
        merge_unresolved=True,
        report_mode="paper",
        lang=lang,
        # Only pass a strain when one was actually found in the TSV.
        per_hit_strain=strain_for_plot if strain_for_plot else None,
    )
    r2 = _shell(plot_cmd)
    if r2.returncode != 0:
        # plotting/report optional; continue
        pass
    # 4) Bundle digger/ and shotter/ into one tarball for easy download.
    bundle = out_root / "pipeline_results.tar.gz"
    with tarfile.open(bundle, "w:gz") as tar:
        tar.add(digger_dir, arcname="digger")
        tar.add(shotter_dir, arcname="shotter")
    return {
        "ok": True,
        "digger_dir": str(digger_dir),
        "shotter_dir": str(shotter_dir),
        "bundle": str(bundle),
        "all_toxins": str(all_toxins),
        "strain": strain_for_plot,
        "sequence_type": sequence_type,
    }
def main() -> int:
    """CLI entry point: parse arguments, validate input, run the pipeline.

    Returns:
        0 on success, 1 on validation failure or a failed pipeline stage.
    """
    ap = argparse.ArgumentParser(
        description="Run single-file Digger -> Shotter pipeline (pixi-based). "
        "Supports genome (.fna/.fa/.fasta) and protein (.faa) files."
    )
    # Bug fix: --input was previously required=True, which made argparse
    # reject a --fna-only invocation before the backward-compat fallback
    # below could ever run. It is now optional at the argparse level and
    # the "one of --input/--fna" requirement is enforced manually.
    ap.add_argument(
        "--input",
        type=Path,
        default=None,
        help="Path to input file (genome: .fna/.fa/.fasta, protein: .faa). "
        "Deprecated: --fna still works for backward compatibility."
    )
    ap.add_argument("--fna", type=Path, default=None, help=argparse.SUPPRESS)  # Hidden backward compat
    ap.add_argument("--toxicity_csv", type=Path, default=Path("Data/toxicity-data.csv"))
    ap.add_argument("--out_root", type=Path, default=Path("runs/single_run"))
    ap.add_argument("--min_identity", type=float, default=0.0)
    ap.add_argument("--min_coverage", type=float, default=0.0)
    ap.add_argument("--disallow_unknown_families", action="store_true", default=False)
    ap.add_argument("--require_index_hit", action="store_true", default=False)
    ap.add_argument("--lang", type=str, choices=["zh", "en"], default="zh")
    ap.add_argument("--bttoxin_db_dir", type=Path, default=None,
                    help="外部 bt_toxin 数据库目录路径(默认自动检测 external_dbs/bt_toxin")
    ap.add_argument("--threads", type=int, default=4, help="线程数")
    args = ap.parse_args()
    # Backward compatibility: fall back to --fna when --input is absent.
    input_file = args.input if args.input is not None else args.fna
    if input_file is None:
        print("[pipeline] Error: --input argument is required")
        return 1
    # Fail fast with a clear message instead of a traceback from copy2 later.
    if not input_file.exists():
        print(f"[pipeline] Error: input file not found: {input_file}")
        return 1
    # Validate the file extension before doing any work.
    if input_file.suffix.lower() not in ALL_EXTENSIONS:
        print(f"[pipeline] Error: Unsupported file extension: {input_file.suffix}")
        print(f"[pipeline] Supported: genome {GENOME_EXTENSIONS}, protein {PROTEIN_EXTENSIONS}")
        return 1
    # Derive a per-run default out_root from the input file stem, but only
    # when the user did not override the default.
    if str(args.out_root) == "runs/single_run":
        stem = input_file.stem
        args.out_root = Path("runs") / f"{stem}_run"
    res = run_single_file_pipeline(
        input_path=input_file,
        out_root=args.out_root,
        toxicity_csv=args.toxicity_csv,
        min_identity=args.min_identity,
        min_coverage=args.min_coverage,
        # CLI exposes the inverse flag; the API takes the positive form.
        allow_unknown_families=not args.disallow_unknown_families,
        require_index_hit=args.require_index_hit,
        lang=args.lang,
        bttoxin_db_dir=args.bttoxin_db_dir,
        threads=args.threads,
    )
    if not res.get("ok"):
        print(f"[pipeline] FAILED at stage={res.get('stage')}: {res.get('error')}")
        logs = res.get("logs")
        if logs:
            print(logs[:2000])  # cap log output to keep the console readable
        return 1
    print("[pipeline] ✓ Done")
    print(f" Input type: {res.get('sequence_type', 'unknown')}")
    print(f" Digger: {res['digger_dir']}")
    print(f" Shotter: {res['shotter_dir']}")
    print(f" Bundle: {res['bundle']}")
    print(f" Strain: {res.get('strain','')}")
    return 0
# Backward compatibility alias: older callers imported the pipeline under its
# original .fna-specific name; keep that name bound to the generalized
# implementation so existing imports keep working.
run_single_fna_pipeline = run_single_file_pipeline
if __name__ == "__main__":
    raise SystemExit(main())