#!/usr/bin/env python3
"""Run a single-file BtToxin_Digger -> Shotter -> Plots pipeline.

Supports both genome (.fna/.fa) and protein (.faa) files.

- Input: one .fna/.fa file (nucleotide scaffold) OR one .faa file (protein)
- Steps:
    1) Stage this single file, run BtToxin_Digger via PixiRunner (pixi environment)
    2) Run Shotter scoring on Digger's All_Toxins.txt via pixi run -e pipeline
    3) Render heatmaps + paper-style report via pixi run -e pipeline
    4) Organize outputs under one root folder:
       <out_root>/
       ├─ digger/                  (pixi digger env outputs)
       ├─ shotter/                 (Shotter TSV/JSON + plots + report)
       └─ pipeline_results.tar.gz  (bundle)

Notes
- Digger is executed in the pixi 'digger' environment with bioconda dependencies.
- Shotter and plotting are executed in the pixi 'pipeline' environment with Python dependencies.
- This script exposes CLI flags for Shotter filters to allow strict/loose runs.
- external_dbs/bt_toxin is used as the external database by default (if it exists).

Example (genome file):
    python scripts/run_single_fna_pipeline.py \\
        --input tests/test_data/HAN055.fna \\
        --toxicity_csv Data/toxicity-data.csv \\
        --out_root runs/HAN055_run \\
        --min_identity 0.50 --min_coverage 0.60 \\
        --disallow_unknown_families --require_index_hit --lang zh

Example (protein file):
    python scripts/run_single_fna_pipeline.py \\
        --input tests/test_data/proteins.faa \\
        --toxicity_csv Data/toxicity-data.csv \\
        --out_root runs/proteins_run \\
        --min_identity 0.50 --min_coverage 0.60

# Use a custom database path
    python scripts/run_single_fna_pipeline.py \\
        --input tests/test_data/HAN055.fna \\
        --bttoxin_db_dir /path/to/custom/bt_toxin

# Run via the pixi task
    pixi run pipeline --input tests/test_data/HAN055.fna
"""
from __future__ import annotations

import argparse
import shutil
import subprocess
import sys
import tarfile
from pathlib import Path
from typing import Any, Dict

# Add the scripts directory to sys.path so PixiRunner and the command builders
# in pixi_runner can be imported when this file is run as a script.
sys.path.insert(0, str(Path(__file__).parent))

from pixi_runner import PixiRunner, build_shotter_command, build_plot_command

# Supported file extensions
GENOME_EXTENSIONS = {".fna", ".fa", ".fasta"}
PROTEIN_EXTENSIONS = {".faa"}
ALL_EXTENSIONS = GENOME_EXTENSIONS | PROTEIN_EXTENSIONS


def detect_sequence_type(file_path: Path) -> tuple[str, str]:
    """Detect the input file type and return (sequence_type, suffix).

    Returns:
        (sequence_type, suffix): where sequence_type is "nucl" or "prot",
        and suffix is the file extension (e.g., ".fna", ".faa").
    """
    ext = file_path.suffix.lower()
    if ext in PROTEIN_EXTENSIONS:
        return "prot", ext
    elif ext in GENOME_EXTENSIONS:
        return "nucl", ext
    else:
        raise ValueError(
            f"Unsupported file extension: {ext}. "
            f"Supported: genome {GENOME_EXTENSIONS}, protein {PROTEIN_EXTENSIONS}"
        )


def _shell(cmd: list[str]) -> subprocess.CompletedProcess:
    return subprocess.run(cmd, text=True)


def _read_first_strain(strain_scores_tsv: Path) -> str:
    """Return the first non-empty 'Strain' value from the scores TSV, or ""."""
    try:
        with strain_scores_tsv.open("r", encoding="utf-8") as f:
            header = f.readline().strip().split("\t")
            idx_strain = header.index("Strain")
            # next non-empty line
            for line in f:
                if not line.strip():
                    continue
                parts = line.rstrip("\n").split("\t")
                if len(parts) > idx_strain:
                    return parts[idx_strain]
    except Exception:
        pass
    return ""


def run_single_file_pipeline(
    input_path: Path,
    out_root: Path,
    toxicity_csv: Path = Path("Data/toxicity-data.csv"),
    min_identity: float = 0.0,
    min_coverage: float = 0.0,
    allow_unknown_families: bool = True,
    require_index_hit: bool = False,
    lang: str = "zh",
    bttoxin_db_dir: Path | None = None,
    threads: int = 4,
    sequence_type: str | None = None,
    file_suffix: str | None = None,
) -> Dict[str, Any]:
    """Run the full pipeline for a single file (genome or protein) using pixi environments.

    Args:
        input_path: Input file path (.fna/.fa/.fasta or .faa).
        out_root: Output root directory.
        toxicity_csv: Path to the toxicity data CSV.
        min_identity: Minimum identity threshold.
        min_coverage: Minimum coverage threshold.
        allow_unknown_families: Whether unknown families are allowed.
        require_index_hit: Whether an index hit is required.
        lang: Report language (zh/en).
        bttoxin_db_dir: External bt_toxin database directory. If None, auto-detect
            external_dbs/bt_toxin under the project root.
        threads: Number of threads.
        sequence_type: Sequence type ("nucl" or "prot"); auto-detected if None.
        file_suffix: File suffix; auto-detected if None.
    """
    input_path = input_path.resolve()
    out_root = out_root.resolve()
    out_root.mkdir(parents=True, exist_ok=True)

    # Auto-detect the sequence type
    if sequence_type is None or file_suffix is None:
        detected_type, detected_suffix = detect_sequence_type(input_path)
        sequence_type = sequence_type or detected_type
        file_suffix = file_suffix or detected_suffix

    seq_type_label = "protein" if sequence_type == "prot" else "genome"
    print(f"[pipeline] Input file: {input_path.name}")
    print(f"[pipeline] Detected type: {seq_type_label} ({sequence_type}), suffix: {file_suffix}")

    # Auto-detect the external database
    if bttoxin_db_dir is None:
        default_db = Path(__file__).resolve().parents[1] / "external_dbs" / "bt_toxin"
        if default_db.exists() and (default_db / "db").exists():
            bttoxin_db_dir = default_db
            print(f"[pipeline] Using external database: {bttoxin_db_dir}")
        else:
            print("[pipeline] External database not found; using the database bundled in the pixi environment")

    digger_dir = out_root / "digger"
    shotter_dir = out_root / "shotter"
    logs_dir = out_root / "logs"
    stage_dir = out_root / "stage"
    for d in (digger_dir, shotter_dir, logs_dir, stage_dir):
        d.mkdir(parents=True, exist_ok=True)

    # Stage the single input file
    staged_file = stage_dir / input_path.name
    shutil.copy2(input_path, staged_file)

    # 1) Run BtToxin_Digger via PixiRunner (pixi 'digger' environment)
    runner = PixiRunner(env_name="digger")

    # Build Digger parameters; the suffix argument name depends on the sequence type.
    digger_params = {
        "input_dir": stage_dir,
        "output_dir": digger_dir,
        "log_dir": logs_dir,
        "sequence_type": sequence_type,
        "threads": threads,
        "bttoxin_db_dir": bttoxin_db_dir,
    }
    if sequence_type == "prot":
        digger_params["prot_suffix"] = file_suffix
    else:
        digger_params["scaf_suffix"] = file_suffix

    result = runner.run_bttoxin_digger(**digger_params)
    if not result.get("success"):
        return {
            "ok": False,
            "stage": "digger",
            "error": result.get("error") or f"Digger failed (exit={result.get('exit_code')})",
            "logs": (logs_dir / "digger_execution.log").read_text(encoding="utf-8")
            if (logs_dir / "digger_execution.log").exists()
            else "",
        }

    toxins_dir = digger_dir / "Results" / "Toxins"
    all_toxins = toxins_dir / "All_Toxins.txt"
    if not all_toxins.exists():
        return {"ok": False, "stage": "digger", "error": f"Missing All_Toxins.txt at {all_toxins}"}

    # 1.5) Run Context Analysis (BGC, Mobilome, CRISPR)
    print("[pipeline] Running Genome Context Analysis...")
    context_dir = out_root / "context"
    context_dir.mkdir(parents=True, exist_ok=True)

    # Define tool paths
    project_root = Path(__file__).resolve().parents[1]
    bgc_script = project_root / "tools" / "bgc_analysis" / "detect_bgc.py"
    mobi_script = project_root / "tools" / "mobilome_analysis" / "detect_mobilome.py"
    crispr_script = project_root / "tools" / "crispr_cas_analysis" / "scripts" / "detect_crispr.py"
    fusion_script = project_root / "tools" / "crispr_cas_analysis" / "scripts" / "fusion_analysis.py"

    # Define outputs
    bgc_out = context_dir / "bgc.json"
    mobi_out = context_dir / "mobilome.json"
    crispr_out = context_dir / "crispr_results.json"
    fusion_out = context_dir / "fusion_results.json"

    # Execute tools (using the pipeline env python)
    # BGC
    if bgc_script.exists():
        _shell(["python", str(bgc_script), "--input", str(input_path), "--output", str(bgc_out)])

    # Mobilome
    if mobi_script.exists():
        _shell(["python", str(mobi_script), "--input", str(input_path), "--output", str(mobi_out)])

    # CRISPR detection
    if crispr_script.exists():
        # Use --mock for now as per instructions
        _shell(["python", str(crispr_script), "--input", str(input_path), "--output", str(crispr_out), "--mock"])

    # CRISPR fusion (needs CRISPR results + Digger results)
    if fusion_script.exists() and crispr_out.exists() and all_toxins.exists():
        _shell([
            "python", str(fusion_script),
            "--crispr-results", str(crispr_out),
            "--toxin-results", str(all_toxins),
            "--genome", str(input_path),
            "--output", str(fusion_out),
            "--mock",
        ])

    # 2) Run Shotter scoring via pixi run -e pipeline
    shotter_dir.mkdir(parents=True, exist_ok=True)
    scripts_dir = Path(__file__).resolve().parents[0]
    pixi_project_dir = Path(__file__).resolve().parents[1]
    shotter_cmd = build_shotter_command(
        pixi_project_dir=pixi_project_dir,
        script_path=scripts_dir / "bttoxin_shoter.py",
        toxicity_csv=toxicity_csv,
        all_toxins=all_toxins,
        output_dir=shotter_dir,
        min_identity=min_identity,
        min_coverage=min_coverage,
        allow_unknown_families=allow_unknown_families,
        require_index_hit=require_index_hit,
        # Pass context results
        crispr_results=fusion_out if fusion_out.exists() else None,
        crispr_fusion=fusion_out.exists(),
        context_bgc=bgc_out if bgc_out.exists() else None,
        context_mobilome=mobi_out if mobi_out.exists() else None,
        context_crispr=crispr_out if crispr_out.exists() else None,
    )
    r1 = _shell(shotter_cmd)
    if r1.returncode != 0:
        return {"ok": False, "stage": "shotter", "error": f"Shotter failed: {' '.join(shotter_cmd)}"}

    strain_scores = shotter_dir / "strain_target_scores.tsv"
    toxin_support = shotter_dir / "toxin_support.tsv"
    species_scores = shotter_dir / "strain_target_species_scores.tsv"

    # 3) Plot & report via pixi run -e pipeline
    strain_for_plot = _read_first_strain(strain_scores)
    plot_cmd = build_plot_command(
        pixi_project_dir=pixi_project_dir,
        script_path=scripts_dir / "plot_shotter.py",
        strain_scores=strain_scores,
        toxin_support=toxin_support,
        species_scores=species_scores,
        out_dir=shotter_dir,
        merge_unresolved=True,
        report_mode="paper",
        lang=lang,
        per_hit_strain=strain_for_plot if strain_for_plot else None,
    )
    r2 = _shell(plot_cmd)
    if r2.returncode != 0:
        # Plotting/report is optional; warn and continue.
        print(f"[pipeline] Warning: Plotting/Report failed (exit={r2.returncode})")

    # 4) Bundle
    bundle = out_root / "pipeline_results.tar.gz"
    with tarfile.open(bundle, "w:gz") as tar:
        tar.add(digger_dir, arcname="digger")
        tar.add(shotter_dir, arcname="shotter")

    return {
        "ok": True,
        "digger_dir": str(digger_dir),
        "shotter_dir": str(shotter_dir),
        "bundle": str(bundle),
        "all_toxins": str(all_toxins),
        "strain": strain_for_plot,
        "sequence_type": sequence_type,
    }


def main() -> int:
    ap = argparse.ArgumentParser(
        description="Run single-file Digger -> Shotter pipeline (pixi-based). "
        "Supports genome (.fna/.fa/.fasta) and protein (.faa) files."
    )
    # Not marked required=True, so the deprecated --fna fallback below still works.
    ap.add_argument(
        "--input",
        type=Path,
        default=None,
        help="Path to input file (genome: .fna/.fa/.fasta, protein: .faa). "
        "Deprecated: --fna still works for backward compatibility.",
    )
    ap.add_argument("--fna", type=Path, default=None, help=argparse.SUPPRESS)  # Hidden backward compat
    ap.add_argument("--toxicity_csv", type=Path, default=Path("Data/toxicity-data.csv"))
    ap.add_argument("--out_root", type=Path, default=Path("runs/single_run"))
    ap.add_argument("--min_identity", type=float, default=0.0)
    ap.add_argument("--min_coverage", type=float, default=0.0)
    ap.add_argument("--disallow_unknown_families", action="store_true", default=False)
    ap.add_argument("--require_index_hit", action="store_true", default=False)
    ap.add_argument("--lang", type=str, choices=["zh", "en"], default="zh")
    ap.add_argument(
        "--bttoxin_db_dir",
        type=Path,
        default=None,
        help="Path to an external bt_toxin database directory (default: auto-detect external_dbs/bt_toxin)",
    )
    ap.add_argument("--threads", type=int, default=4, help="Number of threads")
    args = ap.parse_args()

    # Backward compatibility: fall back to the --fna flag
    input_file = args.input if args.input is not None else args.fna
    if input_file is None:
        print("[pipeline] Error: --input argument is required")
        return 1

    # Validate the file extension
    if input_file.suffix.lower() not in ALL_EXTENSIONS:
        print(f"[pipeline] Error: Unsupported file extension: {input_file.suffix}")
        print(f"[pipeline] Supported: genome {GENOME_EXTENSIONS}, protein {PROTEIN_EXTENSIONS}")
        return 1

    # Derive a per-run default out_root from the input file stem
    if str(args.out_root) == "runs/single_run":
        stem = input_file.stem
        args.out_root = Path("runs") / f"{stem}_run"

    res = run_single_file_pipeline(
        input_path=input_file,
        out_root=args.out_root,
        toxicity_csv=args.toxicity_csv,
        min_identity=args.min_identity,
        min_coverage=args.min_coverage,
        allow_unknown_families=not args.disallow_unknown_families,
        require_index_hit=args.require_index_hit,
        lang=args.lang,
        bttoxin_db_dir=args.bttoxin_db_dir,
        threads=args.threads,
    )
    if not res.get("ok"):
        print(f"[pipeline] FAILED at stage={res.get('stage')}: {res.get('error')}")
        logs = res.get("logs")
        if logs:
            print(logs[:2000])
        return 1

    print("[pipeline] ✓ Done")
    print(f"  Input type: {res.get('sequence_type', 'unknown')}")
    print(f"  Digger:  {res['digger_dir']}")
    print(f"  Shotter: {res['shotter_dir']}")
    print(f"  Bundle:  {res['bundle']}")
    print(f"  Strain:  {res.get('strain', '')}")
    return 0


# Backward compatibility alias
run_single_fna_pipeline = run_single_file_pipeline


if __name__ == "__main__":
    raise SystemExit(main())