- Backend: Refactored tasks.py to invoke run_single_fna_pipeline.py directly for consistency.
- Backend: Changed the output format to ZIP and added auto-cleanup of intermediate files.
- Backend: Fixed language-parameter passing in the API and tasks.
- Frontend: Removed CRISPR Fusion UI elements from the Submit and Monitor views.
- Frontend: Implemented a simulated progress bar for better UX.
- Frontend: Restored the one-click load button and added result-file structure documentation.
- Docker: Fixed a critical restarting loop by removing an incorrect `image` directive in docker-compose.yml.
- Docker: Optimized the Dockerfile to correct .pixi environment path issues and prevent accidental deletion of frontend assets.
399 lines · 15 KiB · Python
#!/usr/bin/env python3
|
||
"""Run a single-file BtToxin_Digger -> Shotter -> Plots pipeline.
|
||
|
||
Supports both genome (.fna/.fa) and protein (.faa) files.
|
||
|
||
- Input: one .fna/.fa file (nucleotide scaffold) OR one .faa file (protein)
|
||
- Steps:
|
||
1) Stage this single file, run BtToxin_Digger via PixiRunner (pixi environment)
|
||
2) Run Shotter scoring on Digger's All_Toxins.txt via pixi run -e pipeline
|
||
3) Render heatmaps + paper-style report via pixi run -e pipeline
|
||
4) Organize outputs under one root folder:
|
||
<out_root>/
|
||
├─ digger/ (pixi digger env outputs)
|
||
├─ shotter/ (Shotter TSV/JSON + plots + report)
|
||
└─ pipeline_results.tar.gz (bundle)
|
||
|
||
Notes
|
||
- Digger is executed in the pixi 'digger' environment with bioconda dependencies.
|
||
- Shotter and plotting are executed in the pixi 'pipeline' environment with Python dependencies.
|
||
- This script exposes CLI flags for Shotter filters to allow strict/loose runs.
|
||
- 默认使用 external_dbs/bt_toxin 作为外部数据库(若存在)。
|
||
|
||
Example (Genome file):
|
||
python scripts/run_single_fna_pipeline.py \\
|
||
--input tests/test_data/HAN055.fna \\
|
||
--toxicity_csv Data/toxicity-data.csv \\
|
||
--out_root runs/HAN055_run \\
|
||
--min_identity 0.50 --min_coverage 0.60 \\
|
||
--disallow_unknown_families --require_index_hit --lang zh
|
||
|
||
Example (Protein file):
|
||
python scripts/run_single_fna_pipeline.py \\
|
||
--input tests/test_data/proteins.faa \\
|
||
--toxicity_csv Data/toxicity-data.csv \\
|
||
--out_root runs/proteins_run \\
|
||
--min_identity 0.50 --min_coverage 0.60
|
||
|
||
# 使用自定义数据库路径
|
||
python scripts/run_single_fna_pipeline.py \\
|
||
--input tests/test_data/HAN055.fna \\
|
||
--bttoxin_db_dir /path/to/custom/bt_toxin
|
||
|
||
# 使用 pixi 任务运行
|
||
pixi run pipeline --input tests/test_data/HAN055.fna
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import os
|
||
import shutil
|
||
import subprocess
|
||
import sys
|
||
import tarfile
|
||
from pathlib import Path
|
||
from typing import Dict, Any, List
|
||
|
||
# Import PixiRunner and command builders from scripts
|
||
# Add scripts directory to path for pixi_runner import
|
||
sys.path.insert(0, str(Path(__file__).parent))
|
||
from pixi_runner import PixiRunner, build_shotter_command, build_plot_command
|
||
|
||
|
||
# File extensions accepted by the pipeline, grouped by sequence kind.
GENOME_EXTENSIONS = {".fna", ".fa", ".fasta"}
PROTEIN_EXTENSIONS = {".faa"}
ALL_EXTENSIONS = GENOME_EXTENSIONS | PROTEIN_EXTENSIONS


def detect_sequence_type(file_path: Path) -> tuple[str, str]:
    """Classify *file_path* as a protein or genome file by its extension.

    Returns:
        ``(sequence_type, suffix)`` where ``sequence_type`` is ``"prot"``
        for protein files and ``"nucl"`` for genome files, and ``suffix``
        is the lower-cased extension (e.g. ``".fna"``, ``".faa"``).

    Raises:
        ValueError: if the extension matches neither group.
    """
    suffix = file_path.suffix.lower()
    if suffix in PROTEIN_EXTENSIONS:
        return "prot", suffix
    if suffix in GENOME_EXTENSIONS:
        return "nucl", suffix
    raise ValueError(
        f"Unsupported file extension: {suffix}. "
        f"Supported: genome {GENOME_EXTENSIONS}, protein {PROTEIN_EXTENSIONS}"
    )
|
||
|
||
|
||
def _shell(cmd: list[str]) -> subprocess.CompletedProcess:
    """Run *cmd* in the foreground (stdout/stderr inherited) and return the result."""
    completed = subprocess.run(cmd, text=True)
    return completed
|
||
|
||
|
||
def _read_first_strain(strain_scores_tsv: Path) -> str:
    """Return the 'Strain' value from the first data row of a Shotter TSV.

    Best-effort helper: returns ``""`` when the file is missing, lacks a
    'Strain' header column, or contains no non-empty data rows.
    """
    try:
        with strain_scores_tsv.open("r", encoding="utf-8") as handle:
            columns = handle.readline().strip().split("\t")
            strain_col = columns.index("Strain")
            for row in handle:
                if not row.strip():
                    # Skip blank lines between records.
                    continue
                fields = row.rstrip("\n").split("\t")
                if strain_col < len(fields):
                    return fields[strain_col]
    except Exception:
        # Deliberate best-effort swallow: any I/O or parse problem
        # falls through to the empty-string sentinel below.
        pass
    return ""
|
||
|
||
|
||
def run_single_file_pipeline(
    input_path: Path,
    out_root: Path,
    toxicity_csv: Path = Path("Data/toxicity-data.csv"),
    min_identity: float = 0.0,
    min_coverage: float = 0.0,
    allow_unknown_families: bool = True,
    require_index_hit: bool = False,
    lang: str = "zh",
    bttoxin_db_dir: Path | None = None,
    threads: int = 4,
    sequence_type: str | None = None,
    file_suffix: str | None = None,
) -> Dict[str, Any]:
    """Run the full single-file pipeline (genome or protein) using pixi environments.

    Stages the input file, runs BtToxin_Digger, the genome-context tools
    (BGC / mobilome / CRISPR, all best-effort), Shotter scoring, plotting,
    and finally bundles the digger/shotter outputs into a tar.gz.

    Args:
        input_path: Input file path (.fna/.fa/.fasta or .faa).
        out_root: Output root directory (created if missing).
        toxicity_csv: Toxicity data CSV file path, forwarded to Shotter.
        min_identity: Minimum identity threshold for Shotter filtering.
        min_coverage: Minimum coverage threshold for Shotter filtering.
        allow_unknown_families: Whether unknown toxin families are allowed.
        require_index_hit: Whether an index hit is required.
        lang: Report language ("zh" or "en").
        bttoxin_db_dir: External bt_toxin database directory. If None, the
            project-root external_dbs/bt_toxin is auto-detected.
        threads: Thread count passed to the digger.
        sequence_type: "nucl" or "prot"; auto-detected from the suffix if None.
        file_suffix: File extension; auto-detected if None.

    Returns:
        A dict with ``ok=True`` plus result paths on success, or
        ``ok=False`` plus ``stage``/``error`` (and optionally ``logs``)
        describing which step failed.
    """
    input_path = input_path.resolve()
    out_root = out_root.resolve()
    out_root.mkdir(parents=True, exist_ok=True)

    # Auto-detect the sequence type / suffix when the caller did not pin them.
    if sequence_type is None or file_suffix is None:
        detected_type, detected_suffix = detect_sequence_type(input_path)
        sequence_type = sequence_type or detected_type
        file_suffix = file_suffix or detected_suffix

    seq_type_label = "protein" if sequence_type == "prot" else "genome"
    print(f"[pipeline] Input file: {input_path.name}")
    print(f"[pipeline] Detected type: {seq_type_label} ({sequence_type}), suffix: {file_suffix}")

    # Auto-detect the external database: only use it when both the directory
    # and its nested "db" folder exist; otherwise fall back to the pixi
    # environment's built-in database.
    if bttoxin_db_dir is None:
        default_db = Path(__file__).resolve().parents[1] / "external_dbs" / "bt_toxin"
        if default_db.exists() and (default_db / "db").exists():
            bttoxin_db_dir = default_db
            print(f"[pipeline] 使用外部数据库: {bttoxin_db_dir}")
        else:
            print("[pipeline] 未找到外部数据库,将使用 pixi 环境内置数据库")

    # Per-run working directories under out_root.
    digger_dir = out_root / "digger"
    shotter_dir = out_root / "shotter"
    logs_dir = out_root / "logs"
    stage_dir = out_root / "stage"
    for d in (digger_dir, shotter_dir, logs_dir, stage_dir):
        d.mkdir(parents=True, exist_ok=True)

    # Stage the single input file (Digger consumes a directory, not a file).
    staged_file = stage_dir / input_path.name
    shutil.copy2(input_path, staged_file)

    # 1) Run BtToxin_Digger via PixiRunner (pixi "digger" environment).
    runner = PixiRunner(env_name="digger")

    # Build parameters; the suffix parameter name differs by sequence type.
    digger_params = {
        "input_dir": stage_dir,
        "output_dir": digger_dir,
        "log_dir": logs_dir,
        "sequence_type": sequence_type,
        "threads": threads,
        "bttoxin_db_dir": bttoxin_db_dir,
    }

    # Protein inputs use "prot_suffix"; genome (scaffold) inputs use "scaf_suffix".
    if sequence_type == "prot":
        digger_params["prot_suffix"] = file_suffix
    else:
        digger_params["scaf_suffix"] = file_suffix

    result = runner.run_bttoxin_digger(**digger_params)

    # Digger failure: surface its error/exit code and include the execution
    # log contents (if present) to aid debugging from the caller side.
    if not result.get("success"):
        return {
            "ok": False,
            "stage": "digger",
            "error": result.get("error") or f"Digger failed (exit={result.get('exit_code')})",
            "logs": (logs_dir / "digger_execution.log").read_text(encoding="utf-8") if (logs_dir / "digger_execution.log").exists() else "",
        }

    # Digger's aggregate toxin table is the required input for Shotter.
    toxins_dir = digger_dir / "Results" / "Toxins"
    all_toxins = toxins_dir / "All_Toxins.txt"
    if not all_toxins.exists():
        return {"ok": False, "stage": "digger", "error": f"Missing All_Toxins.txt at {all_toxins}"}

    # 1.5) Run Context Analysis (BGC, Mobilome, CRISPR).
    print("[pipeline] Running Genome Context Analysis...")
    context_dir = out_root / "context"
    context_dir.mkdir(parents=True, exist_ok=True)

    # Define tool paths (relative to the project root, one level above scripts/).
    project_root = Path(__file__).resolve().parents[1]
    bgc_script = project_root / "tools" / "bgc_analysis" / "detect_bgc.py"
    mobi_script = project_root / "tools" / "mobilome_analysis" / "detect_mobilome.py"
    crispr_script = project_root / "tools" / "crispr_cas_analysis" / "scripts" / "detect_crispr.py"
    fusion_script = project_root / "tools" / "crispr_cas_analysis" / "scripts" / "fusion_analysis.py"

    # Define output JSON locations for each context tool.
    bgc_out = context_dir / "bgc.json"
    mobi_out = context_dir / "mobilome.json"
    crispr_out = context_dir / "crispr_results.json"
    fusion_out = context_dir / "fusion_results.json"

    # Execute tools (using pipeline env python).
    # NOTE(review): exit codes of these context tools are deliberately
    # ignored — a failed tool simply leaves its output file absent and the
    # Shotter step below skips it.
    # BGC
    if bgc_script.exists():
        _shell(["python", str(bgc_script), "--input", str(input_path), "--output", str(bgc_out)])

    # Mobilome
    if mobi_script.exists():
        _shell(["python", str(mobi_script), "--input", str(input_path), "--output", str(mobi_out)])

    # CRISPR Detection
    if crispr_script.exists():
        # Use --mock for now as per instructions
        _shell(["python", str(crispr_script), "--input", str(input_path), "--output", str(crispr_out), "--mock"])

    # CRISPR Fusion (needs CRISPR results + Digger results)
    if fusion_script.exists() and crispr_out.exists() and all_toxins.exists():
        _shell([
            "python", str(fusion_script),
            "--crispr-results", str(crispr_out),
            "--toxin-results", str(all_toxins),
            "--genome", str(input_path),
            "--output", str(fusion_out),
            "--mock"
        ])

    # 2) Run Shotter scoring via pixi run -e pipeline.
    shotter_dir.mkdir(parents=True, exist_ok=True)
    scripts_dir = Path(__file__).resolve().parents[0]
    pixi_project_dir = Path(__file__).resolve().parents[1]

    shoter_cmd = build_shotter_command(
        pixi_project_dir=pixi_project_dir,
        script_path=scripts_dir / "bttoxin_shoter.py",
        toxicity_csv=toxicity_csv,
        all_toxins=all_toxins,
        output_dir=shotter_dir,
        min_identity=min_identity,
        min_coverage=min_coverage,
        allow_unknown_families=allow_unknown_families,
        require_index_hit=require_index_hit,
        # Pass context results; each is only forwarded when the tool above
        # actually produced its output file.
        crispr_results=fusion_out if fusion_out.exists() else None,
        crispr_fusion=fusion_out.exists(),
        context_bgc=bgc_out if bgc_out.exists() else None,
        context_mobilome=mobi_out if mobi_out.exists() else None,
        context_crispr=crispr_out if crispr_out.exists() else None,
    )

    r1 = _shell(shoter_cmd)
    if r1.returncode != 0:
        return {"ok": False, "stage": "shotter", "error": f"Shotter failed: {' '.join(shoter_cmd)}"}

    # Shotter output tables consumed by the plotting step.
    strain_scores = shotter_dir / "strain_target_scores.tsv"
    toxin_support = shotter_dir / "toxin_support.tsv"
    species_scores = shotter_dir / "strain_target_species_scores.tsv"

    # 3) Plot & report via pixi run -e pipeline.
    strain_for_plot = _read_first_strain(strain_scores)
    plot_cmd = build_plot_command(
        pixi_project_dir=pixi_project_dir,
        script_path=scripts_dir / "plot_shotter.py",
        strain_scores=strain_scores,
        toxin_support=toxin_support,
        species_scores=species_scores,
        out_dir=shotter_dir,
        merge_unresolved=True,
        report_mode="paper",
        lang=lang,
        per_hit_strain=strain_for_plot if strain_for_plot else None,
    )

    r2 = _shell(plot_cmd)
    if r2.returncode != 0:
        print(f"[pipeline] Warning: Plotting/Report failed (exit={r2.returncode})")
        # Plotting/report is optional; continue to bundling regardless.
        pass

    # 4) Bundle the digger and shotter outputs into one archive.
    bundle = out_root / "pipeline_results.tar.gz"
    with tarfile.open(bundle, "w:gz") as tar:
        tar.add(digger_dir, arcname="digger")
        tar.add(shotter_dir, arcname="shotter")

    return {
        "ok": True,
        "digger_dir": str(digger_dir),
        "shotter_dir": str(shotter_dir),
        "bundle": str(bundle),
        "all_toxins": str(all_toxins),
        "strain": strain_for_plot,
        "sequence_type": sequence_type,
    }
|
||
|
||
|
||
def main() -> int:
    """CLI entry point: parse arguments, validate the input file, run the pipeline.

    Returns:
        0 on success, 1 on validation failure or any pipeline error.
    """
    ap = argparse.ArgumentParser(
        description="Run single-file Digger -> Shotter pipeline (pixi-based). "
                    "Supports genome (.fna/.fa/.fasta) and protein (.faa) files."
    )
    # Bug fix: --input was declared required=True, which made the hidden
    # --fna backward-compat fallback below unreachable (argparse exits
    # before the fallback runs). It is now optional; the explicit None
    # check below enforces that one of --input/--fna is given.
    ap.add_argument(
        "--input",
        type=Path,
        default=None,
        help="Path to input file (genome: .fna/.fa/.fasta, protein: .faa). "
             "Deprecated: --fna still works for backward compatibility."
    )
    ap.add_argument("--fna", type=Path, default=None, help=argparse.SUPPRESS)  # Hidden backward compat
    ap.add_argument("--toxicity_csv", type=Path, default=Path("Data/toxicity-data.csv"))
    ap.add_argument("--out_root", type=Path, default=Path("runs/single_run"))
    ap.add_argument("--min_identity", type=float, default=0.0)
    ap.add_argument("--min_coverage", type=float, default=0.0)
    ap.add_argument("--disallow_unknown_families", action="store_true", default=False)
    ap.add_argument("--require_index_hit", action="store_true", default=False)
    ap.add_argument("--lang", type=str, choices=["zh", "en"], default="zh")
    ap.add_argument("--bttoxin_db_dir", type=Path, default=None,
                    help="外部 bt_toxin 数据库目录路径(默认自动检测 external_dbs/bt_toxin)")
    ap.add_argument("--threads", type=int, default=4, help="线程数")
    args = ap.parse_args()

    # Backward compatibility: fall back to --fna when --input is absent.
    input_file = args.input if args.input is not None else args.fna

    if input_file is None:
        print("[pipeline] Error: --input argument is required")
        return 1

    # Validate the file extension before doing any work.
    if input_file.suffix.lower() not in ALL_EXTENSIONS:
        print(f"[pipeline] Error: Unsupported file extension: {input_file.suffix}")
        print(f"[pipeline] Supported: genome {GENOME_EXTENSIONS}, protein {PROTEIN_EXTENSIONS}")
        return 1

    # Derive a per-run default out_root from the input file stem so that
    # runs using the default flag value do not clobber each other.
    if str(args.out_root) == "runs/single_run":
        stem = input_file.stem
        args.out_root = Path("runs") / f"{stem}_run"

    res = run_single_file_pipeline(
        input_path=input_file,
        out_root=args.out_root,
        toxicity_csv=args.toxicity_csv,
        min_identity=args.min_identity,
        min_coverage=args.min_coverage,
        allow_unknown_families=not args.disallow_unknown_families,
        require_index_hit=args.require_index_hit,
        lang=args.lang,
        bttoxin_db_dir=args.bttoxin_db_dir,
        threads=args.threads,
    )

    if not res.get("ok"):
        print(f"[pipeline] FAILED at stage={res.get('stage')}: {res.get('error')}")
        # Print a truncated excerpt of the digger log, when one was captured.
        logs = res.get("logs")
        if logs:
            print(logs[:2000])
        return 1

    print("[pipeline] ✓ Done")
    print(f"  Input type: {res.get('sequence_type', 'unknown')}")
    print(f"  Digger: {res['digger_dir']}")
    print(f"  Shotter: {res['shotter_dir']}")
    print(f"  Bundle: {res['bundle']}")
    print(f"  Strain: {res.get('strain','')}")
    return 0
|
||
|
||
|
||
# Backward-compatibility alias: keep the historical function name importable
# for existing callers; it resolves to the generalized implementation above.
run_single_fna_pipeline = run_single_file_pipeline
|
||
|
||
|
||
if __name__ == "__main__":
|
||
raise SystemExit(main())
|