diff --git a/scripts/run_single_fna_pipeline.py b/scripts/run_single_fna_pipeline.py
index 99db90b..a7db061 100644
--- a/scripts/run_single_fna_pipeline.py
+++ b/scripts/run_single_fna_pipeline.py
@@ -1,7 +1,9 @@
 #!/usr/bin/env python3
-"""Run a single-fna BtToxin_Digger -> Shotter -> Plots pipeline.
+"""Run a single-file BtToxin_Digger -> Shotter -> Plots pipeline.
 
-- Input: one .fna file (nucleotide scaffold)
+Supports both genome (.fna/.fa) and protein (.faa) files.
+
+- Input: one .fna/.fa file (nucleotide scaffold) OR one .faa file (protein)
 - Steps:
   1) Stage this single file, run BtToxin_Digger via PixiRunner (pixi environment)
   2) Run Shotter scoring on Digger's All_Toxins.txt via pixi run -e pipeline
@@ -18,21 +20,28 @@ Notes
 - This script exposes CLI flags for Shotter filters to allow strict/loose runs.
 - 默认使用 external_dbs/bt_toxin 作为外部数据库(若存在)。
 
-Example
+Example (Genome file):
     python scripts/run_single_fna_pipeline.py \\
-      --fna tests/test_data/HAN055.fna \\
+      --input tests/test_data/HAN055.fna \\
       --toxicity_csv Data/toxicity-data.csv \\
       --out_root runs/HAN055_run \\
       --min_identity 0.50 --min_coverage 0.60 \\
       --disallow_unknown_families --require_index_hit --lang zh
 
+Example (Protein file):
+    python scripts/run_single_fna_pipeline.py \\
+      --input tests/test_data/proteins.faa \\
+      --toxicity_csv Data/toxicity-data.csv \\
+      --out_root runs/proteins_run \\
+      --min_identity 0.50 --min_coverage 0.60
+
     # 使用自定义数据库路径
     python scripts/run_single_fna_pipeline.py \\
-      --fna tests/test_data/HAN055.fna \\
+      --input tests/test_data/HAN055.fna \\
       --bttoxin_db_dir /path/to/custom/bt_toxin
 
     # 使用 pixi 任务运行
-    pixi run pipeline --fna tests/test_data/HAN055.fna
+    pixi run pipeline --input tests/test_data/HAN055.fna
 """
 
 from __future__ import annotations
@@ -51,6 +60,32 @@ sys.path.insert(0, str(Path(__file__).parent))
 
 from pixi_runner import PixiRunner, build_shotter_command, build_plot_command
 
+# Supported file extensions
+GENOME_EXTENSIONS = {".fna", ".fa", ".fasta"}
+PROTEIN_EXTENSIONS = {".faa"}
+ALL_EXTENSIONS = GENOME_EXTENSIONS | PROTEIN_EXTENSIONS
+
+
+def detect_sequence_type(file_path: Path) -> tuple[str, str]:
+    """
+    检测文件类型并返回 (sequence_type, suffix)。
+
+    Returns:
+        (sequence_type, suffix): where sequence_type is "nucl" or "prot",
+        and suffix is the file extension (e.g., ".fna", ".faa")
+    """
+    ext = file_path.suffix.lower()
+    if ext in PROTEIN_EXTENSIONS:
+        return "prot", ext
+    elif ext in GENOME_EXTENSIONS:
+        return "nucl", ext
+    else:
+        raise ValueError(
+            f"Unsupported file extension: {ext}. "
+            f"Supported: genome {GENOME_EXTENSIONS}, protein {PROTEIN_EXTENSIONS}"
+        )
+
+
 def _shell(cmd: list[str]) -> subprocess.CompletedProcess:
     return subprocess.run(cmd, text=True)
 
@@ -72,8 +107,8 @@ def _read_first_strain(strain_scores_tsv: Path) -> str:
     return ""
 
 
-def run_single_fna_pipeline(
-    fna_path: Path,
+def run_single_file_pipeline(
+    input_path: Path,
     out_root: Path,
     toxicity_csv: Path = Path("Data/toxicity-data.csv"),
     min_identity: float = 0.0,
@@ -83,11 +118,13 @@
     lang: str = "zh",
     bttoxin_db_dir: Path | None = None,
     threads: int = 4,
+    sequence_type: str | None = None,
+    file_suffix: str | None = None,
 ) -> Dict[str, Any]:
-    """运行单个 fna 文件的完整 pipeline(使用 pixi 环境)。
+    """运行单个文件(基因组或蛋白)的完整 pipeline(使用 pixi 环境)。
 
     Args:
-        fna_path: 输入 .fna 文件路径
+        input_path: 输入文件路径(.fna/.fa/.fasta 或 .faa)
         out_root: 输出根目录
         toxicity_csv: 毒性数据 CSV 文件路径
         min_identity: 最小 identity 阈值
@@ -98,11 +135,23 @@
         bttoxin_db_dir: 外部 bt_toxin 数据库目录。若为 None,则自动检测
             项目根目录下的 external_dbs/bt_toxin。
         threads: 线程数
+        sequence_type: 序列类型 ("nucl" 或 "prot"),若为 None 则自动检测
+        file_suffix: 文件后缀,若为 None 则自动检测
     """
-    fna_path = fna_path.resolve()
+    input_path = input_path.resolve()
     out_root = out_root.resolve()
     out_root.mkdir(parents=True, exist_ok=True)
 
+    # 自动检测序列类型
+    if sequence_type is None or file_suffix is None:
+        detected_type, detected_suffix = detect_sequence_type(input_path)
+        sequence_type = sequence_type or detected_type
+        file_suffix = file_suffix or detected_suffix
+
+    seq_type_label = "protein" if sequence_type == "prot" else "genome"
+    print(f"[pipeline] Input file: {input_path.name}")
+    print(f"[pipeline] Detected type: {seq_type_label} ({sequence_type}), suffix: {file_suffix}")
+
     # 自动检测外部数据库
     if bttoxin_db_dir is None:
         default_db = Path(__file__).resolve().parents[1] / "external_dbs" / "bt_toxin"
@@ -120,20 +169,30 @@
         d.mkdir(parents=True, exist_ok=True)
 
     # Stage single input file
-    staged_fna = stage_dir / fna_path.name
-    shutil.copy2(fna_path, staged_fna)
+    staged_file = stage_dir / input_path.name
+    shutil.copy2(input_path, staged_file)
 
     # 1) Run BtToxin_Digger via PixiRunner (pixi digger environment)
     runner = PixiRunner(env_name="digger")
-    result = runner.run_bttoxin_digger(
-        input_dir=stage_dir,
-        output_dir=digger_dir,
-        log_dir=logs_dir,
-        sequence_type="nucl",
-        scaf_suffix=fna_path.suffix or ".fna",
-        threads=threads,
-        bttoxin_db_dir=bttoxin_db_dir,
-    )
+
+    # 构建参数:根据序列类型使用不同的参数名
+    digger_params = {
+        "input_dir": stage_dir,
+        "output_dir": digger_dir,
+        "log_dir": logs_dir,
+        "sequence_type": sequence_type,
+        "threads": threads,
+        "bttoxin_db_dir": bttoxin_db_dir,
+    }
+
+    # 根据序列类型添加相应的后缀参数
+    if sequence_type == "prot":
+        digger_params["prot_suffix"] = file_suffix
+    else:
+        digger_params["scaf_suffix"] = file_suffix
+
+    result = runner.run_bttoxin_digger(**digger_params)
+
     if not result.get("success"):
         return {
             "ok": False,
@@ -151,7 +210,7 @@
     shotter_dir.mkdir(parents=True, exist_ok=True)
     scripts_dir = Path(__file__).resolve().parents[0]
     pixi_project_dir = Path(__file__).resolve().parents[1]
-    
+
     shoter_cmd = build_shotter_command(
         pixi_project_dir=pixi_project_dir,
         script_path=scripts_dir / "bttoxin_shoter.py",
@@ -205,12 +264,23 @@
         "bundle": str(bundle),
         "all_toxins": str(all_toxins),
         "strain": strain_for_plot,
+        "sequence_type": sequence_type,
     }
 
 
 def main() -> int:
-    ap = argparse.ArgumentParser(description="Run single-fna Digger -> Shotter pipeline (pixi-based)")
-    ap.add_argument("--fna", type=Path, required=True, help="Path to a single .fna file")
+    ap = argparse.ArgumentParser(
+        description="Run single-file Digger -> Shotter pipeline (pixi-based). "
+        "Supports genome (.fna/.fa/.fasta) and protein (.faa) files."
+    )
+    ap.add_argument(
+        "--input",
+        type=Path,
+        required=False,
+        help="Path to input file (genome: .fna/.fa/.fasta, protein: .faa). "
+        "Deprecated: --fna still works for backward compatibility."
+    )
+    ap.add_argument("--fna", type=Path, default=None, help=argparse.SUPPRESS)  # Hidden backward compat
     ap.add_argument("--toxicity_csv", type=Path, default=Path("Data/toxicity-data.csv"))
     ap.add_argument("--out_root", type=Path, default=Path("runs/single_run"))
     ap.add_argument("--min_identity", type=float, default=0.0)
@@ -223,13 +293,26 @@
     ap.add_argument("--threads", type=int, default=4, help="线程数")
     args = ap.parse_args()
 
+    # Backward compatibility: --fna flag
+    input_file = args.input if args.input is not None else args.fna
+
+    if input_file is None:
+        print("[pipeline] Error: --input argument is required")
+        return 1
+
+    # 验证文件扩展名
+    if input_file.suffix.lower() not in ALL_EXTENSIONS:
+        print(f"[pipeline] Error: Unsupported file extension: {input_file.suffix}")
+        print(f"[pipeline] Supported: genome {GENOME_EXTENSIONS}, protein {PROTEIN_EXTENSIONS}")
+        return 1
+
     # derive per-run default out_root using file stem
     if str(args.out_root) == "runs/single_run":
-        stem = args.fna.stem
+        stem = input_file.stem
         args.out_root = Path("runs") / f"{stem}_run"
 
-    res = run_single_fna_pipeline(
-        fna_path=args.fna,
+    res = run_single_file_pipeline(
+        input_path=input_file,
         out_root=args.out_root,
         toxicity_csv=args.toxicity_csv,
         min_identity=args.min_identity,
@@ -249,6 +332,7 @@
         return 1
 
     print("[pipeline] ✓ Done")
+    print(f"  Input type: {res.get('sequence_type', 'unknown')}")
    print(f"  Digger: {res['digger_dir']}")
     print(f"  Shotter: {res['shotter_dir']}")
     print(f"  Bundle: {res['bundle']}")
@@ -256,5 +340,9 @@
     return 0
 
 
+# Backward compatibility alias
+run_single_fna_pipeline = run_single_file_pipeline
+
+
 if __name__ == "__main__":
     raise SystemExit(main())