feat(scripts): add protein file (.faa) support to pipeline\n\n- Add detect_sequence_type() function for automatic file type detection\n- Rename run_single_fna_pipeline() to run_single_file_pipeline()\n- Add backward compatibility alias for existing callers\n- Add --input CLI argument (preferred over --fna)\n- Support .faa protein files alongside .fna/.fa/.fasta genome files\n- Auto-detect sequence type and pass correct suffix to BtToxin_Digger\n- Update documentation with protein file examples\n\nCo-Authored-By: Claude <noreply@anthropic.com>

2026-01-13 23:45:34 +08:00
parent 7379c98fac
commit 80e7d1ffb6
1 changed files with 116 additions and 28 deletions
--- a/scripts/run_single_fna_pipeline.py
+++ b/scripts/run_single_fna_pipeline.py
@@ -1,7 +1,9 @@
 #!/usr/bin/env python3
-"""Run a single-fna BtToxin_Digger -> Shotter -> Plots pipeline.
+"""Run a single-file BtToxin_Digger -> Shotter -> Plots pipeline.

- Input: one .fna file (nucleotide scaffold)
+Supports both genome (.fna/.fa/.fasta) and protein (.faa) files.
+
+- Input: one .fna/.fa/.fasta file (nucleotide scaffold) OR one .faa file (protein)
 - Steps:
  1) Stage this single file, run BtToxin_Digger via PixiRunner (pixi environment)
  2) Run Shotter scoring on Digger's All_Toxins.txt via pixi run -e pipeline
@@ -18,21 +20,28 @@ Notes
 - This script exposes CLI flags for Shotter filters to allow strict/loose runs.
 - 默认使用 external_dbs/bt_toxin 作为外部数据库（若存在）。

-Example
+Example (Genome file):
  python scripts/run_single_fna_pipeline.py \\
-    --fna tests/test_data/HAN055.fna \\
+    --input tests/test_data/HAN055.fna \\
    --toxicity_csv Data/toxicity-data.csv \\
    --out_root runs/HAN055_run \\
    --min_identity 0.50 --min_coverage 0.60 \\
    --disallow_unknown_families --require_index_hit --lang zh

+Example (Protein file):
+  python scripts/run_single_fna_pipeline.py \\
+    --input tests/test_data/proteins.faa \\
+    --toxicity_csv Data/toxicity-data.csv \\
+    --out_root runs/proteins_run \\
+    --min_identity 0.50 --min_coverage 0.60
+
  # 使用自定义数据库路径
  python scripts/run_single_fna_pipeline.py \\
-    --fna tests/test_data/HAN055.fna \\
+    --input tests/test_data/HAN055.fna \\
    --bttoxin_db_dir /path/to/custom/bt_toxin

  # 使用 pixi 任务运行
-  pixi run pipeline --fna tests/test_data/HAN055.fna
+  pixi run pipeline --input tests/test_data/HAN055.fna
 """
 from __future__ import annotations

@@ -51,6 +60,32 @@ sys.path.insert(0, str(Path(__file__).parent))
 from pixi_runner import PixiRunner, build_shotter_command, build_plot_command


+# Supported file extensions
+GENOME_EXTENSIONS = {".fna", ".fa", ".fasta"}
+PROTEIN_EXTENSIONS = {".faa"}
+ALL_EXTENSIONS = GENOME_EXTENSIONS | PROTEIN_EXTENSIONS
+
+
+def detect_sequence_type(file_path: Path) -> tuple[str, str]:
+    """
+    Classify a sequence file by its extension (compared case-insensitively).
+
+    Returns (sequence_type, suffix): sequence_type is "prot" for PROTEIN_EXTENSIONS
+    and "nucl" for GENOME_EXTENSIONS; suffix is the lowercased extension.
+    Raises ValueError when the extension is in neither set.
+    """
+    ext = file_path.suffix.lower()
+    if ext in PROTEIN_EXTENSIONS:
+        return "prot", ext
+    elif ext in GENOME_EXTENSIONS:
+        return "nucl", ext
+    else:
+        raise ValueError(
+            f"Unsupported file extension: {ext}. "
+            f"Supported: genome {GENOME_EXTENSIONS}, protein {PROTEIN_EXTENSIONS}"
+        )
+
+
 def _shell(cmd: list[str]) -> subprocess.CompletedProcess:
    return subprocess.run(cmd, text=True)

@@ -72,8 +107,8 @@ def _read_first_strain(strain_scores_tsv: Path) -> str:
    return ""


-def run_single_fna_pipeline(
-    fna_path: Path,
+def run_single_file_pipeline(
+    input_path: Path,
    out_root: Path,
    toxicity_csv: Path = Path("Data/toxicity-data.csv"),
    min_identity: float = 0.0,
@@ -83,11 +118,13 @@ def run_single_fna_pipeline(
    lang: str = "zh",
    bttoxin_db_dir: Path | None = None,
    threads: int = 4,
+    sequence_type: str | None = None,
+    file_suffix: str | None = None,
 ) -> Dict[str, Any]:
-    """运行单个 fna 文件的完整 pipeline（使用 pixi 环境）。
+    """运行单个文件（基因组或蛋白）的完整 pipeline（使用 pixi 环境）。

    Args:
-        fna_path: 输入 .fna 文件路径
+        input_path: 输入文件路径（.fna/.fa/.fasta 或 .faa）
        out_root: 输出根目录
        toxicity_csv: 毒性数据 CSV 文件路径
        min_identity: 最小 identity 阈值
@@ -98,11 +135,23 @@ def run_single_fna_pipeline(
        bttoxin_db_dir: 外部 bt_toxin 数据库目录。若为 None，则自动检测
                       项目根目录下的 external_dbs/bt_toxin。
        threads: 线程数
+        sequence_type: 序列类型 ("nucl" 或 "prot")，若为 None 则自动检测
+        file_suffix: 文件后缀，若为 None 则自动检测
    """
-    fna_path = fna_path.resolve()
+    input_path = input_path.resolve()
    out_root = out_root.resolve()
    out_root.mkdir(parents=True, exist_ok=True)

+    # 自动检测序列类型
+    if sequence_type is None or file_suffix is None:
+        detected_type, detected_suffix = detect_sequence_type(input_path)
+        sequence_type = sequence_type or detected_type
+        file_suffix = file_suffix or detected_suffix
+
+    seq_type_label = "protein" if sequence_type == "prot" else "genome"
+    print(f"[pipeline] Input file: {input_path.name}")
+    print(f"[pipeline] Detected type: {seq_type_label} ({sequence_type}), suffix: {file_suffix}")
+
    # 自动检测外部数据库
    if bttoxin_db_dir is None:
        default_db = Path(__file__).resolve().parents[1] / "external_dbs" / "bt_toxin"
@@ -120,20 +169,30 @@ def run_single_fna_pipeline(
        d.mkdir(parents=True, exist_ok=True)

    # Stage single input file
-    staged_fna = stage_dir / fna_path.name
-    shutil.copy2(fna_path, staged_fna)
+    staged_file = stage_dir / input_path.name
+    shutil.copy2(input_path, staged_file)

    # 1) Run BtToxin_Digger via PixiRunner (pixi digger environment)
    runner = PixiRunner(env_name="digger")
-    result = runner.run_bttoxin_digger(
-        input_dir=stage_dir,
-        output_dir=digger_dir,
-        log_dir=logs_dir,
-        sequence_type="nucl",
-        scaf_suffix=fna_path.suffix or ".fna",
-        threads=threads,
-        bttoxin_db_dir=bttoxin_db_dir,
-    )
+
+    # 构建参数：根据序列类型使用不同的参数名
+    digger_params = {
+        "input_dir": stage_dir,
+        "output_dir": digger_dir,
+        "log_dir": logs_dir,
+        "sequence_type": sequence_type,
+        "threads": threads,
+        "bttoxin_db_dir": bttoxin_db_dir,
+    }
+
+    # 根据序列类型添加相应的后缀参数
+    if sequence_type == "prot":
+        digger_params["prot_suffix"] = file_suffix
+    else:
+        digger_params["scaf_suffix"] = file_suffix
+
+    result = runner.run_bttoxin_digger(**digger_params)
+
    if not result.get("success"):
        return {
            "ok": False,
@@ -151,7 +210,7 @@ def run_single_fna_pipeline(
    shotter_dir.mkdir(parents=True, exist_ok=True)
    scripts_dir = Path(__file__).resolve().parents[0]
    pixi_project_dir = Path(__file__).resolve().parents[1]
-    
+
    shoter_cmd = build_shotter_command(
        pixi_project_dir=pixi_project_dir,
        script_path=scripts_dir / "bttoxin_shoter.py",
@@ -205,12 +264,23 @@ def run_single_fna_pipeline(
        "bundle": str(bundle),
        "all_toxins": str(all_toxins),
        "strain": strain_for_plot,
+        "sequence_type": sequence_type,
    }


 def main() -> int:
-    ap = argparse.ArgumentParser(description="Run single-fna Digger -> Shotter pipeline (pixi-based)")
-    ap.add_argument("--fna", type=Path, required=True, help="Path to a single .fna file")
+    ap = argparse.ArgumentParser(
+        description="Run single-file Digger -> Shotter pipeline (pixi-based). "
+                    "Supports genome (.fna/.fa/.fasta) and protein (.faa) files."
+    )
+    ap.add_argument(
+        "--input",
+        type=Path,
+        default=None,  # must not be required: legacy --fna-only calls have to parse; main() rejects the case where both flags are missing
+        help="Path to input file (genome: .fna/.fa/.fasta, protein: .faa). "
+             "Deprecated: --fna still works for backward compatibility."
+    )
+    ap.add_argument("--fna", type=Path, default=None, help=argparse.SUPPRESS)  # Hidden backward compat
    ap.add_argument("--toxicity_csv", type=Path, default=Path("Data/toxicity-data.csv"))
    ap.add_argument("--out_root", type=Path, default=Path("runs/single_run"))
    ap.add_argument("--min_identity", type=float, default=0.0)
@@ -223,13 +293,26 @@ def main() -> int:
    ap.add_argument("--threads", type=int, default=4, help="线程数")
    args = ap.parse_args()

+    # Backward compatibility: --fna flag
+    input_file = args.input if args.input is not None else args.fna
+
+    if input_file is None:
+        print("[pipeline] Error: --input argument is required")
+        return 1
+
+    # 验证文件扩展名
+    if input_file.suffix.lower() not in ALL_EXTENSIONS:
+        print(f"[pipeline] Error: Unsupported file extension: {input_file.suffix}")
+        print(f"[pipeline] Supported: genome {GENOME_EXTENSIONS}, protein {PROTEIN_EXTENSIONS}")
+        return 1
+
    # derive per-run default out_root using file stem
    if str(args.out_root) == "runs/single_run":
-        stem = args.fna.stem
+        stem = input_file.stem
        args.out_root = Path("runs") / f"{stem}_run"

-    res = run_single_fna_pipeline(
-        fna_path=args.fna,
+    res = run_single_file_pipeline(
+        input_path=input_file,
        out_root=args.out_root,
        toxicity_csv=args.toxicity_csv,
        min_identity=args.min_identity,
@@ -249,6 +332,7 @@ def main() -> int:
        return 1

    print("[pipeline] ✓ Done")
+    print(f"  Input type: {res.get('sequence_type', 'unknown')}")
    print(f"  Digger: {res['digger_dir']}")
    print(f"  Shotter: {res['shotter_dir']}")
    print(f"  Bundle: {res['bundle']}")
@@ -256,5 +340,9 @@ def main() -> int:
    return 0


+# Backward compatibility alias
+run_single_fna_pipeline = run_single_file_pipeline
+
+
 if __name__ == "__main__":
    raise SystemExit(main())