feat(scripts): add protein file (.faa) support to pipeline

- Add detect_sequence_type() function for automatic file type detection
- Rename run_single_fna_pipeline() to run_single_file_pipeline()
- Add backward compatibility alias for existing callers
- Add --input CLI argument (preferred over --fna)
- Support .faa protein files alongside .fna/.fa/.fasta genome files
- Auto-detect sequence type and pass correct suffix to BtToxin_Digger
- Update documentation with protein file examples

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -1,7 +1,9 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""Run a single-fna BtToxin_Digger -> Shotter -> Plots pipeline.
|
"""Run a single-file BtToxin_Digger -> Shotter -> Plots pipeline.
|
||||||
|
|
||||||
- Input: one .fna file (nucleotide scaffold)
|
Supports both genome (.fna/.fa) and protein (.faa) files.
|
||||||
|
|
||||||
|
- Input: one .fna/.fa file (nucleotide scaffold) OR one .faa file (protein)
|
||||||
- Steps:
|
- Steps:
|
||||||
1) Stage this single file, run BtToxin_Digger via PixiRunner (pixi environment)
|
1) Stage this single file, run BtToxin_Digger via PixiRunner (pixi environment)
|
||||||
2) Run Shotter scoring on Digger's All_Toxins.txt via pixi run -e pipeline
|
2) Run Shotter scoring on Digger's All_Toxins.txt via pixi run -e pipeline
|
||||||
@@ -18,21 +20,28 @@ Notes
|
|||||||
- This script exposes CLI flags for Shotter filters to allow strict/loose runs.
|
- This script exposes CLI flags for Shotter filters to allow strict/loose runs.
|
||||||
- 默认使用 external_dbs/bt_toxin 作为外部数据库(若存在)。
|
- 默认使用 external_dbs/bt_toxin 作为外部数据库(若存在)。
|
||||||
|
|
||||||
Example
|
Example (Genome file):
|
||||||
python scripts/run_single_fna_pipeline.py \\
|
python scripts/run_single_fna_pipeline.py \\
|
||||||
--fna tests/test_data/HAN055.fna \\
|
--input tests/test_data/HAN055.fna \\
|
||||||
--toxicity_csv Data/toxicity-data.csv \\
|
--toxicity_csv Data/toxicity-data.csv \\
|
||||||
--out_root runs/HAN055_run \\
|
--out_root runs/HAN055_run \\
|
||||||
--min_identity 0.50 --min_coverage 0.60 \\
|
--min_identity 0.50 --min_coverage 0.60 \\
|
||||||
--disallow_unknown_families --require_index_hit --lang zh
|
--disallow_unknown_families --require_index_hit --lang zh
|
||||||
|
|
||||||
|
Example (Protein file):
|
||||||
|
python scripts/run_single_fna_pipeline.py \\
|
||||||
|
--input tests/test_data/proteins.faa \\
|
||||||
|
--toxicity_csv Data/toxicity-data.csv \\
|
||||||
|
--out_root runs/proteins_run \\
|
||||||
|
--min_identity 0.50 --min_coverage 0.60
|
||||||
|
|
||||||
# 使用自定义数据库路径
|
# 使用自定义数据库路径
|
||||||
python scripts/run_single_fna_pipeline.py \\
|
python scripts/run_single_fna_pipeline.py \\
|
||||||
--fna tests/test_data/HAN055.fna \\
|
--input tests/test_data/HAN055.fna \\
|
||||||
--bttoxin_db_dir /path/to/custom/bt_toxin
|
--bttoxin_db_dir /path/to/custom/bt_toxin
|
||||||
|
|
||||||
# 使用 pixi 任务运行
|
# 使用 pixi 任务运行
|
||||||
pixi run pipeline --fna tests/test_data/HAN055.fna
|
pixi run pipeline --input tests/test_data/HAN055.fna
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
@@ -51,6 +60,32 @@ sys.path.insert(0, str(Path(__file__).parent))
|
|||||||
from pixi_runner import PixiRunner, build_shotter_command, build_plot_command
|
from pixi_runner import PixiRunner, build_shotter_command, build_plot_command
|
||||||
|
|
||||||
|
|
||||||
|
# Supported file extensions (lower-case, leading dot included).
GENOME_EXTENSIONS = {".fna", ".fa", ".fasta"}
PROTEIN_EXTENSIONS = {".faa"}
ALL_EXTENSIONS = GENOME_EXTENSIONS | PROTEIN_EXTENSIONS


def detect_sequence_type(file_path: Path) -> tuple[str, str]:
    """Classify an input file as nucleotide or protein from its extension.

    Only the final suffix of ``file_path`` is inspected, after lower-casing;
    the file contents are never read.

    Args:
        file_path: Path to the input sequence file.

    Returns:
        A ``(sequence_type, suffix)`` tuple where ``sequence_type`` is
        ``"nucl"`` or ``"prot"`` and ``suffix`` is the lower-cased extension
        including the dot (e.g. ``".fna"``, ``".faa"``).

    Raises:
        ValueError: If the extension is missing or is in neither
            ``GENOME_EXTENSIONS`` nor ``PROTEIN_EXTENSIONS``.
    """
    # NOTE(review): the returned suffix is lower-cased; if it is later handed
    # to a case-sensitive tool for a file kept under its original name, an
    # upper-case extension may not match -- confirm against the caller.
    ext = file_path.suffix.lower()
    if ext in PROTEIN_EXTENSIONS:
        return "prot", ext
    if ext in GENOME_EXTENSIONS:
        return "nucl", ext

    # Sort before formatting: interpolating the raw sets would make the
    # message order depend on per-process string hashing.
    genome = ", ".join(sorted(GENOME_EXTENSIONS))
    protein = ", ".join(sorted(PROTEIN_EXTENSIONS))
    shown = ext or "(none)"
    raise ValueError(
        f"Unsupported file extension: {shown}. "
        f"Supported: genome {genome}, protein {protein}"
    )
|
||||||
|
|
||||||
|
|
||||||
def _shell(cmd: list[str]) -> subprocess.CompletedProcess:
|
def _shell(cmd: list[str]) -> subprocess.CompletedProcess:
|
||||||
return subprocess.run(cmd, text=True)
|
return subprocess.run(cmd, text=True)
|
||||||
|
|
||||||
@@ -72,8 +107,8 @@ def _read_first_strain(strain_scores_tsv: Path) -> str:
|
|||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
def run_single_fna_pipeline(
|
def run_single_file_pipeline(
|
||||||
fna_path: Path,
|
input_path: Path,
|
||||||
out_root: Path,
|
out_root: Path,
|
||||||
toxicity_csv: Path = Path("Data/toxicity-data.csv"),
|
toxicity_csv: Path = Path("Data/toxicity-data.csv"),
|
||||||
min_identity: float = 0.0,
|
min_identity: float = 0.0,
|
||||||
@@ -83,11 +118,13 @@ def run_single_fna_pipeline(
|
|||||||
lang: str = "zh",
|
lang: str = "zh",
|
||||||
bttoxin_db_dir: Path | None = None,
|
bttoxin_db_dir: Path | None = None,
|
||||||
threads: int = 4,
|
threads: int = 4,
|
||||||
|
sequence_type: str | None = None,
|
||||||
|
file_suffix: str | None = None,
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
"""运行单个 fna 文件的完整 pipeline(使用 pixi 环境)。
|
"""运行单个文件(基因组或蛋白)的完整 pipeline(使用 pixi 环境)。
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
fna_path: 输入 .fna 文件路径
|
input_path: 输入文件路径(.fna/.fa/.fasta 或 .faa)
|
||||||
out_root: 输出根目录
|
out_root: 输出根目录
|
||||||
toxicity_csv: 毒性数据 CSV 文件路径
|
toxicity_csv: 毒性数据 CSV 文件路径
|
||||||
min_identity: 最小 identity 阈值
|
min_identity: 最小 identity 阈值
|
||||||
@@ -98,11 +135,23 @@ def run_single_fna_pipeline(
|
|||||||
bttoxin_db_dir: 外部 bt_toxin 数据库目录。若为 None,则自动检测
|
bttoxin_db_dir: 外部 bt_toxin 数据库目录。若为 None,则自动检测
|
||||||
项目根目录下的 external_dbs/bt_toxin。
|
项目根目录下的 external_dbs/bt_toxin。
|
||||||
threads: 线程数
|
threads: 线程数
|
||||||
|
sequence_type: 序列类型 ("nucl" 或 "prot"),若为 None 则自动检测
|
||||||
|
file_suffix: 文件后缀,若为 None 则自动检测
|
||||||
"""
|
"""
|
||||||
fna_path = fna_path.resolve()
|
input_path = input_path.resolve()
|
||||||
out_root = out_root.resolve()
|
out_root = out_root.resolve()
|
||||||
out_root.mkdir(parents=True, exist_ok=True)
|
out_root.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# 自动检测序列类型
|
||||||
|
if sequence_type is None or file_suffix is None:
|
||||||
|
detected_type, detected_suffix = detect_sequence_type(input_path)
|
||||||
|
sequence_type = sequence_type or detected_type
|
||||||
|
file_suffix = file_suffix or detected_suffix
|
||||||
|
|
||||||
|
seq_type_label = "protein" if sequence_type == "prot" else "genome"
|
||||||
|
print(f"[pipeline] Input file: {input_path.name}")
|
||||||
|
print(f"[pipeline] Detected type: {seq_type_label} ({sequence_type}), suffix: {file_suffix}")
|
||||||
|
|
||||||
# 自动检测外部数据库
|
# 自动检测外部数据库
|
||||||
if bttoxin_db_dir is None:
|
if bttoxin_db_dir is None:
|
||||||
default_db = Path(__file__).resolve().parents[1] / "external_dbs" / "bt_toxin"
|
default_db = Path(__file__).resolve().parents[1] / "external_dbs" / "bt_toxin"
|
||||||
@@ -120,20 +169,30 @@ def run_single_fna_pipeline(
|
|||||||
d.mkdir(parents=True, exist_ok=True)
|
d.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
# Stage single input file
|
# Stage single input file
|
||||||
staged_fna = stage_dir / fna_path.name
|
staged_file = stage_dir / input_path.name
|
||||||
shutil.copy2(fna_path, staged_fna)
|
shutil.copy2(input_path, staged_file)
|
||||||
|
|
||||||
# 1) Run BtToxin_Digger via PixiRunner (pixi digger environment)
|
# 1) Run BtToxin_Digger via PixiRunner (pixi digger environment)
|
||||||
runner = PixiRunner(env_name="digger")
|
runner = PixiRunner(env_name="digger")
|
||||||
result = runner.run_bttoxin_digger(
|
|
||||||
input_dir=stage_dir,
|
# 构建参数:根据序列类型使用不同的参数名
|
||||||
output_dir=digger_dir,
|
digger_params = {
|
||||||
log_dir=logs_dir,
|
"input_dir": stage_dir,
|
||||||
sequence_type="nucl",
|
"output_dir": digger_dir,
|
||||||
scaf_suffix=fna_path.suffix or ".fna",
|
"log_dir": logs_dir,
|
||||||
threads=threads,
|
"sequence_type": sequence_type,
|
||||||
bttoxin_db_dir=bttoxin_db_dir,
|
"threads": threads,
|
||||||
)
|
"bttoxin_db_dir": bttoxin_db_dir,
|
||||||
|
}
|
||||||
|
|
||||||
|
# 根据序列类型添加相应的后缀参数
|
||||||
|
if sequence_type == "prot":
|
||||||
|
digger_params["prot_suffix"] = file_suffix
|
||||||
|
else:
|
||||||
|
digger_params["scaf_suffix"] = file_suffix
|
||||||
|
|
||||||
|
result = runner.run_bttoxin_digger(**digger_params)
|
||||||
|
|
||||||
if not result.get("success"):
|
if not result.get("success"):
|
||||||
return {
|
return {
|
||||||
"ok": False,
|
"ok": False,
|
||||||
@@ -205,12 +264,23 @@ def run_single_fna_pipeline(
|
|||||||
"bundle": str(bundle),
|
"bundle": str(bundle),
|
||||||
"all_toxins": str(all_toxins),
|
"all_toxins": str(all_toxins),
|
||||||
"strain": strain_for_plot,
|
"strain": strain_for_plot,
|
||||||
|
"sequence_type": sequence_type,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def main() -> int:
|
def main() -> int:
|
||||||
# Build the CLI parser for the single-file pipeline.
ap = argparse.ArgumentParser(
    description="Run single-file Digger -> Shotter pipeline (pixi-based). "
                "Supports genome (.fna/.fa/.fasta) and protein (.faa) files."
)
# --input is intentionally NOT required at the argparse level: marking it
# required=True makes argparse reject a command line that only passes the
# legacy --fna flag, which defeats the backward-compatibility fallback.
# The effective path is resolved from --input / --fna after parsing, and a
# missing value is reported there.
ap.add_argument(
    "--input",
    type=Path,
    default=None,
    help="Path to input file (genome: .fna/.fa/.fasta, protein: .faa). "
         "Deprecated: --fna still works for backward compatibility."
)
ap.add_argument("--fna", type=Path, default=None, help=argparse.SUPPRESS)  # Hidden backward compat
|
||||||
ap.add_argument("--toxicity_csv", type=Path, default=Path("Data/toxicity-data.csv"))
|
ap.add_argument("--toxicity_csv", type=Path, default=Path("Data/toxicity-data.csv"))
|
||||||
ap.add_argument("--out_root", type=Path, default=Path("runs/single_run"))
|
ap.add_argument("--out_root", type=Path, default=Path("runs/single_run"))
|
||||||
ap.add_argument("--min_identity", type=float, default=0.0)
|
ap.add_argument("--min_identity", type=float, default=0.0)
|
||||||
@@ -223,13 +293,26 @@ def main() -> int:
|
|||||||
ap.add_argument("--threads", type=int, default=4, help="线程数")
|
ap.add_argument("--threads", type=int, default=4, help="线程数")
|
||||||
args = ap.parse_args()
|
args = ap.parse_args()
|
||||||
|
|
||||||
|
# Backward compatibility: --fna flag
|
||||||
|
input_file = args.input if args.input is not None else args.fna
|
||||||
|
|
||||||
|
if input_file is None:
|
||||||
|
print("[pipeline] Error: --input argument is required")
|
||||||
|
return 1
|
||||||
|
|
||||||
|
# 验证文件扩展名
|
||||||
|
if input_file.suffix.lower() not in ALL_EXTENSIONS:
|
||||||
|
print(f"[pipeline] Error: Unsupported file extension: {input_file.suffix}")
|
||||||
|
print(f"[pipeline] Supported: genome {GENOME_EXTENSIONS}, protein {PROTEIN_EXTENSIONS}")
|
||||||
|
return 1
|
||||||
|
|
||||||
# derive per-run default out_root using file stem
|
# derive per-run default out_root using file stem
|
||||||
if str(args.out_root) == "runs/single_run":
|
if str(args.out_root) == "runs/single_run":
|
||||||
stem = args.fna.stem
|
stem = input_file.stem
|
||||||
args.out_root = Path("runs") / f"{stem}_run"
|
args.out_root = Path("runs") / f"{stem}_run"
|
||||||
|
|
||||||
res = run_single_fna_pipeline(
|
res = run_single_file_pipeline(
|
||||||
fna_path=args.fna,
|
input_path=input_file,
|
||||||
out_root=args.out_root,
|
out_root=args.out_root,
|
||||||
toxicity_csv=args.toxicity_csv,
|
toxicity_csv=args.toxicity_csv,
|
||||||
min_identity=args.min_identity,
|
min_identity=args.min_identity,
|
||||||
@@ -249,6 +332,7 @@ def main() -> int:
|
|||||||
return 1
|
return 1
|
||||||
|
|
||||||
print("[pipeline] ✓ Done")
|
print("[pipeline] ✓ Done")
|
||||||
|
print(f" Input type: {res.get('sequence_type', 'unknown')}")
|
||||||
print(f" Digger: {res['digger_dir']}")
|
print(f" Digger: {res['digger_dir']}")
|
||||||
print(f" Shotter: {res['shotter_dir']}")
|
print(f" Shotter: {res['shotter_dir']}")
|
||||||
print(f" Bundle: {res['bundle']}")
|
print(f" Bundle: {res['bundle']}")
|
||||||
@@ -256,5 +340,9 @@ def main() -> int:
|
|||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
# Backward compatibility alias
|
||||||
|
run_single_fna_pipeline = run_single_file_pipeline
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
raise SystemExit(main())
|
raise SystemExit(main())
|
||||||
|
|||||||
Reference in New Issue
Block a user