feat: migrate from Docker to pixi for BtToxin_Digger execution
- Add pixi.toml with digger and pipeline environments
- Implement PixiRunner class replacing DockerContainerManager
- Add run_digger_stage.py for standalone digger execution
- Update run_single_fna_pipeline.py to use PixiRunner
- Remove docker dependency from pyproject.toml
- Delete docker_client.py (no longer needed)

BREAKING CHANGE: Docker is no longer required. Use 'pixi install' instead.
This commit is contained in:
@@ -1,405 +0,0 @@
|
||||
"""Docker/Podman 容器管理(修正版,支持 arm64/macOS 与 linux/amd64)"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional, List
|
||||
import sys
|
||||
|
||||
try:
|
||||
import docker # type: ignore
|
||||
except Exception: # 允许在无 docker SDK 环境下使用 podman fallback
|
||||
docker = None # type: ignore
|
||||
|
||||
from ..core.config import settings
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _which(cmd: str) -> Optional[str]:
    """Return the path of ``cmd`` as resolved through PATH, or ``None`` if absent."""
    import shutil

    return shutil.which(cmd)
|
||||
|
||||
|
||||
class DockerContainerManager:
    """Container manager that works with either Docker or Podman.

    The docker SDK is tried first; if it cannot be used the manager falls
    back to the ``podman`` CLI and then to the ``docker`` CLI.  The platform
    passed to the engine comes from the ``platform`` argument (the original
    note says images run as ``linux/amd64`` by default on arm64 hosts).
    """

    def __init__(
        self,
        image: str = settings.DOCKER_IMAGE,
        platform: str = settings.DOCKER_PLATFORM,
    ) -> None:
        """Select a container engine and make sure the image is available.

        Args:
            image: Image reference to run.
            platform: Value handed to the engine as the target platform.

        Raises:
            RuntimeError: The docker SDK is unusable and neither ``podman``
                nor ``docker`` is found on PATH.
        """
        self.image = image
        self.platform = platform

        self._engine: str = "docker"
        self._client = None

        # Preferred route: the docker-py client, kept only if the daemon answers a ping.
        if docker is not None:
            try:
                candidate = docker.from_env()
                candidate.ping()
            except Exception as err:
                logger.info(f"docker SDK 不可用,将尝试 CLI 回落: {err}")
                self._client = None
            else:
                self._client = candidate
                self._engine = "docker-sdk"

        # CLI fallback: podman takes precedence over docker.
        if self._client is None:
            for binary, engine_name in (("podman", "podman-cli"), ("docker", "docker-cli")):
                if _which(binary):
                    self._engine = engine_name
                    break
            else:
                raise RuntimeError("未找到可用的容器引擎(需要 podman 或 docker)")

        self._ensure_image()
|
||||
|
||||
# ----------------------------- 公共方法 -----------------------------
|
||||
def run_command_in_container(
    self,
    command: List[str],
    volumes: Dict[str, Dict[str, str]],
    environment: Optional[Dict[str, str]] = None,
    working_dir: str = "/workspace",
    name: Optional[str] = None,
    detach: bool = False,
    remove: bool = True,
) -> Dict[str, Any]:
    """Run ``command`` in a container and return the execution result.

    Every host directory named in ``volumes`` is created if missing.
    Writable mounts are additionally opened up with ``chmod 0o777`` on a
    best-effort basis; read-only mounts are left with their existing
    permissions, since the container never writes to them.

    Args:
        command: Argument vector executed inside the container.
        volumes: Mapping of host path to ``{"bind": ..., "mode": ...}``.
        environment: Optional environment variables for the container.
        working_dir: Working directory inside the container.
        name: Optional container name.
        detach: Forwarded unchanged to the engine-specific runner.
        remove: Forwarded unchanged to the engine-specific runner.

    Returns:
        The result dictionary produced by the docker SDK runner or the CLI
        runner, whichever engine is active.
    """
    for host_path, spec in volumes.items():
        host_dir = Path(host_path)
        host_dir.mkdir(parents=True, exist_ok=True)

        # A read-only mount (e.g. an external database) must not be made
        # world-writable on the host just to be read by the container.
        if "ro" in spec.get("mode", "rw").split(","):
            continue
        try:
            host_dir.chmod(0o777)
        except OSError as err:
            # Best effort only: the run may still succeed without the chmod.
            logger.debug("chmod 0o777 failed for %s: %s", host_dir, err)

    if self._engine == "docker-sdk" and self._client is not None:
        runner = self._run_with_docker_sdk
    else:
        runner = self._run_with_cli
    return runner(command, volumes, environment, working_dir, name, detach, remove)
|
||||
|
||||
def update_database(self, log_dir: Path) -> Dict[str, Any]:
    """Run ``BtToxin_Digger --update-db`` inside the container.

    ``log_dir`` is mounted at ``/logs``; when the run returns non-empty
    logs they are also written to ``log_dir / "update_db.log"``.

    Returns:
        The result dictionary from ``run_command_in_container``.
    """
    update_cmd = ["/usr/local/env-execute", "BtToxin_Digger", "--update-db"]
    mounts = {str(log_dir): {"bind": "/logs", "mode": "rw"}}
    container_name = f"bttoxin_update_db_{int(time.time())}"

    outcome = self.run_command_in_container(
        command=update_cmd,
        volumes=mounts,
        working_dir="/tmp",
        name=container_name,
    )

    captured = outcome.get("logs")
    if captured:
        (log_dir / "update_db.log").write_text(captured, encoding="utf-8")
    return outcome
|
||||
|
||||
def validate_reads_filenames(
    self,
    input_dir: Path,
    platform: str,
    reads1_suffix: str,
    reads2_suffix: Optional[str] = None,
    suffix_len: int = 0,
) -> Dict[str, Any]:
    """Check that the reads files in ``input_dir`` fit the given platform.

    A file "matches" a suffix when the suffix text occurs anywhere in its
    name.  For ``illumina`` both suffixes must match the same number of
    files and every R1 file needs a mate whose name is the R1 name with
    the R1 suffix text removed and the R2 suffix appended.  For ``pacbio``
    and ``oxford`` at least one file must match ``reads1_suffix``.  Any
    other platform is reported as valid without inspection.

    Returns:
        ``{"valid": False, "error": ...}`` on failure, otherwise a dict with
        ``valid`` set to ``True`` (plus ``strain_count`` and
        ``suggested_suffix_len`` for the inspected platforms).
    """
    entries = list(input_dir.glob("*"))

    def matching(suffix: Optional[str]) -> List[Path]:
        # An empty or missing suffix never matches anything.
        if not suffix:
            return []
        return [entry for entry in entries if suffix in entry.name]

    if platform == "illumina":
        forward = matching(reads1_suffix)
        reverse = matching(reads2_suffix)
        if not forward or not reverse or len(forward) != len(reverse):
            return {"valid": False, "error": "Illumina R1/R2 配对数量不匹配或缺失"}

        for fwd in forward:
            strain = fwd.name.replace(reads1_suffix, "")
            mate_name = f"{strain}{reads2_suffix}"
            if not (input_dir / mate_name).exists():
                return {"valid": False, "error": f"未找到配对文件: {mate_name}"}

        return {
            "valid": True,
            "strain_count": len(forward),
            "suggested_suffix_len": suffix_len or len(reads1_suffix),
        }

    if platform in ("pacbio", "oxford"):
        hits = matching(reads1_suffix)
        if hits:
            return {
                "valid": True,
                "strain_count": len(hits),
                "suggested_suffix_len": suffix_len or len(reads1_suffix),
            }
        return {"valid": False, "error": f"未找到匹配 {reads1_suffix} 的 reads 文件"}

    return {"valid": True}
|
||||
|
||||
def run_bttoxin_digger(
    self,
    input_dir: Path,
    output_dir: Path,
    log_dir: Path,
    sequence_type: str = "nucl",
    scaf_suffix: str = ".fna",
    threads: int = 4,
    bttoxin_db_dir: Optional[Path] = None,
    **kwargs: Any,
) -> Dict[str, Any]:
    """Run the main BtToxin_Digger analysis in a container (single-directory layout).

    Input files are copied into ``output_dir / "input_files"``; the output
    directory is mounted at ``/workspace`` and the log directory at
    ``/data/logs``.

    Args:
        input_dir: Directory holding the input files.
        output_dir: Host directory mounted as the container workspace.
        log_dir: Host directory that receives ``digger_execution.log``.
        sequence_type: One of ``nucl``, ``orfs``, ``prot`` or ``reads``.
        scaf_suffix: Scaffold file suffix used when ``sequence_type`` is ``nucl``.
        threads: Thread count passed to BtToxin_Digger.
        bttoxin_db_dir: Optional external ``bt_toxin`` database directory.
            When it exists and contains a ``db`` sub-directory it is bound
            read-only at ``/usr/local/bin/BTTCMP_db/bt_toxin``, replacing
            the database bundled in the image.  Expected layout per the
            original note: ``db/`` (BLAST index files) and ``seq/``
            (sequence source files).
        **kwargs: Further options (``orfs_suffix``, ``prot_suffix``,
            ``platform``, ``reads1_suffix``, ``reads2_suffix``,
            ``suffix_len``, ``genome_size``, ``short1``, ``short2``,
            ``long``, ``assemble_only``).

    Returns:
        The container result dictionary with ``output_files`` added.

    Raises:
        ValueError: Reads validation failed, or a hybrid run lacks one of
            its three files.
    """
    import shutil

    # 1) Stage the inputs under the host output directory.
    staging_dir = (output_dir / "input_files").resolve()
    staging_dir.mkdir(parents=True, exist_ok=True)

    suffix_by_type = {
        "nucl": scaf_suffix,
        "orfs": kwargs.get("orfs_suffix", ".ffn"),
        "prot": kwargs.get("prot_suffix", ".faa"),
    }
    flag_by_type = {
        "nucl": "--Scaf_suffix",
        "orfs": "--orfs_suffix",
        "prot": "--prot_suffix",
    }
    # Reads (and unknown types) copy everything: the pattern degrades to "*".
    glob_pattern = f"*{suffix_by_type.get(sequence_type, '')}"

    staged = [src for src in input_dir.glob(glob_pattern) if src.is_file()]
    for src in staged:
        shutil.copy2(src, staging_dir / src.name)
    logger.info(f"已复制 {len(staged)} 个输入文件到 {staging_dir}")

    digger_cmd: List[str] = [
        "/usr/local/env-execute",
        "BtToxin_Digger",
        "--SeqPath",
        "/workspace/input_files",
        "--SequenceType",
        sequence_type,
        "--threads",
        str(threads),
    ]

    def checked_suffix_len(seq_platform: str, first: str, second: Optional[str], requested: int) -> Any:
        # Validate the staged reads and return the suffix length to use.
        verdict = self.validate_reads_filenames(staging_dir, seq_platform, first, second, requested)
        if not verdict.get("valid"):
            raise ValueError(f"Reads 文件验证失败: {verdict.get('error')}")
        return verdict.get("suggested_suffix_len", requested)

    if sequence_type in flag_by_type:
        digger_cmd += [flag_by_type[sequence_type], suffix_by_type[sequence_type]]
    elif sequence_type == "reads":
        seq_platform = kwargs.get("platform", "illumina")
        digger_cmd += ["--platform", seq_platform]

        if seq_platform == "illumina":
            fwd_suffix = kwargs.get("reads1_suffix", "_R1.fastq.gz")
            rev_suffix = kwargs.get("reads2_suffix", "_R2.fastq.gz")
            requested = kwargs.get("suffix_len") or len(fwd_suffix)
            final_len = checked_suffix_len(seq_platform, fwd_suffix, rev_suffix, requested)
            digger_cmd += [
                "--reads1", fwd_suffix,
                "--reads2", rev_suffix,
                "--suffix_len", str(final_len),
            ]
        elif seq_platform in ("pacbio", "oxford"):
            reads_suffix = kwargs.get("reads1_suffix", ".fastq.gz")
            genome_size = kwargs.get("genome_size", "6.07m")
            requested = kwargs.get("suffix_len") or len(reads_suffix)
            final_len = checked_suffix_len(seq_platform, reads_suffix, None, requested)
            digger_cmd += [
                "--reads1", reads_suffix,
                "--genomeSize", genome_size,
                "--suffix_len", str(final_len),
            ]
        elif seq_platform == "hybrid":
            short1 = kwargs.get("short1")
            short2 = kwargs.get("short2")
            long_reads = kwargs.get("long")
            if not all([short1, short2, long_reads]):
                raise ValueError("hybrid 需要 short1/short2/long 三个完整文件名")
            for fn in (short1, short2, long_reads):
                if not (staging_dir / fn).exists():
                    raise ValueError(f"文件不存在: {fn}")
            digger_cmd += [
                "--short1", short1,
                "--short2", short2,
                "--long", long_reads,
                "--hout", "/workspace/Results/Assembles/Hybrid",
            ]

    if kwargs.get("assemble_only"):
        digger_cmd.append("--assemble_only")

    # 2) Mount the output directory (with input_files), the log directory,
    #    and optionally the external database.
    volumes = {
        str(output_dir.resolve()): {"bind": "/workspace", "mode": "rw"},
        str(log_dir.resolve()): {"bind": "/data/logs", "mode": "rw"},
    }

    if bttoxin_db_dir is not None:
        db_path = Path(bttoxin_db_dir).resolve()
        if db_path.exists() and (db_path / "db").exists():
            volumes[str(db_path)] = {
                "bind": "/usr/local/bin/BTTCMP_db/bt_toxin",
                "mode": "ro",
            }
            logger.info(f"绑定外部数据库: {db_path} -> /usr/local/bin/BTTCMP_db/bt_toxin")
        else:
            logger.warning(f"外部数据库目录不存在或结构不完整: {bttoxin_db_dir}")

    logger.info("开始 BtToxin_Digger 分析...")

    result = self.run_command_in_container(
        command=digger_cmd,
        volumes=volumes,
        working_dir="/workspace",
        name=f"bttoxin_digger_{int(time.time())}",
    )

    # Persist the container logs next to the other run logs.
    logs_path = log_dir / "digger_execution.log"
    if result.get("logs"):
        logs_path.write_text(result["logs"], encoding="utf-8")
        logger.info(f"容器日志已保存: {logs_path}")

    # Count produced files only for a successful run with a Results directory.
    results_dir = output_dir / "Results"
    if result.get("success") and results_dir.exists():
        result["output_files"] = sum(1 for item in results_dir.rglob("*") if item.is_file())
    else:
        result["output_files"] = 0
    return result
|
||||
|
||||
# ----------------------------- 内部实现 -----------------------------
|
||||
def _ensure_image(self) -> None:
    """Make the configured image available to the active engine.

    With the docker SDK the image is pulled only when looking it up fails.
    In CLI mode a pull is always attempted and a failure is merely logged.
    """
    if self._engine == "docker-sdk" and self._client is not None:
        try:
            self._client.images.get(self.image)
        except Exception:
            logger.info(f"拉取镜像 {self.image} (platform={self.platform}) ...")
            self._client.images.pull(self.image, platform=self.platform)
        return

    # CLI mode: try to pull up front.
    cli = "docker" if self._engine != "podman-cli" else "podman"
    pull_cmd = [cli, "pull", "--platform", self.platform, self.image]
    try:
        subprocess.run(
            pull_cmd,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        )
    except Exception as err:
        logger.warning(f"{cli} pull 失败: {err}")
|
||||
|
||||
def _run_with_docker_sdk(
    self,
    command: List[str],
    volumes: Dict[str, Dict[str, str]],
    environment: Optional[Dict[str, str]],
    working_dir: str,
    name: Optional[str],
    detach: bool,
    remove: bool,
) -> Dict[str, Any]:
    """Run ``command`` through the docker SDK and wait for it to finish.

    The container is always started detached so that a container object is
    available for waiting and log retrieval; the ``detach`` argument is
    therefore not consulted here.  When ``remove`` is true the container is
    deleted afterwards, including when waiting or log retrieval fails.

    Returns:
        ``{"success", "exit_code", "logs", "status"}`` for a finished run,
        or ``{"success": False, "error", "exit_code": -1, "status": "error"}``
        when the SDK raised.
    """
    assert self._client is not None
    container = None
    try:
        # With detach=False the SDK returns log bytes instead of a container
        # object, so run detached and collect exit status and logs manually.
        container = self._client.containers.run(
            image=self.image,
            command=command,
            volumes=volumes,
            environment=environment or {},
            working_dir=working_dir,
            platform=self.platform,
            name=name,
            user="0:0",  # run as root to avoid permission trouble on mounts
            detach=True,
            remove=False,  # removed below, after the logs have been read
            stdout=True,
            stderr=True,
        )
        exit_info = container.wait()
        code = exit_info.get("StatusCode", 1)
        logs = container.logs().decode("utf-8", errors="ignore")
        return {
            "success": code == 0,
            "exit_code": code,
            "logs": logs,
            "status": "completed" if code == 0 else "failed",
        }
    except Exception as e:
        logger.error(f"docker SDK 运行失败: {e}", exc_info=True)
        return {"success": False, "error": str(e), "exit_code": -1, "status": "error"}
    finally:
        # Clean up on every path so a failed wait()/logs() does not leak the
        # container; force covers a container that is still running.
        if remove and container is not None:
            try:
                container.remove(force=True)
            except Exception as err:
                logger.debug("failed to remove container %s: %s", name, err)
|
||||
|
||||
def _run_with_cli(
    self,
    command: List[str],
    volumes: Dict[str, Dict[str, str]],
    environment: Optional[Dict[str, str]],
    working_dir: str,
    name: Optional[str],
    detach: bool,
    remove: bool,
) -> Dict[str, Any]:
    """Run ``command`` through the ``podman`` or ``docker`` command line.

    Args:
        command: Argument vector executed inside the container.
        volumes: Mapping of host path to ``{"bind": ..., "mode": ...}``.
        environment: Optional environment variables (``-e KEY=VALUE``).
        working_dir: Working directory inside the container (``-w``).
        name: Optional container name.
        detach: When true the CLI is started in the background and only its
            PID is reported.
        remove: Adds ``--rm`` for foreground runs.

    Returns:
        For a foreground run ``{"success", "exit_code", "logs", "status"}``
        where ``logs`` is stdout followed by stderr; for a background run
        ``{"success": True, "status": "running", "pid"}``; on a launch error
        ``{"success": False, "error", "exit_code": -1, "status": "error"}``.
    """
    cli = "podman" if self._engine == "podman-cli" else "docker"

    cmd: List[str] = [cli, "run"]
    if remove and not detach:
        cmd.append("--rm")
    # Run as root inside the container to avoid permission problems on mounts.
    cmd += ["--platform", self.platform, "--user", "0:0"]
    if name:
        cmd += ["--name", name]

    # Podman on Linux gets ",Z" appended so SELinux relabels the mount;
    # every other engine/platform keeps the plain mode.
    relabel = (
        self._engine == "podman-cli"
        and os.name == "posix"
        and sys.platform.startswith("linux")
    )
    for host, spec in volumes.items():
        mode = spec.get("mode", "rw")
        mount_mode = f"{mode},Z" if relabel else mode
        cmd += ["-v", f"{host}:{spec.get('bind')}:{mount_mode}"]

    for key, value in (environment or {}).items():
        cmd += ["-e", f"{key}={value}"]

    cmd += ["-w", working_dir, self.image]
    cmd += command

    try:
        if detach:
            # Nobody reads the background process's output, so discard it:
            # an unread PIPE would stall the child once its buffer fills.
            child = subprocess.Popen(
                cmd,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
            return {"success": True, "status": "running", "pid": child.pid}

        proc = subprocess.run(cmd, capture_output=True, text=True)
        out = (proc.stdout or "") + (proc.stderr or "")
        return {
            "success": proc.returncode == 0,
            "exit_code": proc.returncode,
            "logs": out,
            "status": "completed" if proc.returncode == 0 else "failed",
        }
    except Exception as e:
        logger.error(f"{cli} 运行失败: {e}", exc_info=True)
        return {"success": False, "error": str(e), "exit_code": -1, "status": "error"}
|
||||
|
||||
|
||||
43
pixi.toml
Normal file
43
pixi.toml
Normal file
@@ -0,0 +1,43 @@
|
||||
[workspace]
|
||||
name = "bttoxin-pipeline"
|
||||
channels = ["conda-forge", "bioconda", "bioconda/label/cf201901"]
|
||||
platforms = ["linux-64"]
|
||||
version = "0.1.0"
|
||||
channel-priority = "disabled"
|
||||
|
||||
# =========================
|
||||
# digger 环境:bioconda 依赖
|
||||
# =========================
|
||||
[feature.digger.dependencies]
|
||||
bttoxin_digger = "==1.0.10"
|
||||
perl = "==5.26.2"
|
||||
perl-file-tee = "==0.07"
|
||||
perl-list-util = "==1.38"
|
||||
blast = "==2.16.0"
|
||||
|
||||
# =========================
|
||||
# pipeline 环境:Python 分析依赖
|
||||
# =========================
|
||||
[feature.pipeline.dependencies]
|
||||
python = ">=3.9"
|
||||
pandas = ">=2.0.0"
|
||||
matplotlib = ">=3.7.0"
|
||||
seaborn = ">=0.12.2"
|
||||
|
||||
# =========================
|
||||
# 环境定义
|
||||
# =========================
|
||||
[environments]
|
||||
digger = ["digger"]
|
||||
pipeline = ["pipeline"]
|
||||
|
||||
# =========================
|
||||
# pixi tasks
|
||||
# =========================
|
||||
[feature.pipeline.tasks]
|
||||
# 完整 pipeline(三阶段)
|
||||
pipeline = "python scripts/run_single_fna_pipeline.py"
|
||||
# 单独阶段
|
||||
digger-only = "python scripts/run_digger_stage.py"
|
||||
shotter = "python scripts/bttoxin_shoter.py"
|
||||
plot = "python scripts/plot_shotter.py"
|
||||
@@ -13,7 +13,6 @@ dependencies = [
|
||||
"pandas>=2.0.0",
|
||||
"matplotlib>=3.7.0",
|
||||
"seaborn>=0.12.2; python_version>='3.9'",
|
||||
"docker>=6.1.0",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
|
||||
671
scripts/pixi_runner.py
Normal file
671
scripts/pixi_runner.py
Normal file
@@ -0,0 +1,671 @@
|
||||
#!/usr/bin/env python3
|
||||
"""PixiRunner - 在 pixi 环境中执行 BtToxin_Digger 的运行器。
|
||||
|
||||
此模块提供 PixiRunner 类,用于替代 DockerContainerManager,
|
||||
在本地 pixi 环境中运行 BtToxin_Digger 分析。
|
||||
|
||||
Example:
|
||||
runner = PixiRunner()
|
||||
result = runner.run_bttoxin_digger(
|
||||
input_dir=Path("input"),
|
||||
output_dir=Path("output"),
|
||||
log_dir=Path("logs"),
|
||||
)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import shutil
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PixiRunnerError(Exception):
    """Base class for all PixiRunner-related errors."""


class PixiNotInstalledError(PixiRunnerError):
    """Raised when pixi is not installed."""


class EnvironmentNotFoundError(PixiRunnerError):
    """Raised when the requested pixi environment does not exist."""


class BtToxinDiggerNotFoundError(PixiRunnerError):
    """Raised when BtToxin_Digger cannot be found."""
|
||||
|
||||
|
||||
class PixiRunner:
    """Runner that executes BtToxin_Digger inside a pixi environment.

    It offers an interface compatible with DockerContainerManager but runs
    commands in a local pixi environment instead of a Docker container.

    Attributes:
        pixi_project_dir: Directory that contains ``pixi.toml``.
        env_name: Name of the pixi environment, ``"digger"`` by default.
    """

    def __init__(
        self,
        pixi_project_dir: Optional[Path] = None,
        env_name: str = "digger",
    ) -> None:
        """Record the environment name and locate the pixi project.

        Only the presence of ``pixi.toml`` is verified here; pixi itself and
        the environment are not probed by the constructor.

        Args:
            pixi_project_dir: Directory containing ``pixi.toml``.  Defaults
                to the parent of the directory holding this file.
            env_name: Name of the pixi environment to run in.

        Raises:
            PixiRunnerError: ``pixi.toml`` is missing from the project directory.
        """
        self.env_name = env_name

        if pixi_project_dir is not None:
            project_root = Path(pixi_project_dir).resolve()
        else:
            # Fall back to the parent of this script's directory (project root).
            project_root = Path(__file__).resolve().parents[1]
        self.pixi_project_dir = project_root

        manifest = project_root / "pixi.toml"
        if not manifest.exists():
            raise PixiRunnerError(
                f"pixi.toml 配置文件不存在: {manifest}。"
                "请确保在项目根目录运行,或指定正确的 pixi_project_dir。"
            )
|
||||
|
||||
def check_environment(self) -> Dict[str, Any]:
    """Check that pixi, the environment and BtToxin_Digger are usable.

    The three probes run in order and the first failure stops the check:
    ``pixi --version``, ``pixi info --json`` (looking for ``env_name`` in
    ``environments_info``) and ``pixi run -e <env> BtToxin_Digger --help``.

    Returns:
        A dictionary with:
            - pixi_installed: bool, whether ``pixi --version`` succeeded
            - env_exists: bool, whether the environment is listed
            - bttoxin_available: bool, whether BtToxin_Digger responded
            - error: str | None, an actionable message for the first failure
    """
    result: Dict[str, Any] = {
        "pixi_installed": False,
        "env_exists": False,
        "bttoxin_available": False,
        "error": None,
    }
    install_hint = (
        "pixi 未安装。请访问 https://pixi.sh 安装 pixi,"
        "或运行: curl -fsSL https://pixi.sh/install.sh | bash"
    )
    env_missing = (
        f"{self.env_name} 环境不存在。"
        f"请在项目根目录运行 'pixi install' 安装依赖。"
    )

    # 1) Is pixi installed?
    try:
        proc = subprocess.run(
            ["pixi", "--version"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if proc.returncode != 0:
            result["error"] = install_hint
            return result
        result["pixi_installed"] = True
    except FileNotFoundError:
        result["error"] = install_hint
        return result
    except subprocess.TimeoutExpired:
        result["error"] = "pixi 命令超时。请检查 pixi 安装是否正确。"
        return result
    except Exception as e:
        result["error"] = f"检查 pixi 时出错: {e}。请确保 pixi 已正确安装。"
        return result

    # 2) Is the environment listed for this project?
    try:
        proc = subprocess.run(
            ["pixi", "info", "--json"],
            capture_output=True,
            text=True,
            timeout=30,
            cwd=str(self.pixi_project_dir),
        )
        if proc.returncode != 0:
            result["error"] = env_missing
            return result
        try:
            info = json.loads(proc.stdout)
        except json.JSONDecodeError:
            # Unparseable output: fall back to assuming the environment exists.
            result["env_exists"] = True
        else:
            listed = [e.get("name") for e in info.get("environments_info", []) if e.get("name")]
            if self.env_name not in listed:
                result["error"] = env_missing
                return result
            result["env_exists"] = True
    except Exception as e:
        result["error"] = f"检查环境时出错: {e}。请运行 'pixi install' 安装依赖。"
        return result

    # 3) Does BtToxin_Digger respond inside the environment?
    try:
        proc = subprocess.run(
            ["pixi", "run", "-e", self.env_name, "BtToxin_Digger", "--help"],
            capture_output=True,
            text=True,
            timeout=60,
            cwd=str(self.pixi_project_dir),
        )
        output = proc.stdout + proc.stderr
        # A shell-style "BtToxin_Digger: command not found" (exit status 127)
        # contains the tool's name, so the name alone must not count as proof
        # that the tool is installed.
        command_missing = proc.returncode == 127 or "command not found" in output.lower()
        looks_like_help = "BtToxin_Digger" in output or "Usage" in output
        if not command_missing and (proc.returncode == 0 or looks_like_help):
            result["bttoxin_available"] = True
        else:
            result["error"] = (
                "BtToxin_Digger 未找到。请确保 pixi install 已完成,"
                "并且 pixi.toml 中包含 bttoxin_digger 依赖。"
            )
            return result
    except FileNotFoundError:
        result["error"] = (
            "pixi 命令未找到。请确保 pixi 已正确安装并在 PATH 中。"
        )
        return result
    except subprocess.TimeoutExpired:
        result["error"] = (
            "检查 BtToxin_Digger 超时。请运行 'pixi install' 确保环境已正确安装。"
        )
        return result
    except Exception as e:
        result["error"] = (
            f"检查 BtToxin_Digger 时出错: {e}。"
            "请运行 'pixi install' 安装依赖。"
        )
        return result

    return result
|
||||
|
||||
|
||||
def build_digger_command(
    self,
    input_dir: Path,
    sequence_type: str = "nucl",
    scaf_suffix: str = ".fna",
    threads: int = 4,
    **kwargs: Any,
) -> List[str]:
    """Build the BtToxin_Digger argument list.

    Every caller-supplied value is converted with ``str`` so the result is
    a genuine list of strings even when paths or numbers are passed in.

    Args:
        input_dir: Directory with the input files (``--SeqPath``).
        sequence_type: Sequence type (nucl/orfs/prot/reads).
        scaf_suffix: Scaffold file suffix, used for ``nucl``.
        threads: Thread count.
        **kwargs: Further BtToxin_Digger options: ``orfs_suffix``,
            ``prot_suffix``, ``platform``, ``reads1_suffix``,
            ``reads2_suffix``, ``suffix_len``, ``genome_size``, ``short1``,
            ``short2``, ``long``, ``hout``, ``assemble_only``.

    Returns:
        The argument list, starting with
        ``pixi run -e <env_name> BtToxin_Digger``.
    """
    cmd: List[str] = [
        "pixi", "run", "-e", self.env_name,
        "BtToxin_Digger",
        "--SeqPath", str(input_dir),
        "--SequenceType", sequence_type,
        "--threads", str(threads),
    ]

    if sequence_type == "nucl":
        cmd += ["--Scaf_suffix", str(scaf_suffix)]
    elif sequence_type == "orfs":
        cmd += ["--orfs_suffix", str(kwargs.get("orfs_suffix", ".ffn"))]
    elif sequence_type == "prot":
        cmd += ["--prot_suffix", str(kwargs.get("prot_suffix", ".faa"))]
    elif sequence_type == "reads":
        platform = str(kwargs.get("platform", "illumina"))
        cmd += ["--platform", platform]

        if platform == "illumina":
            r1 = str(kwargs.get("reads1_suffix", "_R1.fastq.gz"))
            r2 = str(kwargs.get("reads2_suffix", "_R2.fastq.gz"))
            # Without an explicit length, the R1 suffix length is used.
            sfx = kwargs.get("suffix_len") or len(r1)
            cmd += ["--reads1", r1, "--reads2", r2, "--suffix_len", str(sfx)]
        elif platform in ("pacbio", "oxford"):
            reads = str(kwargs.get("reads1_suffix", ".fastq.gz"))
            gsize = str(kwargs.get("genome_size", "6.07m"))
            sfx = kwargs.get("suffix_len") or len(reads)
            cmd += ["--reads1", reads, "--genomeSize", gsize, "--suffix_len", str(sfx)]
        elif platform == "hybrid":
            # Each hybrid option is emitted only when a value was supplied.
            for option in ("short1", "short2", "long", "hout"):
                value = kwargs.get(option)
                if value:
                    cmd += [f"--{option}", str(value)]

    if kwargs.get("assemble_only"):
        cmd.append("--assemble_only")

    return cmd
|
||||
|
||||
def run_bttoxin_digger(
    self,
    input_dir: Path,
    output_dir: Path,
    log_dir: Path,
    sequence_type: str = "nucl",
    scaf_suffix: str = ".fna",
    threads: int = 4,
    bttoxin_db_dir: Optional[Path] = None,
    **kwargs: Any,
) -> Dict[str, Any]:
    """Run the BtToxin_Digger analysis in the pixi environment.

    Matching input files are copied to ``output_dir / "input_files"`` and
    the tool is started with ``output_dir`` as its working directory.
    Because that directory may lie outside the pixi project, the manifest
    is passed to ``pixi run`` explicitly instead of relying on pixi finding
    ``pixi.toml`` from the working directory.

    Args:
        input_dir: Directory with the input files.
        output_dir: Output directory (also the working directory of the run).
        log_dir: Directory that receives ``digger_execution.log``.
        sequence_type: Sequence type (nucl/orfs/prot/reads).
        scaf_suffix: Scaffold file suffix.
        threads: Thread count.
        bttoxin_db_dir: Optional external database directory.
        **kwargs: Further BtToxin_Digger options, forwarded to
            ``build_digger_command``.

    Returns:
        A result dictionary with:
            - success: bool, whether the process exited with status 0
            - exit_code: int, process exit status (-1 if it could not run)
            - logs: str, captured stdout and stderr
            - status: str, one of completed/failed/error
            - error: str (optional), error description
            - output_files: int (optional), number of files under ``Results``
    """
    output_dir = Path(output_dir).resolve()
    log_dir = Path(log_dir).resolve()
    input_dir = Path(input_dir).resolve()

    output_dir.mkdir(parents=True, exist_ok=True)
    log_dir.mkdir(parents=True, exist_ok=True)

    # Stage the inputs inside the output directory.
    work_input_dir = output_dir / "input_files"
    work_input_dir.mkdir(parents=True, exist_ok=True)

    if sequence_type == "nucl":
        pattern = f"*{scaf_suffix}"
    elif sequence_type == "orfs":
        pattern = f"*{kwargs.get('orfs_suffix', '.ffn')}"
    elif sequence_type == "prot":
        pattern = f"*{kwargs.get('prot_suffix', '.faa')}"
    else:
        pattern = "*"

    copied_files = 0
    for f in input_dir.glob(pattern):
        if f.is_file():
            shutil.copy2(f, work_input_dir / f.name)
            copied_files += 1

    logger.info(f"已复制 {copied_files} 个输入文件到 {work_input_dir}")
    if copied_files == 0:
        # Make an empty staging directory visible in the logs up front.
        logger.warning(f"未找到匹配 {pattern} 的输入文件: {input_dir}")

    # Link the external database when one was supplied; failure is not fatal.
    if bttoxin_db_dir is not None and not self._setup_external_db(bttoxin_db_dir):
        logger.warning("外部数据库设置失败,将使用默认数据库")

    cmd = list(
        self.build_digger_command(
            input_dir=work_input_dir,
            sequence_type=sequence_type,
            scaf_suffix=scaf_suffix,
            threads=threads,
            **kwargs,
        )
    )
    # The process runs with cwd=output_dir, so name the manifest explicitly;
    # otherwise pixi cannot locate the project from an outside directory.
    if cmd[:2] == ["pixi", "run"] and "--manifest-path" not in cmd:
        cmd[2:2] = ["--manifest-path", str(self.pixi_project_dir / "pixi.toml")]

    logger.info(f"执行命令: {' '.join(map(str, cmd))}")

    def _error_result(message: str, with_traceback: bool = False) -> Dict[str, Any]:
        # Log the failure and build the uniform "could not run" result.
        logger.error(message, exc_info=with_traceback)
        return {
            "success": False,
            "exit_code": -1,
            "logs": "",
            "status": "error",
            "error": message,
        }

    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            cwd=str(output_dir),
        )

        logs = f"=== STDOUT ===\n{proc.stdout}\n=== STDERR ===\n{proc.stderr}"

        log_file = log_dir / "digger_execution.log"
        log_file.write_text(logs, encoding="utf-8")
        logger.info(f"日志已保存: {log_file}")

        result: Dict[str, Any] = {
            "success": proc.returncode == 0,
            "exit_code": proc.returncode,
            "logs": logs,
            "status": "completed" if proc.returncode == 0 else "failed",
        }

        if proc.returncode != 0:
            result["error"] = (
                f"BtToxin_Digger 执行失败 (exit={proc.returncode}): "
                f"{proc.stderr[:500] if proc.stderr else 'No error output'}"
            )

        # Count whatever the run produced under Results.
        results_dir = output_dir / "Results"
        if results_dir.exists():
            result["output_files"] = sum(1 for item in results_dir.rglob("*") if item.is_file())
        else:
            result["output_files"] = 0

        return result

    except FileNotFoundError as e:
        return _error_result(
            f"命令执行失败: {e}。"
            "请确保 pixi 已正确安装并在 PATH 中。"
            "安装说明: https://pixi.sh"
        )
    except subprocess.TimeoutExpired as e:
        return _error_result(f"命令执行超时: {e}")
    except Exception as e:
        return _error_result(f"执行 BtToxin_Digger 时出错: {e}", with_traceback=True)
|
||||
|
||||
|
||||
def _setup_external_db(self, bttoxin_db_dir: Path) -> bool:
    """Point the environment's ``BTTCMP_db/bt_toxin`` at an external database.

    The external directory must exist and contain a ``db`` sub-directory.
    The environment prefix is read from ``pixi info --json`` and the link is
    created at ``<prefix>/bin/BTTCMP_db/bt_toxin``.  A link that already
    points at the requested directory is left untouched, so repeated or
    concurrent runs do not remove the database from under a running job.
    Any other existing link, directory or file at that location is replaced.

    Args:
        bttoxin_db_dir: Path of the external database directory.

    Returns:
        bool: ``True`` when the link is in place, ``False`` otherwise.
    """
    db_path = Path(bttoxin_db_dir).resolve()

    # Validate the layout of the external database directory.
    if not db_path.exists():
        logger.warning(f"外部数据库目录不存在: {db_path}")
        return False

    if not (db_path / "db").exists():
        logger.warning(f"外部数据库目录结构不完整,缺少 db 子目录: {db_path}")
        return False

    try:
        proc = subprocess.run(
            ["pixi", "info", "--json"],
            capture_output=True,
            text=True,
            timeout=30,
            cwd=str(self.pixi_project_dir),
        )
        if proc.returncode != 0:
            logger.warning("无法获取 pixi 环境信息")
            return False

        info = json.loads(proc.stdout)
        # Prefix of the first environment entry carrying our name, if any.
        env_prefix = next(
            (
                env.get("prefix")
                for env in info.get("environments_info", [])
                if env.get("name") == self.env_name
            ),
            None,
        )
        if not env_prefix:
            logger.warning(f"未找到 {self.env_name} 环境的路径")
            return False

        # BtToxin_Digger expects the database at BTTCMP_db/bt_toxin.
        bttcmp_db_dir = Path(env_prefix) / "bin" / "BTTCMP_db"
        target_link = bttcmp_db_dir / "bt_toxin"
        bttcmp_db_dir.mkdir(parents=True, exist_ok=True)

        if target_link.is_symlink():
            if target_link.resolve() == db_path:
                # Already linked correctly: nothing to tear down or rebuild.
                return True
            target_link.unlink()
        elif target_link.is_dir():
            shutil.rmtree(target_link)
        elif target_link.exists():
            # A stray regular file would make symlink_to() fail.
            target_link.unlink()

        target_link.symlink_to(db_path)
        logger.info(f"已创建数据库符号链接: {target_link} -> {db_path}")
        return True

    except json.JSONDecodeError as e:
        logger.warning(f"解析 pixi info 输出失败: {e}")
        return False
    except PermissionError as e:
        logger.warning(f"无法创建数据库符号链接(权限不足): {e}")
        return False
    except Exception as e:
        logger.warning(f"设置外部数据库时出错: {e}")
        return False
|
||||
|
||||
|
||||
def build_shotter_command(
    pixi_project_dir: Path,
    script_path: Path,
    toxicity_csv: Path,
    all_toxins: Path,
    output_dir: Path,
    min_identity: float = 0.0,
    min_coverage: float = 0.0,
    allow_unknown_families: bool = True,
    require_index_hit: bool = False,
) -> List[str]:
    """Build the Shotter argument list (runs in the ``pipeline`` environment).

    Args:
        pixi_project_dir: Directory containing ``pixi.toml`` (not used when
            assembling the arguments).
        script_path: Path of the ``bttoxin_shoter.py`` script.
        toxicity_csv: Path of the toxicity CSV file.
        all_toxins: Path of the ``All_Toxins.txt`` file.
        output_dir: Output directory.
        min_identity: Minimum identity threshold; emitted only when positive.
        min_coverage: Minimum coverage threshold; emitted only when positive.
        allow_unknown_families: When false, ``--disallow_unknown_families`` is added.
        require_index_hit: When true, ``--require_index_hit`` is added.

    Returns:
        The argument list, starting with ``pixi run -e pipeline python``.
    """
    argv: List[str] = ["pixi", "run", "-e", "pipeline", "python", str(script_path)]
    argv += ["--toxicity_csv", str(toxicity_csv)]
    argv += ["--all_toxins", str(all_toxins)]
    argv += ["--output_dir", str(output_dir)]

    thresholds = (("--min_identity", min_identity), ("--min_coverage", min_coverage))
    for flag, threshold in thresholds:
        if threshold and threshold > 0:
            argv += [flag, str(threshold)]

    if not allow_unknown_families:
        argv.append("--disallow_unknown_families")
    if require_index_hit:
        argv.append("--require_index_hit")

    return argv
|
||||
|
||||
|
||||
def build_plot_command(
    pixi_project_dir: Path,
    script_path: Path,
    strain_scores: Path,
    toxin_support: Path,
    species_scores: Path,
    out_dir: Path,
    merge_unresolved: bool = True,
    report_mode: str = "paper",
    lang: str = "zh",
    per_hit_strain: Optional[str] = None,
) -> List[str]:
    """Assemble the argv list that runs the plotting script in the pixi ``pipeline`` env.

    Args:
        pixi_project_dir: Directory that contains pixi.toml. Part of the
            signature, but not referenced when the command is assembled.
        script_path: Path to the plot_shotter.py script.
        strain_scores: Path to strain_target_scores.tsv.
        toxin_support: Path to toxin_support.tsv.
        species_scores: Path to strain_target_species_scores.tsv.
        out_dir: Directory passed to the script as ``--out_dir``.
        merge_unresolved: When True, ``--merge_unresolved`` is appended.
        report_mode: Report mode (paper/summary), forwarded verbatim.
        lang: Report language (zh/en), forwarded verbatim.
        per_hit_strain: Optional strain name for the per-hit heatmap; the
            option is only emitted when the value is truthy.

    Returns:
        The argument list, starting with ``pixi run -e pipeline python``.
    """
    argv: List[str] = ["pixi", "run", "-e", "pipeline", "python", str(script_path)]

    # Options that are always emitted, in a fixed order.
    fixed_options = (
        ("--strain_scores", str(strain_scores)),
        ("--toxin_support", str(toxin_support)),
        ("--species_scores", str(species_scores)),
        ("--out_dir", str(out_dir)),
        ("--report_mode", report_mode),
        ("--lang", lang),
    )
    for flag, value in fixed_options:
        argv.extend((flag, value))

    if merge_unresolved:
        argv.append("--merge_unresolved")

    if per_hit_strain:
        argv.extend(("--per_hit_strain", per_hit_strain))

    return argv
|
||||
|
||||
|
||||
|
||||
def create_pipeline_bundle(
    bundle_path: Path,
    digger_dir: Path,
    shotter_dir: Path,
) -> bool:
    """Pack the pipeline outputs into a gzip-compressed tar archive.

    Args:
        bundle_path: Destination tar.gz path; its parent directory is
            created if needed.
        digger_dir: Digger output directory, stored under ``digger`` in the
            archive when it exists.
        shotter_dir: Shotter output directory, stored under ``shotter`` in
            the archive when it exists.

    Returns:
        True when the archive was written, False when any exception occurred
        (the error is logged).
    """
    import tarfile

    # (archive name, source directory) pairs, added in this order.
    sections = (
        ("digger", digger_dir),
        ("shotter", shotter_dir),
    )

    try:
        bundle_path.parent.mkdir(parents=True, exist_ok=True)
        with tarfile.open(bundle_path, "w:gz") as archive:
            for arcname, source_dir in sections:
                if source_dir.exists():
                    archive.add(source_dir, arcname=arcname)
    except Exception as e:
        logger.error(f"创建打包文件失败: {e}")
        return False
    else:
        return True
|
||||
|
||||
|
||||
def verify_bundle_contents(bundle_path: Path) -> Dict[str, Any]:
    """Inspect a pipeline bundle and report which top-level sections it holds.

    Args:
        bundle_path: Path to the tar.gz file.

    Returns:
        A dictionary with:
        - valid: bool, True when the archive has a digger or shotter section
        - has_digger: bool, True when a top-level ``digger`` entry exists
        - has_shotter: bool, True when a top-level ``shotter`` entry exists
        - members: List[str], all member names in the archive

        If the archive cannot be read, the error is logged and the dictionary
        is returned with its initial values (all False, empty members).
    """
    import tarfile

    result: Dict[str, Any] = {
        "valid": False,
        "has_digger": False,
        "has_shotter": False,
        "members": [],
    }

    try:
        with tarfile.open(bundle_path, "r:gz") as tar:
            members = tar.getnames()
    except Exception as e:
        logger.error(f"验证打包文件失败: {e}")
        return result

    # Compare the first path component rather than a raw string prefix, so
    # that entries such as "digger_backup/x" or "shotters.txt" are not
    # mistaken for the real sections. Empty and "." components are ignored so
    # "./digger/x" is still recognised.
    top_level = set()
    for name in members:
        parts = [part for part in name.split("/") if part not in ("", ".")]
        if parts:
            top_level.add(parts[0])

    result["members"] = members
    result["has_digger"] = "digger" in top_level
    result["has_shotter"] = "shotter" in top_level
    result["valid"] = result["has_digger"] or result["has_shotter"]
    return result
|
||||
133
scripts/run_digger_stage.py
Normal file
133
scripts/run_digger_stage.py
Normal file
@@ -0,0 +1,133 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Run BtToxin_Digger stage only via pixi digger environment.
|
||||
|
||||
This script provides a standalone way to run only the BtToxin_Digger stage
|
||||
of the pipeline, useful for testing or when only digger output is needed.
|
||||
|
||||
Example:
|
||||
pixi run -e pipeline python scripts/run_digger_stage.py \
|
||||
--fna tests/test_data/C15.fna \
|
||||
--out_dir runs/digger_only_test
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
|
||||
from pixi_runner import PixiRunner
|
||||
|
||||
|
||||
def run_digger_stage(
    fna_path: Path,
    out_dir: Path,
    bttoxin_db_dir: Path | None = None,
    threads: int = 4,
    sequence_type: str = "nucl",
) -> Dict[str, Any]:
    """Run only the BtToxin_Digger stage.

    The input file is copied into ``<out_dir>/stage`` and Digger is run on
    that directory, writing results to ``<out_dir>/digger`` and logs to
    ``<out_dir>/logs``.

    Args:
        fna_path: Input .fna file path
        out_dir: Output directory for digger results
        bttoxin_db_dir: External bt_toxin database directory (optional).
            When None, ``external_dbs/bt_toxin`` next to the scripts
            directory is used if it exists and contains a ``db`` entry.
        threads: Number of threads
        sequence_type: Sequence type (nucl/orfs/prot/reads)

    Returns:
        Result dictionary. On failure: ``ok`` (False), ``error`` and ``logs``.
        On success: ``ok`` (True), ``digger_dir``, ``all_toxins`` (path
        string, or None when All_Toxins.txt was not produced) and ``logs``.
    """
    fna_path = fna_path.resolve()
    out_dir = out_dir.resolve()

    # Validate the input before touching the filesystem, so a bad path is
    # reported through the result dict instead of raising from shutil.copy2
    # after output directories have already been created.
    if not fna_path.is_file():
        return {
            "ok": False,
            "error": f"Input file not found: {fna_path}",
            "logs": "",
        }

    # Create directory structure
    digger_dir = out_dir / "digger"
    logs_dir = out_dir / "logs"
    stage_dir = out_dir / "stage"

    for d in (digger_dir, logs_dir, stage_dir):
        d.mkdir(parents=True, exist_ok=True)

    # Stage input file so Digger only sees this single input
    staged_fna = stage_dir / fna_path.name
    shutil.copy2(fna_path, staged_fna)

    # Auto-detect external database
    if bttoxin_db_dir is None:
        default_db = Path(__file__).resolve().parents[1] / "external_dbs" / "bt_toxin"
        if default_db.exists() and (default_db / "db").exists():
            bttoxin_db_dir = default_db
            print(f"[digger-only] Using external database: {bttoxin_db_dir}")
        else:
            print("[digger-only] No external database found, using pixi environment default")

    # Run BtToxin_Digger
    runner = PixiRunner(env_name="digger")
    result = runner.run_bttoxin_digger(
        input_dir=stage_dir,
        output_dir=digger_dir,
        log_dir=logs_dir,
        sequence_type=sequence_type,
        scaf_suffix=fna_path.suffix or ".fna",
        threads=threads,
        bttoxin_db_dir=bttoxin_db_dir,
    )

    if not result.get("success"):
        return {
            "ok": False,
            "error": result.get("error") or f"Digger failed (exit={result.get('exit_code')})",
            "logs": result.get("logs", ""),
        }

    # Check for output; a missing All_Toxins.txt is reported as None, not as a failure
    toxins_dir = digger_dir / "Results" / "Toxins"
    all_toxins = toxins_dir / "All_Toxins.txt"

    return {
        "ok": True,
        "digger_dir": str(digger_dir),
        "all_toxins": str(all_toxins) if all_toxins.exists() else None,
        "logs": result.get("logs", ""),
    }
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point for the digger-only stage.

    Parses the CLI arguments, runs ``run_digger_stage`` and prints a short
    summary.

    Returns:
        0 on success, 1 when the stage reports a failure.
    """
    ap = argparse.ArgumentParser(description="Run BtToxin_Digger stage only (pixi-based)")
    ap.add_argument("--fna", type=Path, required=True, help="Path to a single .fna file")
    # None means "not given"; the per-file default is derived after parsing.
    ap.add_argument("--out_dir", type=Path, default=None,
                    help="Output directory (default: runs/<fna stem>_digger_only)")
    ap.add_argument("--bttoxin_db_dir", type=Path, default=None,
                    help="External bt_toxin database directory")
    ap.add_argument("--threads", type=int, default=4, help="Number of threads")
    ap.add_argument("--sequence_type", type=str, default="nucl",
                    choices=["nucl", "orfs", "prot", "reads"],
                    help="Sequence type")
    args = ap.parse_args()

    # Derive default out_dir using file stem. Testing for None instead of
    # comparing against a magic path string keeps this working with any path
    # separator and respects an explicitly supplied --out_dir.
    if args.out_dir is None:
        stem = args.fna.stem
        args.out_dir = Path("runs") / f"{stem}_digger_only"

    res = run_digger_stage(
        fna_path=args.fna,
        out_dir=args.out_dir,
        bttoxin_db_dir=args.bttoxin_db_dir,
        threads=args.threads,
        sequence_type=args.sequence_type,
    )

    if not res.get("ok"):
        print(f"[digger-only] FAILED: {res.get('error')}")
        return 1

    print("[digger-only] ✓ Done")
    print(f"  Digger output: {res['digger_dir']}")
    if res.get("all_toxins"):
        print(f"  All_Toxins.txt: {res['all_toxins']}")
    return 0
|
||||
|
||||
|
||||
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    exit_code = main()
    raise SystemExit(exit_code)
|
||||
@@ -3,20 +3,20 @@
|
||||
|
||||
- Input: one .fna file (nucleotide scaffold)
|
||||
- Steps:
|
||||
1) Stage this single file, run BtToxin_Digger via DockerContainerManager
|
||||
2) Run Shotter scoring on Digger's All_Toxins.txt
|
||||
3) Render heatmaps + paper-style report
|
||||
1) Stage this single file, run BtToxin_Digger via PixiRunner (pixi environment)
|
||||
2) Run Shotter scoring on Digger's All_Toxins.txt via pixi run -e pipeline
|
||||
3) Render heatmaps + paper-style report via pixi run -e pipeline
|
||||
4) Organize outputs under one root folder:
|
||||
<out_root>/
|
||||
├─ digger/ (container outputs)
|
||||
├─ digger/ (pixi digger env outputs)
|
||||
├─ shotter/ (Shotter TSV/JSON + plots + report)
|
||||
└─ pipeline_results.tar.gz (bundle)
|
||||
|
||||
Notes
|
||||
- Digger is executed in a container (root in container); files may be owned by root on host.
|
||||
We write everything into <out_root>/digger to keep permissions/locality predictable.
|
||||
- Digger is executed in the pixi 'digger' environment with bioconda dependencies.
|
||||
- Shotter and plotting are executed in the pixi 'pipeline' environment with Python dependencies.
|
||||
- This script exposes CLI flags for Shotter filters to allow strict/loose runs.
|
||||
- 默认使用 external_dbs/bt_toxin 作为外部数据库(若存在),覆盖容器内置旧库。
|
||||
- 默认使用 external_dbs/bt_toxin 作为外部数据库(若存在)。
|
||||
|
||||
Example
|
||||
python scripts/run_single_fna_pipeline.py \\
|
||||
@@ -30,6 +30,9 @@ Example
|
||||
python scripts/run_single_fna_pipeline.py \\
|
||||
--fna tests/test_data/HAN055.fna \\
|
||||
--bttoxin_db_dir /path/to/custom/bt_toxin
|
||||
|
||||
# 使用 pixi 任务运行
|
||||
pixi run pipeline --fna tests/test_data/HAN055.fna
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -40,11 +43,10 @@ import subprocess
|
||||
import sys
|
||||
import tarfile
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from typing import Dict, Any, List
|
||||
|
||||
# import DockerContainerManager from backend
|
||||
sys.path.append(str(Path(__file__).resolve().parents[1] / "backend"))
|
||||
from app.utils.docker_client import DockerContainerManager # type: ignore
|
||||
# Import PixiRunner and command builders from scripts
|
||||
from pixi_runner import PixiRunner, build_shotter_command, build_plot_command
|
||||
|
||||
|
||||
def _shell(cmd: list[str]) -> subprocess.CompletedProcess:
|
||||
@@ -72,20 +74,28 @@ def run_single_fna_pipeline(
|
||||
fna_path: Path,
|
||||
out_root: Path,
|
||||
toxicity_csv: Path = Path("Data/toxicity-data.csv"),
|
||||
image: str = "quay.io/biocontainers/bttoxin_digger:1.0.10--hdfd78af_0",
|
||||
platform: str = "linux/amd64",
|
||||
min_identity: float = 0.0,
|
||||
min_coverage: float = 0.0,
|
||||
allow_unknown_families: bool = True,
|
||||
require_index_hit: bool = False,
|
||||
lang: str = "zh",
|
||||
bttoxin_db_dir: Path | None = None,
|
||||
threads: int = 4,
|
||||
) -> Dict[str, Any]:
|
||||
"""运行单个 fna 文件的完整 pipeline。
|
||||
"""运行单个 fna 文件的完整 pipeline(使用 pixi 环境)。
|
||||
|
||||
Args:
|
||||
fna_path: 输入 .fna 文件路径
|
||||
out_root: 输出根目录
|
||||
toxicity_csv: 毒性数据 CSV 文件路径
|
||||
min_identity: 最小 identity 阈值
|
||||
min_coverage: 最小 coverage 阈值
|
||||
allow_unknown_families: 是否允许未知家族
|
||||
require_index_hit: 是否要求索引命中
|
||||
lang: 报告语言 (zh/en)
|
||||
bttoxin_db_dir: 外部 bt_toxin 数据库目录。若为 None,则自动检测
|
||||
项目根目录下的 external_dbs/bt_toxin。
|
||||
threads: 线程数
|
||||
"""
|
||||
fna_path = fna_path.resolve()
|
||||
out_root = out_root.resolve()
|
||||
@@ -98,7 +108,7 @@ def run_single_fna_pipeline(
|
||||
bttoxin_db_dir = default_db
|
||||
print(f"[pipeline] 使用外部数据库: {bttoxin_db_dir}")
|
||||
else:
|
||||
print("[pipeline] 未找到外部数据库,将使用容器内置数据库(可能较旧)")
|
||||
print("[pipeline] 未找到外部数据库,将使用 pixi 环境内置数据库")
|
||||
|
||||
digger_dir = out_root / "digger"
|
||||
shotter_dir = out_root / "shotter"
|
||||
@@ -111,15 +121,15 @@ def run_single_fna_pipeline(
|
||||
staged_fna = stage_dir / fna_path.name
|
||||
shutil.copy2(fna_path, staged_fna)
|
||||
|
||||
# 1) Run BtToxin_Digger via DockerContainerManager
|
||||
mgr = DockerContainerManager(image=image, platform=platform)
|
||||
result = mgr.run_bttoxin_digger(
|
||||
# 1) Run BtToxin_Digger via PixiRunner (pixi digger environment)
|
||||
runner = PixiRunner(env_name="digger")
|
||||
result = runner.run_bttoxin_digger(
|
||||
input_dir=stage_dir,
|
||||
output_dir=digger_dir,
|
||||
log_dir=logs_dir,
|
||||
sequence_type="nucl",
|
||||
scaf_suffix=fna_path.suffix or ".fna",
|
||||
threads=4,
|
||||
threads=threads,
|
||||
bttoxin_db_dir=bttoxin_db_dir,
|
||||
)
|
||||
if not result.get("success"):
|
||||
@@ -135,27 +145,22 @@ def run_single_fna_pipeline(
|
||||
if not all_toxins.exists():
|
||||
return {"ok": False, "stage": "digger", "error": f"Missing All_Toxins.txt at {all_toxins}"}
|
||||
|
||||
# 2) Run Shotter scoring
|
||||
# 2) Run Shotter scoring via pixi run -e pipeline
|
||||
shotter_dir.mkdir(parents=True, exist_ok=True)
|
||||
py = sys.executable
|
||||
shoter_cmd: list[str] = [
|
||||
py,
|
||||
str(Path(__file__).resolve().parents[0] / "bttoxin_shoter.py"),
|
||||
"--toxicity_csv",
|
||||
str(toxicity_csv),
|
||||
"--all_toxins",
|
||||
str(all_toxins),
|
||||
"--output_dir",
|
||||
str(shotter_dir),
|
||||
]
|
||||
if min_identity and min_identity > 0:
|
||||
shoter_cmd += ["--min_identity", str(min_identity)]
|
||||
if min_coverage and min_coverage > 0:
|
||||
shoter_cmd += ["--min_coverage", str(min_coverage)]
|
||||
if not allow_unknown_families:
|
||||
shoter_cmd += ["--disallow_unknown_families"]
|
||||
if require_index_hit:
|
||||
shoter_cmd += ["--require_index_hit"]
|
||||
scripts_dir = Path(__file__).resolve().parents[0]
|
||||
pixi_project_dir = Path(__file__).resolve().parents[1]
|
||||
|
||||
shoter_cmd = build_shotter_command(
|
||||
pixi_project_dir=pixi_project_dir,
|
||||
script_path=scripts_dir / "bttoxin_shoter.py",
|
||||
toxicity_csv=toxicity_csv,
|
||||
all_toxins=all_toxins,
|
||||
output_dir=shotter_dir,
|
||||
min_identity=min_identity,
|
||||
min_coverage=min_coverage,
|
||||
allow_unknown_families=allow_unknown_families,
|
||||
require_index_hit=require_index_hit,
|
||||
)
|
||||
|
||||
r1 = _shell(shoter_cmd)
|
||||
if r1.returncode != 0:
|
||||
@@ -165,27 +170,20 @@ def run_single_fna_pipeline(
|
||||
toxin_support = shotter_dir / "toxin_support.tsv"
|
||||
species_scores = shotter_dir / "strain_target_species_scores.tsv"
|
||||
|
||||
# 3) Plot & report
|
||||
# 3) Plot & report via pixi run -e pipeline
|
||||
strain_for_plot = _read_first_strain(strain_scores)
|
||||
plot_cmd: list[str] = [
|
||||
py,
|
||||
str(Path(__file__).resolve().parents[0] / "plot_shotter.py"),
|
||||
"--strain_scores",
|
||||
str(strain_scores),
|
||||
"--toxin_support",
|
||||
str(toxin_support),
|
||||
"--species_scores",
|
||||
str(species_scores),
|
||||
"--out_dir",
|
||||
str(shotter_dir),
|
||||
"--merge_unresolved",
|
||||
"--report_mode",
|
||||
"paper",
|
||||
"--lang",
|
||||
lang,
|
||||
]
|
||||
if strain_for_plot:
|
||||
plot_cmd += ["--per_hit_strain", strain_for_plot]
|
||||
plot_cmd = build_plot_command(
|
||||
pixi_project_dir=pixi_project_dir,
|
||||
script_path=scripts_dir / "plot_shotter.py",
|
||||
strain_scores=strain_scores,
|
||||
toxin_support=toxin_support,
|
||||
species_scores=species_scores,
|
||||
out_dir=shotter_dir,
|
||||
merge_unresolved=True,
|
||||
report_mode="paper",
|
||||
lang=lang,
|
||||
per_hit_strain=strain_for_plot if strain_for_plot else None,
|
||||
)
|
||||
|
||||
r2 = _shell(plot_cmd)
|
||||
if r2.returncode != 0:
|
||||
@@ -209,12 +207,10 @@ def run_single_fna_pipeline(
|
||||
|
||||
|
||||
def main() -> int:
|
||||
ap = argparse.ArgumentParser(description="Run single-fna Digger -> Shotter pipeline")
|
||||
ap = argparse.ArgumentParser(description="Run single-fna Digger -> Shotter pipeline (pixi-based)")
|
||||
ap.add_argument("--fna", type=Path, required=True, help="Path to a single .fna file")
|
||||
ap.add_argument("--toxicity_csv", type=Path, default=Path("Data/toxicity-data.csv"))
|
||||
ap.add_argument("--out_root", type=Path, default=Path("runs/single_run"))
|
||||
ap.add_argument("--image", type=str, default="quay.io/biocontainers/bttoxin_digger:1.0.10--hdfd78af_0")
|
||||
ap.add_argument("--platform", type=str, default="linux/amd64")
|
||||
ap.add_argument("--min_identity", type=float, default=0.0)
|
||||
ap.add_argument("--min_coverage", type=float, default=0.0)
|
||||
ap.add_argument("--disallow_unknown_families", action="store_true", default=False)
|
||||
@@ -222,6 +218,7 @@ def main() -> int:
|
||||
ap.add_argument("--lang", type=str, choices=["zh", "en"], default="zh")
|
||||
ap.add_argument("--bttoxin_db_dir", type=Path, default=None,
|
||||
help="外部 bt_toxin 数据库目录路径(默认自动检测 external_dbs/bt_toxin)")
|
||||
ap.add_argument("--threads", type=int, default=4, help="线程数")
|
||||
args = ap.parse_args()
|
||||
|
||||
# derive per-run default out_root using file stem
|
||||
@@ -233,14 +230,13 @@ def main() -> int:
|
||||
fna_path=args.fna,
|
||||
out_root=args.out_root,
|
||||
toxicity_csv=args.toxicity_csv,
|
||||
image=args.image,
|
||||
platform=args.platform,
|
||||
min_identity=args.min_identity,
|
||||
min_coverage=args.min_coverage,
|
||||
allow_unknown_families=not args.disallow_unknown_families,
|
||||
require_index_hit=args.require_index_hit,
|
||||
lang=args.lang,
|
||||
bttoxin_db_dir=args.bttoxin_db_dir,
|
||||
threads=args.threads,
|
||||
)
|
||||
|
||||
if not res.get("ok"):
|
||||
|
||||
Reference in New Issue
Block a user