feat: migrate from Docker to pixi for BtToxin_Digger execution

- Add pixi.toml with digger and pipeline environments
- Implement PixiRunner class replacing DockerContainerManager
- Add run_digger_stage.py for standalone digger execution
- Update run_single_fna_pipeline.py to use PixiRunner
- Remove docker dependency from pyproject.toml
- Delete docker_client.py (no longer needed)

BREAKING CHANGE: Docker is no longer required. Use 'pixi install' instead.
This commit is contained in:
2026-01-08 16:58:45 +08:00
parent 1c0e8f90a5
commit ae4c6351d9
7 changed files with 7457 additions and 470 deletions

View File

@@ -1,405 +0,0 @@
"""Docker/Podman 容器管理(修正版,支持 arm64/macOS 与 linux/amd64"""
from __future__ import annotations
import os
import subprocess
import logging
import time
from pathlib import Path
from typing import Dict, Any, Optional, List
import sys
try:
import docker # type: ignore
except Exception: # 允许在无 docker SDK 环境下使用 podman fallback
docker = None # type: ignore
from ..core.config import settings
logger = logging.getLogger(__name__)
def _which(cmd: str) -> Optional[str]:
from shutil import which
return which(cmd)
class DockerContainerManager:
"""容器管理器 - 兼容 Docker 与 Podman。
优先尝试 docker SDK若不可用则回落到 podman CLI或 docker CLI
在 arm64 主机上默认以 --platform linux/amd64 运行镜像。
"""
def __init__(
self,
image: str = settings.DOCKER_IMAGE,
platform: str = settings.DOCKER_PLATFORM,
) -> None:
self.image = image
self.platform = platform
self._engine: str = "docker"
self._client = None
# 首选 docker-py 客户端(若可用)
if docker is not None:
try:
self._client = docker.from_env()
# 探测 daemon
self._client.ping()
self._engine = "docker-sdk"
except Exception as err:
logger.info(f"docker SDK 不可用,将尝试 CLI 回落: {err}")
self._client = None
# CLI 回落:优先 podman其次 docker
if self._client is None:
if _which("podman"):
self._engine = "podman-cli"
elif _which("docker"):
self._engine = "docker-cli"
else:
raise RuntimeError("未找到可用的容器引擎(需要 podman 或 docker")
self._ensure_image()
# ----------------------------- 公共方法 -----------------------------
def run_command_in_container(
self,
command: List[str],
volumes: Dict[str, Dict[str, str]],
environment: Optional[Dict[str, str]] = None,
working_dir: str = "/workspace",
name: Optional[str] = None,
detach: bool = False,
remove: bool = True,
) -> Dict[str, Any]:
"""在容器中执行命令,返回执行结果。"""
# 确保挂载目录存在且可写
for host_path, spec in volumes.items():
p = Path(host_path)
p.mkdir(parents=True, exist_ok=True)
try:
p.chmod(0o777)
except Exception:
pass
if self._engine == "docker-sdk" and self._client is not None:
return self._run_with_docker_sdk(
command, volumes, environment, working_dir, name, detach, remove
)
else:
return self._run_with_cli(
command, volumes, environment, working_dir, name, detach, remove
)
def update_database(self, log_dir: Path) -> Dict[str, Any]:
"""在容器中更新 BtToxin_Digger 数据库。"""
cmd = [
"/usr/local/env-execute",
"BtToxin_Digger",
"--update-db",
]
vols = {str(log_dir): {"bind": "/logs", "mode": "rw"}}
result = self.run_command_in_container(
command=cmd, volumes=vols, working_dir="/tmp", name=f"bttoxin_update_db_{int(time.time())}"
)
if result.get("logs"):
(log_dir / "update_db.log").write_text(result["logs"], encoding="utf-8")
return result
def validate_reads_filenames(
self,
input_dir: Path,
platform: str,
reads1_suffix: str,
reads2_suffix: Optional[str] = None,
suffix_len: int = 0,
) -> Dict[str, Any]:
files = list(input_dir.glob("*"))
if platform == "illumina":
r1 = [f for f in files if reads1_suffix and reads1_suffix in f.name]
r2 = [f for f in files if reads2_suffix and reads2_suffix in f.name]
if not r1 or not r2 or len(r1) != len(r2):
return {"valid": False, "error": "Illumina R1/R2 配对数量不匹配或缺失"}
for f1 in r1:
strain = f1.name.replace(reads1_suffix, "")
if not (input_dir / f"{strain}{reads2_suffix}").exists():
return {"valid": False, "error": f"未找到配对文件: {strain}{reads2_suffix}"}
return {
"valid": True,
"strain_count": len(r1),
"suggested_suffix_len": suffix_len or len(reads1_suffix),
}
if platform in ("pacbio", "oxford"):
r = [f for f in files if reads1_suffix and reads1_suffix in f.name]
if not r:
return {"valid": False, "error": f"未找到匹配 {reads1_suffix} 的 reads 文件"}
return {
"valid": True,
"strain_count": len(r),
"suggested_suffix_len": suffix_len or len(reads1_suffix),
}
return {"valid": True}
def run_bttoxin_digger(
self,
input_dir: Path,
output_dir: Path,
log_dir: Path,
sequence_type: str = "nucl",
scaf_suffix: str = ".fna",
threads: int = 4,
bttoxin_db_dir: Optional[Path] = None,
**kwargs: Any,
) -> Dict[str, Any]:
"""在容器中运行 BtToxin_Digger 主分析(单目录方案)。
Args:
bttoxin_db_dir: 外部 bt_toxin 数据库目录路径(可选)。
若提供,将绑定到容器内 /usr/local/bin/BTTCMP_db/bt_toxin
覆盖容器内置的旧数据库。目录结构应为:
bt_toxin/
├── db/ (BLAST 索引文件)
└── seq/ (序列源文件)
"""
# 1) 在宿主输出目录下准备 input_files并复制输入文件
work_input_dir = (output_dir / "input_files").resolve()
work_input_dir.mkdir(parents=True, exist_ok=True)
import shutil
if sequence_type == "nucl":
pattern = f"*{scaf_suffix}"
elif sequence_type == "orfs":
pattern = f"*{kwargs.get('orfs_suffix', '.ffn')}"
elif sequence_type == "prot":
pattern = f"*{kwargs.get('prot_suffix', '.faa')}"
elif sequence_type == "reads":
pattern = "*"
else:
pattern = "*"
copied_files = 0
for f in input_dir.glob(pattern):
if f.is_file():
shutil.copy2(f, work_input_dir / f.name)
copied_files += 1
logger.info(f"已复制 {copied_files} 个输入文件到 {work_input_dir}")
base_cmd: List[str] = [
"/usr/local/env-execute",
"BtToxin_Digger",
"--SeqPath",
"/workspace/input_files",
"--SequenceType",
sequence_type,
"--threads",
str(threads),
]
if sequence_type == "nucl":
base_cmd += ["--Scaf_suffix", scaf_suffix]
elif sequence_type == "orfs":
base_cmd += ["--orfs_suffix", kwargs.get("orfs_suffix", ".ffn")]
elif sequence_type == "prot":
base_cmd += ["--prot_suffix", kwargs.get("prot_suffix", ".faa")]
elif sequence_type == "reads":
platform = kwargs.get("platform", "illumina")
base_cmd += ["--platform", platform]
if platform == "illumina":
r1 = kwargs.get("reads1_suffix", "_R1.fastq.gz")
r2 = kwargs.get("reads2_suffix", "_R2.fastq.gz")
sfx = kwargs.get("suffix_len") or len(r1)
v = self.validate_reads_filenames(work_input_dir, platform, r1, r2, sfx)
if not v.get("valid"):
raise ValueError(f"Reads 文件验证失败: {v.get('error')}")
sfx = v.get("suggested_suffix_len", sfx)
base_cmd += ["--reads1", r1, "--reads2", r2, "--suffix_len", str(sfx)]
elif platform in ("pacbio", "oxford"):
r = kwargs.get("reads1_suffix", ".fastq.gz")
gsize = kwargs.get("genome_size", "6.07m")
sfx = kwargs.get("suffix_len") or len(r)
v = self.validate_reads_filenames(work_input_dir, platform, r, None, sfx)
if not v.get("valid"):
raise ValueError(f"Reads 文件验证失败: {v.get('error')}")
sfx = v.get("suggested_suffix_len", sfx)
base_cmd += ["--reads1", r, "--genomeSize", gsize, "--suffix_len", str(sfx)]
elif platform == "hybrid":
short1 = kwargs.get("short1")
short2 = kwargs.get("short2")
long = kwargs.get("long")
if not all([short1, short2, long]):
raise ValueError("hybrid 需要 short1/short2/long 三个完整文件名")
for fn in (short1, short2, long):
if not (work_input_dir / fn).exists():
raise ValueError(f"文件不存在: {fn}")
base_cmd += [
"--short1",
short1,
"--short2",
short2,
"--long",
long,
"--hout",
"/workspace/Results/Assembles/Hybrid",
]
if kwargs.get("assemble_only"):
base_cmd.append("--assemble_only")
# 2) 挂载输出目录(含 input_files、日志目录、以及可选的外部数据库
volumes = {
str(output_dir.resolve()): {"bind": "/workspace", "mode": "rw"},
str(log_dir.resolve()): {"bind": "/data/logs", "mode": "rw"},
}
# 绑定外部 bt_toxin 数据库(覆盖容器内置旧库)
if bttoxin_db_dir is not None:
db_path = Path(bttoxin_db_dir).resolve()
if db_path.exists() and (db_path / "db").exists():
volumes[str(db_path)] = {
"bind": "/usr/local/bin/BTTCMP_db/bt_toxin",
"mode": "ro",
}
logger.info(f"绑定外部数据库: {db_path} -> /usr/local/bin/BTTCMP_db/bt_toxin")
else:
logger.warning(f"外部数据库目录不存在或结构不完整: {bttoxin_db_dir}")
logger.info("开始 BtToxin_Digger 分析...")
final_cmd = base_cmd
working_dir = "/workspace"
result = self.run_command_in_container(
command=final_cmd,
volumes=volumes,
working_dir=working_dir,
name=f"bttoxin_digger_{int(time.time())}",
)
# 保存容器日志
logs_path = log_dir / "digger_execution.log"
if result.get("logs"):
logs_path.write_text(result["logs"], encoding="utf-8")
logger.info(f"容器日志已保存: {logs_path}")
# 验证输出
results_dir = output_dir / "Results"
if result.get("success") and results_dir.exists():
files = [f for f in results_dir.rglob("*") if f.is_file()]
result["output_files"] = len(files)
else:
result["output_files"] = 0
return result
# ----------------------------- 内部实现 -----------------------------
def _ensure_image(self) -> None:
if self._engine == "docker-sdk" and self._client is not None:
try:
self._client.images.get(self.image)
return
except Exception:
logger.info(f"拉取镜像 {self.image} (platform={self.platform}) ...")
self._client.images.pull(self.image, platform=self.platform)
else:
# CLI 模式:先尝试拉取
cli = "podman" if self._engine == "podman-cli" else "docker"
try:
subprocess.run(
[cli, "pull", "--platform", self.platform, self.image],
check=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
)
except Exception as err:
logger.warning(f"{cli} pull 失败: {err}")
def _run_with_docker_sdk(
self,
command: List[str],
volumes: Dict[str, Dict[str, str]],
environment: Optional[Dict[str, str]],
working_dir: str,
name: Optional[str],
detach: bool,
remove: bool,
) -> Dict[str, Any]:
assert self._client is not None
try:
# 注意docker SDK 在 detach=False 时返回的是日志字节串,而非容器对象。
# 这里统一以 detach=True 运行,然后等待并抓取日志,最后按需删除容器。
container = self._client.containers.run(
image=self.image,
command=command,
volumes=volumes,
environment=environment or {},
working_dir=working_dir,
platform=self.platform,
name=name,
user="0:0", # 以 root 运行,避免挂载目录权限问题
detach=True,
remove=False, # 获取日志后再删
stdout=True,
stderr=True,
)
exit_info = container.wait()
code = exit_info.get("StatusCode", 1)
logs = container.logs().decode("utf-8", errors="ignore")
if remove:
try:
container.remove()
except Exception:
pass
return {"success": code == 0, "exit_code": code, "logs": logs, "status": "completed" if code == 0 else "failed"}
except Exception as e:
logger.error(f"docker SDK 运行失败: {e}", exc_info=True)
return {"success": False, "error": str(e), "exit_code": -1, "status": "error"}
def _run_with_cli(
self,
command: List[str],
volumes: Dict[str, Dict[str, str]],
environment: Optional[Dict[str, str]],
working_dir: str,
name: Optional[str],
detach: bool,
remove: bool,
) -> Dict[str, Any]:
cli = "podman" if self._engine == "podman-cli" else "docker"
cmd: List[str] = [cli, "run", "--rm" if remove and not detach else ""]
cmd = [c for c in cmd if c]
cmd += ["--platform", self.platform]
# 以 root 运行,避免权限问题
cmd += ["--user", "0:0"]
if name:
cmd += ["--name", name]
for host, spec in volumes.items():
bind = spec.get("bind")
mode = spec.get("mode", "rw")
# Podman(Linux) 下附加 :Z 处理 SELinux 标注;其他平台保持不变
mount_mode = mode
if self._engine == "podman-cli" and os.name == "posix" and sys.platform.startswith("linux"):
mount_mode = f"{mode},Z"
cmd += ["-v", f"{host}:{bind}:{mount_mode}"]
for k, v in (environment or {}).items():
cmd += ["-e", f"{k}={v}"]
cmd += ["-w", working_dir, self.image]
cmd += command
try:
if detach:
# 后台运行CLI 简化返回
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
return {"success": True, "status": "running", "pid": p.pid}
else:
proc = subprocess.run(cmd, capture_output=True, text=True)
out = (proc.stdout or "") + (proc.stderr or "")
return {"success": proc.returncode == 0, "exit_code": proc.returncode, "logs": out, "status": "completed" if proc.returncode == 0 else "failed"}
except Exception as e:
logger.error(f"{cli} 运行失败: {e}", exc_info=True)
return {"success": False, "error": str(e), "exit_code": -1, "status": "error"}