Files
bttoxin-pipeline/scripts/pixi_runner.py
hotwa ae4c6351d9 feat: migrate from Docker to pixi for BtToxin_Digger execution
- Add pixi.toml with digger and pipeline environments
- Implement PixiRunner class replacing DockerContainerManager
- Add run_digger_stage.py for standalone digger execution
- Update run_single_fna_pipeline.py to use PixiRunner
- Remove docker dependency from pyproject.toml
- Delete docker_client.py (no longer needed)

BREAKING CHANGE: Docker is no longer required. Use 'pixi install' instead.
2026-01-08 16:58:45 +08:00

672 lines
23 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""PixiRunner - 在 pixi 环境中执行 BtToxin_Digger 的运行器。
此模块提供 PixiRunner 类,用于替代 DockerContainerManager
在本地 pixi 环境中运行 BtToxin_Digger 分析。
Example:
runner = PixiRunner()
result = runner.run_bttoxin_digger(
input_dir=Path("input"),
output_dir=Path("output"),
log_dir=Path("logs"),
)
"""
from __future__ import annotations
import json
import logging
import shutil
import subprocess
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
class PixiRunnerError(Exception):
"""PixiRunner 相关错误的基类。"""
pass
class PixiNotInstalledError(PixiRunnerError):
"""pixi 未安装时抛出的错误。"""
pass
class EnvironmentNotFoundError(PixiRunnerError):
"""pixi 环境不存在时抛出的错误。"""
pass
class BtToxinDiggerNotFoundError(PixiRunnerError):
"""BtToxin_Digger 未找到时抛出的错误。"""
pass
class PixiRunner:
"""在 pixi 环境中执行 BtToxin_Digger 的运行器。
此类提供与 DockerContainerManager 兼容的接口,
但使用本地 pixi 环境而非 Docker 容器执行命令。
Attributes:
pixi_project_dir: pixi.toml 所在目录
env_name: pixi 环境名称,默认为 "digger"
"""
def __init__(
self,
pixi_project_dir: Optional[Path] = None,
env_name: str = "digger",
) -> None:
"""初始化运行器,验证 pixi 和环境可用性。
Args:
pixi_project_dir: pixi.toml 所在目录,默认为项目根目录
env_name: pixi 环境名称,默认为 "digger"
Raises:
PixiNotInstalledError: pixi 未安装
EnvironmentNotFoundError: pixi 环境不存在
BtToxinDiggerNotFoundError: BtToxin_Digger 未找到
"""
self.env_name = env_name
# 确定 pixi 项目目录
if pixi_project_dir is None:
# 默认为脚本所在目录的父目录(项目根目录)
self.pixi_project_dir = Path(__file__).resolve().parents[1]
else:
self.pixi_project_dir = Path(pixi_project_dir).resolve()
# 验证 pixi.toml 存在
pixi_toml = self.pixi_project_dir / "pixi.toml"
if not pixi_toml.exists():
raise PixiRunnerError(
f"pixi.toml 配置文件不存在: {pixi_toml}"
"请确保在项目根目录运行,或指定正确的 pixi_project_dir。"
)
def check_environment(self) -> Dict[str, Any]:
"""检查 pixi 环境和 BtToxin_Digger 可用性。
Returns:
包含环境状态的字典:
- pixi_installed: bool, pixi 是否已安装
- env_exists: bool, 指定环境是否存在
- bttoxin_available: bool, BtToxin_Digger 是否可用
- error: str | None, 错误信息(包含可操作的指导)
"""
result: Dict[str, Any] = {
"pixi_installed": False,
"env_exists": False,
"bttoxin_available": False,
"error": None,
}
# 检查 pixi 是否安装
try:
proc = subprocess.run(
["pixi", "--version"],
capture_output=True,
text=True,
timeout=10,
)
if proc.returncode == 0:
result["pixi_installed"] = True
else:
result["error"] = (
"pixi 未安装。请访问 https://pixi.sh 安装 pixi"
"或运行: curl -fsSL https://pixi.sh/install.sh | bash"
)
return result
except FileNotFoundError:
result["error"] = (
"pixi 未安装。请访问 https://pixi.sh 安装 pixi"
"或运行: curl -fsSL https://pixi.sh/install.sh | bash"
)
return result
except subprocess.TimeoutExpired:
result["error"] = "pixi 命令超时。请检查 pixi 安装是否正确。"
return result
except Exception as e:
result["error"] = f"检查 pixi 时出错: {e}。请确保 pixi 已正确安装。"
return result
# 检查环境是否存在
try:
proc = subprocess.run(
["pixi", "info", "--json"],
capture_output=True,
text=True,
timeout=30,
cwd=str(self.pixi_project_dir),
)
if proc.returncode == 0:
try:
info = json.loads(proc.stdout)
envs = info.get("environments_info", [])
env_names = [e.get("name") for e in envs if e.get("name")]
if self.env_name in env_names:
result["env_exists"] = True
else:
result["error"] = (
f"{self.env_name} 环境不存在。"
f"请在项目根目录运行 'pixi install' 安装依赖。"
)
return result
except json.JSONDecodeError:
# 回退到简单检查
result["env_exists"] = True
else:
result["error"] = (
f"{self.env_name} 环境不存在。"
f"请在项目根目录运行 'pixi install' 安装依赖。"
)
return result
except Exception as e:
result["error"] = f"检查环境时出错: {e}。请运行 'pixi install' 安装依赖。"
return result
# 检查 BtToxin_Digger 是否可用
try:
proc = subprocess.run(
["pixi", "run", "-e", self.env_name, "BtToxin_Digger", "--help"],
capture_output=True,
text=True,
timeout=60,
cwd=str(self.pixi_project_dir),
)
output = proc.stdout + proc.stderr
if proc.returncode == 0 or "BtToxin_Digger" in output or "Usage" in output:
result["bttoxin_available"] = True
else:
result["error"] = (
"BtToxin_Digger 未找到。请确保 pixi install 已完成,"
"并且 pixi.toml 中包含 bttoxin_digger 依赖。"
)
return result
except FileNotFoundError:
result["error"] = (
"pixi 命令未找到。请确保 pixi 已正确安装并在 PATH 中。"
)
return result
except subprocess.TimeoutExpired:
result["error"] = (
"检查 BtToxin_Digger 超时。请运行 'pixi install' 确保环境已正确安装。"
)
return result
except Exception as e:
result["error"] = (
f"检查 BtToxin_Digger 时出错: {e}"
"请运行 'pixi install' 安装依赖。"
)
return result
return result
def build_digger_command(
self,
input_dir: Path,
sequence_type: str = "nucl",
scaf_suffix: str = ".fna",
threads: int = 4,
**kwargs: Any,
) -> List[str]:
"""构建 BtToxin_Digger 命令列表。
Args:
input_dir: 输入文件目录
sequence_type: 序列类型 (nucl/orfs/prot/reads)
scaf_suffix: scaffold 文件后缀
threads: 线程数
**kwargs: 其他 BtToxin_Digger 参数
Returns:
命令参数列表,以 'pixi run -e digger BtToxin_Digger' 开头
"""
cmd: List[str] = [
"pixi", "run", "-e", self.env_name,
"BtToxin_Digger",
"--SeqPath", str(input_dir),
"--SequenceType", sequence_type,
"--threads", str(threads),
]
if sequence_type == "nucl":
cmd += ["--Scaf_suffix", scaf_suffix]
elif sequence_type == "orfs":
cmd += ["--orfs_suffix", kwargs.get("orfs_suffix", ".ffn")]
elif sequence_type == "prot":
cmd += ["--prot_suffix", kwargs.get("prot_suffix", ".faa")]
elif sequence_type == "reads":
platform = kwargs.get("platform", "illumina")
cmd += ["--platform", platform]
if platform == "illumina":
r1 = kwargs.get("reads1_suffix", "_R1.fastq.gz")
r2 = kwargs.get("reads2_suffix", "_R2.fastq.gz")
sfx = kwargs.get("suffix_len") or len(r1)
cmd += ["--reads1", r1, "--reads2", r2, "--suffix_len", str(sfx)]
elif platform in ("pacbio", "oxford"):
r = kwargs.get("reads1_suffix", ".fastq.gz")
gsize = kwargs.get("genome_size", "6.07m")
sfx = kwargs.get("suffix_len") or len(r)
cmd += ["--reads1", r, "--genomeSize", gsize, "--suffix_len", str(sfx)]
elif platform == "hybrid":
short1 = kwargs.get("short1")
short2 = kwargs.get("short2")
long_reads = kwargs.get("long")
if short1:
cmd += ["--short1", short1]
if short2:
cmd += ["--short2", short2]
if long_reads:
cmd += ["--long", long_reads]
hout = kwargs.get("hout")
if hout:
cmd += ["--hout", hout]
if kwargs.get("assemble_only"):
cmd.append("--assemble_only")
return cmd
def run_bttoxin_digger(
self,
input_dir: Path,
output_dir: Path,
log_dir: Path,
sequence_type: str = "nucl",
scaf_suffix: str = ".fna",
threads: int = 4,
bttoxin_db_dir: Optional[Path] = None,
**kwargs: Any,
) -> Dict[str, Any]:
"""执行 BtToxin_Digger 分析。
Args:
input_dir: 输入文件目录
output_dir: 输出目录
log_dir: 日志目录
sequence_type: 序列类型 (nucl/orfs/prot/reads)
scaf_suffix: scaffold 文件后缀
threads: 线程数
bttoxin_db_dir: 外部数据库目录(可选)
**kwargs: 其他 BtToxin_Digger 参数
Returns:
结果字典,包含:
- success: bool, 执行是否成功
- exit_code: int, 进程退出码
- logs: str, 执行日志
- status: str, 状态 (completed/failed/error)
- error: str (可选), 错误信息
- output_files: int (可选), 输出文件数量
"""
# 确保目录存在
output_dir = Path(output_dir).resolve()
log_dir = Path(log_dir).resolve()
input_dir = Path(input_dir).resolve()
output_dir.mkdir(parents=True, exist_ok=True)
log_dir.mkdir(parents=True, exist_ok=True)
# 准备工作目录结构
work_input_dir = output_dir / "input_files"
work_input_dir.mkdir(parents=True, exist_ok=True)
# 复制输入文件
if sequence_type == "nucl":
pattern = f"*{scaf_suffix}"
elif sequence_type == "orfs":
pattern = f"*{kwargs.get('orfs_suffix', '.ffn')}"
elif sequence_type == "prot":
pattern = f"*{kwargs.get('prot_suffix', '.faa')}"
else:
pattern = "*"
copied_files = 0
for f in input_dir.glob(pattern):
if f.is_file():
shutil.copy2(f, work_input_dir / f.name)
copied_files += 1
logger.info(f"已复制 {copied_files} 个输入文件到 {work_input_dir}")
# 设置外部数据库(如果提供)
if bttoxin_db_dir is not None:
db_setup_ok = self._setup_external_db(bttoxin_db_dir)
if not db_setup_ok:
logger.warning("外部数据库设置失败,将使用默认数据库")
# 构建命令
cmd = self.build_digger_command(
input_dir=work_input_dir,
sequence_type=sequence_type,
scaf_suffix=scaf_suffix,
threads=threads,
**kwargs,
)
logger.info(f"执行命令: {' '.join(cmd)}")
# 执行命令
try:
proc = subprocess.run(
cmd,
capture_output=True,
text=True,
cwd=str(output_dir),
)
logs = f"=== STDOUT ===\n{proc.stdout}\n=== STDERR ===\n{proc.stderr}"
# 保存日志
log_file = log_dir / "digger_execution.log"
log_file.write_text(logs, encoding="utf-8")
logger.info(f"日志已保存: {log_file}")
# 构建结果
result: Dict[str, Any] = {
"success": proc.returncode == 0,
"exit_code": proc.returncode,
"logs": logs,
"status": "completed" if proc.returncode == 0 else "failed",
}
if proc.returncode != 0:
result["error"] = (
f"BtToxin_Digger 执行失败 (exit={proc.returncode}): "
f"{proc.stderr[:500] if proc.stderr else 'No error output'}"
)
# 统计输出文件
results_dir = output_dir / "Results"
if results_dir.exists():
files = [f for f in results_dir.rglob("*") if f.is_file()]
result["output_files"] = len(files)
else:
result["output_files"] = 0
return result
except FileNotFoundError as e:
error_msg = (
f"命令执行失败: {e}"
"请确保 pixi 已正确安装并在 PATH 中。"
"安装说明: https://pixi.sh"
)
logger.error(error_msg)
return {
"success": False,
"exit_code": -1,
"logs": "",
"status": "error",
"error": error_msg,
}
except subprocess.TimeoutExpired as e:
error_msg = f"命令执行超时: {e}"
logger.error(error_msg)
return {
"success": False,
"exit_code": -1,
"logs": "",
"status": "error",
"error": error_msg,
}
except Exception as e:
error_msg = f"执行 BtToxin_Digger 时出错: {e}"
logger.error(error_msg, exc_info=True)
return {
"success": False,
"exit_code": -1,
"logs": "",
"status": "error",
"error": error_msg,
}
def _setup_external_db(self, bttoxin_db_dir: Path) -> bool:
"""设置外部数据库符号链接。
将 external_dbs/bt_toxin 链接到 pixi 环境的 BTTCMP_db/bt_toxin。
Args:
bttoxin_db_dir: 外部数据库目录路径
Returns:
bool: 设置是否成功
"""
db_path = Path(bttoxin_db_dir).resolve()
# 验证数据库目录结构
if not db_path.exists():
logger.warning(f"外部数据库目录不存在: {db_path}")
return False
if not (db_path / "db").exists():
logger.warning(f"外部数据库目录结构不完整,缺少 db 子目录: {db_path}")
return False
# 获取 pixi 环境路径
try:
proc = subprocess.run(
["pixi", "info", "--json"],
capture_output=True,
text=True,
timeout=30,
cwd=str(self.pixi_project_dir),
)
if proc.returncode != 0:
logger.warning("无法获取 pixi 环境信息")
return False
info = json.loads(proc.stdout)
envs_info = info.get("environments_info", [])
env_prefix = None
for env in envs_info:
if env.get("name") == self.env_name:
env_prefix = env.get("prefix")
break
if not env_prefix:
logger.warning(f"未找到 {self.env_name} 环境的路径")
return False
# 创建符号链接
# BtToxin_Digger 期望数据库在 BTTCMP_db/bt_toxin
env_path = Path(env_prefix)
bttcmp_db_dir = env_path / "bin" / "BTTCMP_db"
target_link = bttcmp_db_dir / "bt_toxin"
# 确保 BTTCMP_db 目录存在
bttcmp_db_dir.mkdir(parents=True, exist_ok=True)
# 如果已存在符号链接或目录,先删除
if target_link.exists() or target_link.is_symlink():
if target_link.is_symlink():
target_link.unlink()
elif target_link.is_dir():
shutil.rmtree(target_link)
# 创建符号链接
target_link.symlink_to(db_path)
logger.info(f"已创建数据库符号链接: {target_link} -> {db_path}")
return True
except json.JSONDecodeError as e:
logger.warning(f"解析 pixi info 输出失败: {e}")
return False
except PermissionError as e:
logger.warning(f"无法创建数据库符号链接(权限不足): {e}")
return False
except Exception as e:
logger.warning(f"设置外部数据库时出错: {e}")
return False
def build_shotter_command(
pixi_project_dir: Path,
script_path: Path,
toxicity_csv: Path,
all_toxins: Path,
output_dir: Path,
min_identity: float = 0.0,
min_coverage: float = 0.0,
allow_unknown_families: bool = True,
require_index_hit: bool = False,
) -> List[str]:
"""构建 Shotter 命令列表(使用 pipeline 环境)。
Args:
pixi_project_dir: pixi.toml 所在目录
script_path: bttoxin_shoter.py 脚本路径
toxicity_csv: 毒性数据 CSV 文件路径
all_toxins: All_Toxins.txt 文件路径
output_dir: 输出目录
min_identity: 最小 identity 阈值
min_coverage: 最小 coverage 阈值
allow_unknown_families: 是否允许未知家族
require_index_hit: 是否要求索引命中
Returns:
命令参数列表,以 'pixi run -e pipeline python' 开头
"""
cmd: List[str] = [
"pixi", "run", "-e", "pipeline",
"python", str(script_path),
"--toxicity_csv", str(toxicity_csv),
"--all_toxins", str(all_toxins),
"--output_dir", str(output_dir),
]
if min_identity and min_identity > 0:
cmd += ["--min_identity", str(min_identity)]
if min_coverage and min_coverage > 0:
cmd += ["--min_coverage", str(min_coverage)]
if not allow_unknown_families:
cmd += ["--disallow_unknown_families"]
if require_index_hit:
cmd += ["--require_index_hit"]
return cmd
def build_plot_command(
pixi_project_dir: Path,
script_path: Path,
strain_scores: Path,
toxin_support: Path,
species_scores: Path,
out_dir: Path,
merge_unresolved: bool = True,
report_mode: str = "paper",
lang: str = "zh",
per_hit_strain: Optional[str] = None,
) -> List[str]:
"""构建 Plot 命令列表(使用 pipeline 环境)。
Args:
pixi_project_dir: pixi.toml 所在目录
script_path: plot_shotter.py 脚本路径
strain_scores: strain_target_scores.tsv 文件路径
toxin_support: toxin_support.tsv 文件路径
species_scores: strain_target_species_scores.tsv 文件路径
out_dir: 输出目录
merge_unresolved: 是否合并 unresolved 列
report_mode: 报告模式 (paper/summary)
lang: 报告语言 (zh/en)
per_hit_strain: 可选的菌株名称用于绘制 per-hit 热图
Returns:
命令参数列表,以 'pixi run -e pipeline python' 开头
"""
cmd: List[str] = [
"pixi", "run", "-e", "pipeline",
"python", str(script_path),
"--strain_scores", str(strain_scores),
"--toxin_support", str(toxin_support),
"--species_scores", str(species_scores),
"--out_dir", str(out_dir),
"--report_mode", report_mode,
"--lang", lang,
]
if merge_unresolved:
cmd.append("--merge_unresolved")
if per_hit_strain:
cmd += ["--per_hit_strain", per_hit_strain]
return cmd
def create_pipeline_bundle(
bundle_path: Path,
digger_dir: Path,
shotter_dir: Path,
) -> bool:
"""创建 pipeline 结果打包文件。
Args:
bundle_path: 输出的 tar.gz 文件路径
digger_dir: digger 输出目录
shotter_dir: shotter 输出目录
Returns:
bool: 打包是否成功
"""
import tarfile
try:
bundle_path.parent.mkdir(parents=True, exist_ok=True)
with tarfile.open(bundle_path, "w:gz") as tar:
if digger_dir.exists():
tar.add(digger_dir, arcname="digger")
if shotter_dir.exists():
tar.add(shotter_dir, arcname="shotter")
return True
except Exception as e:
logger.error(f"创建打包文件失败: {e}")
return False
def verify_bundle_contents(bundle_path: Path) -> Dict[str, Any]:
"""验证打包文件内容。
Args:
bundle_path: tar.gz 文件路径
Returns:
包含验证结果的字典:
- valid: bool, 打包是否有效
- has_digger: bool, 是否包含 digger 目录
- has_shotter: bool, 是否包含 shotter 目录
- members: List[str], 所有成员名称
"""
import tarfile
result: Dict[str, Any] = {
"valid": False,
"has_digger": False,
"has_shotter": False,
"members": [],
}
try:
with tarfile.open(bundle_path, "r:gz") as tar:
members = tar.getnames()
result["members"] = members
result["has_digger"] = any(m.startswith("digger") for m in members)
result["has_shotter"] = any(m.startswith("shotter") for m in members)
result["valid"] = result["has_digger"] or result["has_shotter"]
except Exception as e:
logger.error(f"验证打包文件失败: {e
}")
return result