#!/usr/bin/env python3 """PixiRunner - 在 pixi 环境中执行 BtToxin_Digger 的运行器。 此模块提供 PixiRunner 类,用于替代 DockerContainerManager, 在本地 pixi 环境中运行 BtToxin_Digger 分析。 Example: runner = PixiRunner() result = runner.run_bttoxin_digger( input_dir=Path("input"), output_dir=Path("output"), log_dir=Path("logs"), ) """ from __future__ import annotations import json import logging import shutil import subprocess from pathlib import Path from typing import Any, Dict, List, Optional logger = logging.getLogger(__name__) class PixiRunnerError(Exception): """PixiRunner 相关错误的基类。""" pass class PixiNotInstalledError(PixiRunnerError): """pixi 未安装时抛出的错误。""" pass class EnvironmentNotFoundError(PixiRunnerError): """pixi 环境不存在时抛出的错误。""" pass class BtToxinDiggerNotFoundError(PixiRunnerError): """BtToxin_Digger 未找到时抛出的错误。""" pass class PixiRunner: """在 pixi 环境中执行 BtToxin_Digger 的运行器。 此类提供与 DockerContainerManager 兼容的接口, 但使用本地 pixi 环境而非 Docker 容器执行命令。 Attributes: pixi_project_dir: pixi.toml 所在目录 env_name: pixi 环境名称,默认为 "digger" """ def __init__( self, pixi_project_dir: Optional[Path] = None, env_name: str = "digger", ) -> None: """初始化运行器,验证 pixi 和环境可用性。 Args: pixi_project_dir: pixi.toml 所在目录,默认为项目根目录 env_name: pixi 环境名称,默认为 "digger" Raises: PixiNotInstalledError: pixi 未安装 EnvironmentNotFoundError: pixi 环境不存在 BtToxinDiggerNotFoundError: BtToxin_Digger 未找到 """ self.env_name = env_name # 确定 pixi 项目目录 if pixi_project_dir is None: # 默认为脚本所在目录的父目录(项目根目录) self.pixi_project_dir = Path(__file__).resolve().parents[1] else: self.pixi_project_dir = Path(pixi_project_dir).resolve() # 验证 pixi.toml 存在 pixi_toml = self.pixi_project_dir / "pixi.toml" if not pixi_toml.exists(): raise PixiRunnerError( f"pixi.toml 配置文件不存在: {pixi_toml}。" "请确保在项目根目录运行,或指定正确的 pixi_project_dir。" ) def check_environment(self) -> Dict[str, Any]: """检查 pixi 环境和 BtToxin_Digger 可用性。 Returns: 包含环境状态的字典: - pixi_installed: bool, pixi 是否已安装 - env_exists: bool, 指定环境是否存在 - bttoxin_available: bool, BtToxin_Digger 是否可用 - error: str | None, 错误信息(包含可操作的指导) """ result: Dict[str, Any] = { "pixi_installed": False, "env_exists": False, "bttoxin_available": False, "error": None, } # 检查 pixi 是否安装 try: proc = subprocess.run( ["pixi", "--version"], capture_output=True, text=True, timeout=10, ) if proc.returncode == 0: result["pixi_installed"] = True else: result["error"] = ( "pixi 未安装。请访问 https://pixi.sh 安装 pixi," "或运行: curl -fsSL https://pixi.sh/install.sh | bash" ) return result except FileNotFoundError: result["error"] = ( "pixi 未安装。请访问 https://pixi.sh 安装 pixi," "或运行: curl -fsSL https://pixi.sh/install.sh | bash" ) return result except subprocess.TimeoutExpired: result["error"] = "pixi 命令超时。请检查 pixi 安装是否正确。" return result except Exception as e: result["error"] = f"检查 pixi 时出错: {e}。请确保 pixi 已正确安装。" return result # 检查环境是否存在 try: proc = subprocess.run( ["pixi", "info", "--json"], capture_output=True, text=True, timeout=30, cwd=str(self.pixi_project_dir), ) if proc.returncode == 0: try: info = json.loads(proc.stdout) envs = info.get("environments_info", []) env_names = [e.get("name") for e in envs if e.get("name")] if self.env_name in env_names: result["env_exists"] = True else: result["error"] = ( f"{self.env_name} 环境不存在。" f"请在项目根目录运行 'pixi install' 安装依赖。" ) return result except json.JSONDecodeError: # 回退到简单检查 result["env_exists"] = True else: result["error"] = ( f"{self.env_name} 环境不存在。" f"请在项目根目录运行 'pixi install' 安装依赖。" ) return result except Exception as e: result["error"] = f"检查环境时出错: {e}。请运行 'pixi install' 安装依赖。" return result # 检查 BtToxin_Digger 是否可用 try: proc = subprocess.run( ["pixi", "run", "-e", self.env_name, "BtToxin_Digger", "--help"], capture_output=True, text=True, timeout=60, cwd=str(self.pixi_project_dir), ) output = proc.stdout + proc.stderr if proc.returncode == 0 or "BtToxin_Digger" in output or "Usage" in output: result["bttoxin_available"] = True else: result["error"] = ( "BtToxin_Digger 未找到。请确保 pixi install 已完成," "并且 pixi.toml 中包含 bttoxin_digger 依赖。" ) return result except FileNotFoundError: result["error"] = ( "pixi 命令未找到。请确保 pixi 已正确安装并在 PATH 中。" ) return result except subprocess.TimeoutExpired: result["error"] = ( "检查 BtToxin_Digger 超时。请运行 'pixi install' 确保环境已正确安装。" ) return result except Exception as e: result["error"] = ( f"检查 BtToxin_Digger 时出错: {e}。" "请运行 'pixi install' 安装依赖。" ) return result return result def build_digger_command( self, input_dir: Path, sequence_type: str = "nucl", scaf_suffix: str = ".fna", threads: int = 4, **kwargs: Any, ) -> List[str]: """构建 BtToxin_Digger 命令列表。 Args: input_dir: 输入文件目录 sequence_type: 序列类型 (nucl/orfs/prot/reads) scaf_suffix: scaffold 文件后缀 threads: 线程数 **kwargs: 其他 BtToxin_Digger 参数 Returns: 命令参数列表,以 'pixi run -e digger BtToxin_Digger' 开头 """ cmd: List[str] = [ "pixi", "run", "-e", self.env_name, "BtToxin_Digger", "--SeqPath", str(input_dir), "--SequenceType", sequence_type, "--threads", str(threads), ] if sequence_type == "nucl": cmd += ["--Scaf_suffix", scaf_suffix] elif sequence_type == "orfs": cmd += ["--orfs_suffix", kwargs.get("orfs_suffix", ".ffn")] elif sequence_type == "prot": cmd += ["--prot_suffix", kwargs.get("prot_suffix", ".faa")] elif sequence_type == "reads": platform = kwargs.get("platform", "illumina") cmd += ["--platform", platform] if platform == "illumina": r1 = kwargs.get("reads1_suffix", "_R1.fastq.gz") r2 = kwargs.get("reads2_suffix", "_R2.fastq.gz") sfx = kwargs.get("suffix_len") or len(r1) cmd += ["--reads1", r1, "--reads2", r2, "--suffix_len", str(sfx)] elif platform in ("pacbio", "oxford"): r = kwargs.get("reads1_suffix", ".fastq.gz") gsize = kwargs.get("genome_size", "6.07m") sfx = kwargs.get("suffix_len") or len(r) cmd += ["--reads1", r, "--genomeSize", gsize, "--suffix_len", str(sfx)] elif platform == "hybrid": short1 = kwargs.get("short1") short2 = kwargs.get("short2") long_reads = kwargs.get("long") if short1: cmd += ["--short1", short1] if short2: cmd += ["--short2", short2] if long_reads: cmd += ["--long", long_reads] hout = kwargs.get("hout") if hout: cmd += ["--hout", hout] if kwargs.get("assemble_only"): cmd.append("--assemble_only") return cmd def run_bttoxin_digger( self, input_dir: Path, output_dir: Path, log_dir: Path, sequence_type: str = "nucl", scaf_suffix: str = ".fna", threads: int = 4, bttoxin_db_dir: Optional[Path] = None, **kwargs: Any, ) -> Dict[str, Any]: """执行 BtToxin_Digger 分析。 Args: input_dir: 输入文件目录 output_dir: 输出目录 log_dir: 日志目录 sequence_type: 序列类型 (nucl/orfs/prot/reads) scaf_suffix: scaffold 文件后缀 threads: 线程数 bttoxin_db_dir: 外部数据库目录(可选) **kwargs: 其他 BtToxin_Digger 参数 Returns: 结果字典,包含: - success: bool, 执行是否成功 - exit_code: int, 进程退出码 - logs: str, 执行日志 - status: str, 状态 (completed/failed/error) - error: str (可选), 错误信息 - output_files: int (可选), 输出文件数量 """ # 确保目录存在 output_dir = Path(output_dir).resolve() log_dir = Path(log_dir).resolve() input_dir = Path(input_dir).resolve() output_dir.mkdir(parents=True, exist_ok=True) log_dir.mkdir(parents=True, exist_ok=True) # 准备工作目录结构 work_input_dir = output_dir / "input_files" work_input_dir.mkdir(parents=True, exist_ok=True) # 复制输入文件 if sequence_type == "nucl": pattern = f"*{scaf_suffix}" elif sequence_type == "orfs": pattern = f"*{kwargs.get('orfs_suffix', '.ffn')}" elif sequence_type == "prot": pattern = f"*{kwargs.get('prot_suffix', '.faa')}" else: pattern = "*" copied_files = 0 for f in input_dir.glob(pattern): if f.is_file(): shutil.copy2(f, work_input_dir / f.name) copied_files += 1 logger.info(f"已复制 {copied_files} 个输入文件到 {work_input_dir}") # 设置外部数据库(如果提供) if bttoxin_db_dir is not None: db_setup_ok = self._setup_external_db(bttoxin_db_dir) if not db_setup_ok: logger.warning("外部数据库设置失败,将使用默认数据库") # 构建命令 cmd = self.build_digger_command( input_dir=work_input_dir, sequence_type=sequence_type, scaf_suffix=scaf_suffix, threads=threads, **kwargs, ) logger.info(f"执行命令: {' '.join(cmd)}") # 执行命令 try: proc = subprocess.run( cmd, capture_output=True, text=True, cwd=str(output_dir), ) logs = f"=== STDOUT ===\n{proc.stdout}\n=== STDERR ===\n{proc.stderr}" # 保存日志 log_file = log_dir / "digger_execution.log" log_file.write_text(logs, encoding="utf-8") logger.info(f"日志已保存: {log_file}") # 构建结果 result: Dict[str, Any] = { "success": proc.returncode == 0, "exit_code": proc.returncode, "logs": logs, "status": "completed" if proc.returncode == 0 else "failed", } if proc.returncode != 0: result["error"] = ( f"BtToxin_Digger 执行失败 (exit={proc.returncode}): " f"{proc.stderr[:500] if proc.stderr else 'No error output'}" ) # 统计输出文件 results_dir = output_dir / "Results" if results_dir.exists(): files = [f for f in results_dir.rglob("*") if f.is_file()] result["output_files"] = len(files) else: result["output_files"] = 0 return result except FileNotFoundError as e: error_msg = ( f"命令执行失败: {e}。" "请确保 pixi 已正确安装并在 PATH 中。" "安装说明: https://pixi.sh" ) logger.error(error_msg) return { "success": False, "exit_code": -1, "logs": "", "status": "error", "error": error_msg, } except subprocess.TimeoutExpired as e: error_msg = f"命令执行超时: {e}" logger.error(error_msg) return { "success": False, "exit_code": -1, "logs": "", "status": "error", "error": error_msg, } except Exception as e: error_msg = f"执行 BtToxin_Digger 时出错: {e}" logger.error(error_msg, exc_info=True) return { "success": False, "exit_code": -1, "logs": "", "status": "error", "error": error_msg, } def _setup_external_db(self, bttoxin_db_dir: Path) -> bool: """设置外部数据库符号链接。 将 external_dbs/bt_toxin 链接到 pixi 环境的 BTTCMP_db/bt_toxin。 Args: bttoxin_db_dir: 外部数据库目录路径 Returns: bool: 设置是否成功 """ db_path = Path(bttoxin_db_dir).resolve() # 验证数据库目录结构 if not db_path.exists(): logger.warning(f"外部数据库目录不存在: {db_path}") return False if not (db_path / "db").exists(): logger.warning(f"外部数据库目录结构不完整,缺少 db 子目录: {db_path}") return False # 获取 pixi 环境路径 try: proc = subprocess.run( ["pixi", "info", "--json"], capture_output=True, text=True, timeout=30, cwd=str(self.pixi_project_dir), ) if proc.returncode != 0: logger.warning("无法获取 pixi 环境信息") return False info = json.loads(proc.stdout) envs_info = info.get("environments_info", []) env_prefix = None for env in envs_info: if env.get("name") == self.env_name: env_prefix = env.get("prefix") break if not env_prefix: logger.warning(f"未找到 {self.env_name} 环境的路径") return False # 创建符号链接 # BtToxin_Digger 期望数据库在 BTTCMP_db/bt_toxin env_path = Path(env_prefix) bttcmp_db_dir = env_path / "bin" / "BTTCMP_db" target_link = bttcmp_db_dir / "bt_toxin" # 确保 BTTCMP_db 目录存在 bttcmp_db_dir.mkdir(parents=True, exist_ok=True) # 如果已存在符号链接或目录,先删除 if target_link.exists() or target_link.is_symlink(): if target_link.is_symlink(): target_link.unlink() elif target_link.is_dir(): shutil.rmtree(target_link) # 创建符号链接 target_link.symlink_to(db_path) logger.info(f"已创建数据库符号链接: {target_link} -> {db_path}") return True except json.JSONDecodeError as e: logger.warning(f"解析 pixi info 输出失败: {e}") return False except PermissionError as e: logger.warning(f"无法创建数据库符号链接(权限不足): {e}") return False except Exception as e: logger.warning(f"设置外部数据库时出错: {e}") return False def build_shotter_command( pixi_project_dir: Path, script_path: Path, toxicity_csv: Path, all_toxins: Path, output_dir: Path, min_identity: float = 0.0, min_coverage: float = 0.0, allow_unknown_families: bool = True, require_index_hit: bool = False, ) -> List[str]: """构建 Shotter 命令列表(使用 pipeline 环境)。 Args: pixi_project_dir: pixi.toml 所在目录 script_path: bttoxin_shoter.py 脚本路径 toxicity_csv: 毒性数据 CSV 文件路径 all_toxins: All_Toxins.txt 文件路径 output_dir: 输出目录 min_identity: 最小 identity 阈值 min_coverage: 最小 coverage 阈值 allow_unknown_families: 是否允许未知家族 require_index_hit: 是否要求索引命中 Returns: 命令参数列表,以 'pixi run -e pipeline python' 开头 """ cmd: List[str] = [ "pixi", "run", "-e", "pipeline", "python", str(script_path), "--toxicity_csv", str(toxicity_csv), "--all_toxins", str(all_toxins), "--output_dir", str(output_dir), ] if min_identity and min_identity > 0: cmd += ["--min_identity", str(min_identity)] if min_coverage and min_coverage > 0: cmd += ["--min_coverage", str(min_coverage)] if not allow_unknown_families: cmd += ["--disallow_unknown_families"] if require_index_hit: cmd += ["--require_index_hit"] return cmd def build_plot_command( pixi_project_dir: Path, script_path: Path, strain_scores: Path, toxin_support: Path, species_scores: Path, out_dir: Path, merge_unresolved: bool = True, report_mode: str = "paper", lang: str = "zh", per_hit_strain: Optional[str] = None, ) -> List[str]: """构建 Plot 命令列表(使用 pipeline 环境)。 Args: pixi_project_dir: pixi.toml 所在目录 script_path: plot_shotter.py 脚本路径 strain_scores: strain_target_scores.tsv 文件路径 toxin_support: toxin_support.tsv 文件路径 species_scores: strain_target_species_scores.tsv 文件路径 out_dir: 输出目录 merge_unresolved: 是否合并 unresolved 列 report_mode: 报告模式 (paper/summary) lang: 报告语言 (zh/en) per_hit_strain: 可选的菌株名称用于绘制 per-hit 热图 Returns: 命令参数列表,以 'pixi run -e pipeline python' 开头 """ cmd: List[str] = [ "pixi", "run", "-e", "pipeline", "python", str(script_path), "--strain_scores", str(strain_scores), "--toxin_support", str(toxin_support), "--species_scores", str(species_scores), "--out_dir", str(out_dir), "--report_mode", report_mode, "--lang", lang, ] if merge_unresolved: cmd.append("--merge_unresolved") if per_hit_strain: cmd += ["--per_hit_strain", per_hit_strain] return cmd def create_pipeline_bundle( bundle_path: Path, digger_dir: Path, shotter_dir: Path, ) -> bool: """创建 pipeline 结果打包文件。 Args: bundle_path: 输出的 tar.gz 文件路径 digger_dir: digger 输出目录 shotter_dir: shotter 输出目录 Returns: bool: 打包是否成功 """ import tarfile try: bundle_path.parent.mkdir(parents=True, exist_ok=True) with tarfile.open(bundle_path, "w:gz") as tar: if digger_dir.exists(): tar.add(digger_dir, arcname="digger") if shotter_dir.exists(): tar.add(shotter_dir, arcname="shotter") return True except Exception as e: logger.error(f"创建打包文件失败: {e}") return False def verify_bundle_contents(bundle_path: Path) -> Dict[str, Any]: """验证打包文件内容。 Args: bundle_path: tar.gz 文件路径 Returns: 包含验证结果的字典: - valid: bool, 打包是否有效 - has_digger: bool, 是否包含 digger 目录 - has_shotter: bool, 是否包含 shotter 目录 - members: List[str], 所有成员名称 """ import tarfile result: Dict[str, Any] = { "valid": False, "has_digger": False, "has_shotter": False, "members": [], } try: with tarfile.open(bundle_path, "r:gz") as tar: members = tar.getnames() result["members"] = members result["has_digger"] = any(m.startswith("digger") for m in members) result["has_shotter"] = any(m.startswith("shotter") for m in members) result["valid"] = result["has_digger"] or result["has_shotter"] except Exception as e: logger.error(f"验证打包文件失败: {e }") return result