feat: migrate from Docker to pixi for BtToxin_Digger execution

- Add pixi.toml with digger and pipeline environments - Implement PixiRunner class replacing DockerContainerManager - Add run_digger_stage.py for standalone digger execution - Update run_single_fna_pipeline.py to use PixiRunner - Remove docker dependency from pyproject.toml - Delete docker_client.py (no longer needed) BREAKING CHANGE: Docker is no longer required. Use 'pixi install' instead.
2026-01-08 16:58:45 +08:00
parent 1c0e8f90a5
commit ae4c6351d9
7 changed files with 7457 additions and 470 deletions
--- a/scripts/pixi_runner.py
+++ b/scripts/pixi_runner.py
@@ -0,0 +1,671 @@
+#!/usr/bin/env python3
+"""PixiRunner - 在 pixi 环境中执行 BtToxin_Digger 的运行器。
+
+此模块提供 PixiRunner 类，用于替代 DockerContainerManager，
+在本地 pixi 环境中运行 BtToxin_Digger 分析。
+
+Example:
+    runner = PixiRunner()
+    result = runner.run_bttoxin_digger(
+        input_dir=Path("input"),
+        output_dir=Path("output"),
+        log_dir=Path("logs"),
+    )
+"""
+from __future__ import annotations
+
+import json
+import logging
+import shutil
+import subprocess
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+logger = logging.getLogger(__name__)
+
+
+class PixiRunnerError(Exception):
+    """PixiRunner 相关错误的基类。"""
+    pass
+
+
+class PixiNotInstalledError(PixiRunnerError):
+    """pixi 未安装时抛出的错误。"""
+    pass
+
+
+class EnvironmentNotFoundError(PixiRunnerError):
+    """pixi 环境不存在时抛出的错误。"""
+    pass
+
+
+class BtToxinDiggerNotFoundError(PixiRunnerError):
+    """BtToxin_Digger 未找到时抛出的错误。"""
+    pass
+
+
+class PixiRunner:
+    """在 pixi 环境中执行 BtToxin_Digger 的运行器。
+
+    此类提供与 DockerContainerManager 兼容的接口，
+    但使用本地 pixi 环境而非 Docker 容器执行命令。
+    
+    Attributes:
+        pixi_project_dir: pixi.toml 所在目录
+        env_name: pixi 环境名称，默认为 "digger"
+    """
+
+    def __init__(
+        self,
+        pixi_project_dir: Optional[Path] = None,
+        env_name: str = "digger",
+    ) -> None:
+        """初始化运行器，验证 pixi 和环境可用性。
+        
+        Args:
+            pixi_project_dir: pixi.toml 所在目录，默认为项目根目录
+            env_name: pixi 环境名称，默认为 "digger"
+            
+        Raises:
+            PixiNotInstalledError: pixi 未安装
+            EnvironmentNotFoundError: pixi 环境不存在
+            BtToxinDiggerNotFoundError: BtToxin_Digger 未找到
+        """
+        self.env_name = env_name
+        
+        # 确定 pixi 项目目录
+        if pixi_project_dir is None:
+            # 默认为脚本所在目录的父目录（项目根目录）
+            self.pixi_project_dir = Path(__file__).resolve().parents[1]
+        else:
+            self.pixi_project_dir = Path(pixi_project_dir).resolve()
+        
+        # 验证 pixi.toml 存在
+        pixi_toml = self.pixi_project_dir / "pixi.toml"
+        if not pixi_toml.exists():
+            raise PixiRunnerError(
+                f"pixi.toml 配置文件不存在: {pixi_toml}。"
+                "请确保在项目根目录运行，或指定正确的 pixi_project_dir。"
+            )
+
+    def check_environment(self) -> Dict[str, Any]:
+        """检查 pixi 环境和 BtToxin_Digger 可用性。
+        
+        Returns:
+            包含环境状态的字典:
+            - pixi_installed: bool, pixi 是否已安装
+            - env_exists: bool, 指定环境是否存在
+            - bttoxin_available: bool, BtToxin_Digger 是否可用
+            - error: str | None, 错误信息（包含可操作的指导）
+        """
+        result: Dict[str, Any] = {
+            "pixi_installed": False,
+            "env_exists": False,
+            "bttoxin_available": False,
+            "error": None,
+        }
+        
+        # 检查 pixi 是否安装
+        try:
+            proc = subprocess.run(
+                ["pixi", "--version"],
+                capture_output=True,
+                text=True,
+                timeout=10,
+            )
+            if proc.returncode == 0:
+                result["pixi_installed"] = True
+            else:
+                result["error"] = (
+                    "pixi 未安装。请访问 https://pixi.sh 安装 pixi，"
+                    "或运行: curl -fsSL https://pixi.sh/install.sh | bash"
+                )
+                return result
+        except FileNotFoundError:
+            result["error"] = (
+                "pixi 未安装。请访问 https://pixi.sh 安装 pixi，"
+                "或运行: curl -fsSL https://pixi.sh/install.sh | bash"
+            )
+            return result
+        except subprocess.TimeoutExpired:
+            result["error"] = "pixi 命令超时。请检查 pixi 安装是否正确。"
+            return result
+        except Exception as e:
+            result["error"] = f"检查 pixi 时出错: {e}。请确保 pixi 已正确安装。"
+            return result
+        
+        # 检查环境是否存在
+        try:
+            proc = subprocess.run(
+                ["pixi", "info", "--json"],
+                capture_output=True,
+                text=True,
+                timeout=30,
+                cwd=str(self.pixi_project_dir),
+            )
+            if proc.returncode == 0:
+                try:
+                    info = json.loads(proc.stdout)
+                    envs = info.get("environments_info", [])
+                    env_names = [e.get("name") for e in envs if e.get("name")]
+                    if self.env_name in env_names:
+                        result["env_exists"] = True
+                    else:
+                        result["error"] = (
+                            f"{self.env_name} 环境不存在。"
+                            f"请在项目根目录运行 'pixi install' 安装依赖。"
+                        )
+                        return result
+                except json.JSONDecodeError:
+                    # 回退到简单检查
+                    result["env_exists"] = True
+            else:
+                result["error"] = (
+                    f"{self.env_name} 环境不存在。"
+                    f"请在项目根目录运行 'pixi install' 安装依赖。"
+                )
+                return result
+        except Exception as e:
+            result["error"] = f"检查环境时出错: {e}。请运行 'pixi install' 安装依赖。"
+            return result
+        
+        # 检查 BtToxin_Digger 是否可用
+        try:
+            proc = subprocess.run(
+                ["pixi", "run", "-e", self.env_name, "BtToxin_Digger", "--help"],
+                capture_output=True,
+                text=True,
+                timeout=60,
+                cwd=str(self.pixi_project_dir),
+            )
+            output = proc.stdout + proc.stderr
+            if proc.returncode == 0 or "BtToxin_Digger" in output or "Usage" in output:
+                result["bttoxin_available"] = True
+            else:
+                result["error"] = (
+                    "BtToxin_Digger 未找到。请确保 pixi install 已完成，"
+                    "并且 pixi.toml 中包含 bttoxin_digger 依赖。"
+                )
+                return result
+        except FileNotFoundError:
+            result["error"] = (
+                "pixi 命令未找到。请确保 pixi 已正确安装并在 PATH 中。"
+            )
+            return result
+        except subprocess.TimeoutExpired:
+            result["error"] = (
+                "检查 BtToxin_Digger 超时。请运行 'pixi install' 确保环境已正确安装。"
+            )
+            return result
+        except Exception as e:
+            result["error"] = (
+                f"检查 BtToxin_Digger 时出错: {e}。"
+                "请运行 'pixi install' 安装依赖。"
+            )
+            return result
+        
+        return result
+
+
+    def build_digger_command(
+        self,
+        input_dir: Path,
+        sequence_type: str = "nucl",
+        scaf_suffix: str = ".fna",
+        threads: int = 4,
+        **kwargs: Any,
+    ) -> List[str]:
+        """构建 BtToxin_Digger 命令列表。
+        
+        Args:
+            input_dir: 输入文件目录
+            sequence_type: 序列类型 (nucl/orfs/prot/reads)
+            scaf_suffix: scaffold 文件后缀
+            threads: 线程数
+            **kwargs: 其他 BtToxin_Digger 参数
+            
+        Returns:
+            命令参数列表，以 'pixi run -e digger BtToxin_Digger' 开头
+        """
+        cmd: List[str] = [
+            "pixi", "run", "-e", self.env_name,
+            "BtToxin_Digger",
+            "--SeqPath", str(input_dir),
+            "--SequenceType", sequence_type,
+            "--threads", str(threads),
+        ]
+        
+        if sequence_type == "nucl":
+            cmd += ["--Scaf_suffix", scaf_suffix]
+        elif sequence_type == "orfs":
+            cmd += ["--orfs_suffix", kwargs.get("orfs_suffix", ".ffn")]
+        elif sequence_type == "prot":
+            cmd += ["--prot_suffix", kwargs.get("prot_suffix", ".faa")]
+        elif sequence_type == "reads":
+            platform = kwargs.get("platform", "illumina")
+            cmd += ["--platform", platform]
+            if platform == "illumina":
+                r1 = kwargs.get("reads1_suffix", "_R1.fastq.gz")
+                r2 = kwargs.get("reads2_suffix", "_R2.fastq.gz")
+                sfx = kwargs.get("suffix_len") or len(r1)
+                cmd += ["--reads1", r1, "--reads2", r2, "--suffix_len", str(sfx)]
+            elif platform in ("pacbio", "oxford"):
+                r = kwargs.get("reads1_suffix", ".fastq.gz")
+                gsize = kwargs.get("genome_size", "6.07m")
+                sfx = kwargs.get("suffix_len") or len(r)
+                cmd += ["--reads1", r, "--genomeSize", gsize, "--suffix_len", str(sfx)]
+            elif platform == "hybrid":
+                short1 = kwargs.get("short1")
+                short2 = kwargs.get("short2")
+                long_reads = kwargs.get("long")
+                if short1:
+                    cmd += ["--short1", short1]
+                if short2:
+                    cmd += ["--short2", short2]
+                if long_reads:
+                    cmd += ["--long", long_reads]
+                hout = kwargs.get("hout")
+                if hout:
+                    cmd += ["--hout", hout]
+        
+        if kwargs.get("assemble_only"):
+            cmd.append("--assemble_only")
+        
+        return cmd
+
+    def run_bttoxin_digger(
+        self,
+        input_dir: Path,
+        output_dir: Path,
+        log_dir: Path,
+        sequence_type: str = "nucl",
+        scaf_suffix: str = ".fna",
+        threads: int = 4,
+        bttoxin_db_dir: Optional[Path] = None,
+        **kwargs: Any,
+    ) -> Dict[str, Any]:
+        """执行 BtToxin_Digger 分析。
+        
+        Args:
+            input_dir: 输入文件目录
+            output_dir: 输出目录
+            log_dir: 日志目录
+            sequence_type: 序列类型 (nucl/orfs/prot/reads)
+            scaf_suffix: scaffold 文件后缀
+            threads: 线程数
+            bttoxin_db_dir: 外部数据库目录（可选）
+            **kwargs: 其他 BtToxin_Digger 参数
+            
+        Returns:
+            结果字典，包含:
+            - success: bool, 执行是否成功
+            - exit_code: int, 进程退出码
+            - logs: str, 执行日志
+            - status: str, 状态 (completed/failed/error)
+            - error: str (可选), 错误信息
+            - output_files: int (可选), 输出文件数量
+        """
+        # 确保目录存在
+        output_dir = Path(output_dir).resolve()
+        log_dir = Path(log_dir).resolve()
+        input_dir = Path(input_dir).resolve()
+        
+        output_dir.mkdir(parents=True, exist_ok=True)
+        log_dir.mkdir(parents=True, exist_ok=True)
+        
+        # 准备工作目录结构
+        work_input_dir = output_dir / "input_files"
+        work_input_dir.mkdir(parents=True, exist_ok=True)
+        
+        # 复制输入文件
+        if sequence_type == "nucl":
+            pattern = f"*{scaf_suffix}"
+        elif sequence_type == "orfs":
+            pattern = f"*{kwargs.get('orfs_suffix', '.ffn')}"
+        elif sequence_type == "prot":
+            pattern = f"*{kwargs.get('prot_suffix', '.faa')}"
+        else:
+            pattern = "*"
+        
+        copied_files = 0
+        for f in input_dir.glob(pattern):
+            if f.is_file():
+                shutil.copy2(f, work_input_dir / f.name)
+                copied_files += 1
+        
+        logger.info(f"已复制 {copied_files} 个输入文件到 {work_input_dir}")
+        
+        # 设置外部数据库（如果提供）
+        if bttoxin_db_dir is not None:
+            db_setup_ok = self._setup_external_db(bttoxin_db_dir)
+            if not db_setup_ok:
+                logger.warning("外部数据库设置失败，将使用默认数据库")
+        
+        # 构建命令
+        cmd = self.build_digger_command(
+            input_dir=work_input_dir,
+            sequence_type=sequence_type,
+            scaf_suffix=scaf_suffix,
+            threads=threads,
+            **kwargs,
+        )
+        
+        logger.info(f"执行命令: {' '.join(cmd)}")
+        
+        # 执行命令
+        try:
+            proc = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                cwd=str(output_dir),
+            )
+            
+            logs = f"=== STDOUT ===\n{proc.stdout}\n=== STDERR ===\n{proc.stderr}"
+            
+            # 保存日志
+            log_file = log_dir / "digger_execution.log"
+            log_file.write_text(logs, encoding="utf-8")
+            logger.info(f"日志已保存: {log_file}")
+            
+            # 构建结果
+            result: Dict[str, Any] = {
+                "success": proc.returncode == 0,
+                "exit_code": proc.returncode,
+                "logs": logs,
+                "status": "completed" if proc.returncode == 0 else "failed",
+            }
+            
+            if proc.returncode != 0:
+                result["error"] = (
+                    f"BtToxin_Digger 执行失败 (exit={proc.returncode}): "
+                    f"{proc.stderr[:500] if proc.stderr else 'No error output'}"
+                )
+            
+            # 统计输出文件
+            results_dir = output_dir / "Results"
+            if results_dir.exists():
+                files = [f for f in results_dir.rglob("*") if f.is_file()]
+                result["output_files"] = len(files)
+            else:
+                result["output_files"] = 0
+            
+            return result
+            
+        except FileNotFoundError as e:
+            error_msg = (
+                f"命令执行失败: {e}。"
+                "请确保 pixi 已正确安装并在 PATH 中。"
+                "安装说明: https://pixi.sh"
+            )
+            logger.error(error_msg)
+            return {
+                "success": False,
+                "exit_code": -1,
+                "logs": "",
+                "status": "error",
+                "error": error_msg,
+            }
+        except subprocess.TimeoutExpired as e:
+            error_msg = f"命令执行超时: {e}"
+            logger.error(error_msg)
+            return {
+                "success": False,
+                "exit_code": -1,
+                "logs": "",
+                "status": "error",
+                "error": error_msg,
+            }
+        except Exception as e:
+            error_msg = f"执行 BtToxin_Digger 时出错: {e}"
+            logger.error(error_msg, exc_info=True)
+            return {
+                "success": False,
+                "exit_code": -1,
+                "logs": "",
+                "status": "error",
+                "error": error_msg,
+            }
+
+
+    def _setup_external_db(self, bttoxin_db_dir: Path) -> bool:
+        """设置外部数据库符号链接。
+        
+        将 external_dbs/bt_toxin 链接到 pixi 环境的 BTTCMP_db/bt_toxin。
+        
+        Args:
+            bttoxin_db_dir: 外部数据库目录路径
+            
+        Returns:
+            bool: 设置是否成功
+        """
+        db_path = Path(bttoxin_db_dir).resolve()
+        
+        # 验证数据库目录结构
+        if not db_path.exists():
+            logger.warning(f"外部数据库目录不存在: {db_path}")
+            return False
+        
+        if not (db_path / "db").exists():
+            logger.warning(f"外部数据库目录结构不完整，缺少 db 子目录: {db_path}")
+            return False
+        
+        # 获取 pixi 环境路径
+        try:
+            proc = subprocess.run(
+                ["pixi", "info", "--json"],
+                capture_output=True,
+                text=True,
+                timeout=30,
+                cwd=str(self.pixi_project_dir),
+            )
+            if proc.returncode != 0:
+                logger.warning("无法获取 pixi 环境信息")
+                return False
+            
+            info = json.loads(proc.stdout)
+            envs_info = info.get("environments_info", [])
+            
+            env_prefix = None
+            for env in envs_info:
+                if env.get("name") == self.env_name:
+                    env_prefix = env.get("prefix")
+                    break
+            
+            if not env_prefix:
+                logger.warning(f"未找到 {self.env_name} 环境的路径")
+                return False
+            
+            # 创建符号链接
+            # BtToxin_Digger 期望数据库在 BTTCMP_db/bt_toxin
+            env_path = Path(env_prefix)
+            bttcmp_db_dir = env_path / "bin" / "BTTCMP_db"
+            target_link = bttcmp_db_dir / "bt_toxin"
+            
+            # 确保 BTTCMP_db 目录存在
+            bttcmp_db_dir.mkdir(parents=True, exist_ok=True)
+            
+            # 如果已存在符号链接或目录，先删除
+            if target_link.exists() or target_link.is_symlink():
+                if target_link.is_symlink():
+                    target_link.unlink()
+                elif target_link.is_dir():
+                    shutil.rmtree(target_link)
+            
+            # 创建符号链接
+            target_link.symlink_to(db_path)
+            logger.info(f"已创建数据库符号链接: {target_link} -> {db_path}")
+            return True
+            
+        except json.JSONDecodeError as e:
+            logger.warning(f"解析 pixi info 输出失败: {e}")
+            return False
+        except PermissionError as e:
+            logger.warning(f"无法创建数据库符号链接（权限不足）: {e}")
+            return False
+        except Exception as e:
+            logger.warning(f"设置外部数据库时出错: {e}")
+            return False
+
+
+def build_shotter_command(
+    pixi_project_dir: Path,
+    script_path: Path,
+    toxicity_csv: Path,
+    all_toxins: Path,
+    output_dir: Path,
+    min_identity: float = 0.0,
+    min_coverage: float = 0.0,
+    allow_unknown_families: bool = True,
+    require_index_hit: bool = False,
+) -> List[str]:
+    """构建 Shotter 命令列表（使用 pipeline 环境）。
+    
+    Args:
+        pixi_project_dir: pixi.toml 所在目录
+        script_path: bttoxin_shoter.py 脚本路径
+        toxicity_csv: 毒性数据 CSV 文件路径
+        all_toxins: All_Toxins.txt 文件路径
+        output_dir: 输出目录
+        min_identity: 最小 identity 阈值
+        min_coverage: 最小 coverage 阈值
+        allow_unknown_families: 是否允许未知家族
+        require_index_hit: 是否要求索引命中
+        
+    Returns:
+        命令参数列表，以 'pixi run -e pipeline python' 开头
+    """
+    cmd: List[str] = [
+        "pixi", "run", "-e", "pipeline",
+        "python", str(script_path),
+        "--toxicity_csv", str(toxicity_csv),
+        "--all_toxins", str(all_toxins),
+        "--output_dir", str(output_dir),
+    ]
+    
+    if min_identity and min_identity > 0:
+        cmd += ["--min_identity", str(min_identity)]
+    if min_coverage and min_coverage > 0:
+        cmd += ["--min_coverage", str(min_coverage)]
+    if not allow_unknown_families:
+        cmd += ["--disallow_unknown_families"]
+    if require_index_hit:
+        cmd += ["--require_index_hit"]
+    
+    return cmd
+
+
+def build_plot_command(
+    pixi_project_dir: Path,
+    script_path: Path,
+    strain_scores: Path,
+    toxin_support: Path,
+    species_scores: Path,
+    out_dir: Path,
+    merge_unresolved: bool = True,
+    report_mode: str = "paper",
+    lang: str = "zh",
+    per_hit_strain: Optional[str] = None,
+) -> List[str]:
+    """构建 Plot 命令列表（使用 pipeline 环境）。
+    
+    Args:
+        pixi_project_dir: pixi.toml 所在目录
+        script_path: plot_shotter.py 脚本路径
+        strain_scores: strain_target_scores.tsv 文件路径
+        toxin_support: toxin_support.tsv 文件路径
+        species_scores: strain_target_species_scores.tsv 文件路径
+        out_dir: 输出目录
+        merge_unresolved: 是否合并 unresolved 列
+        report_mode: 报告模式 (paper/summary)
+        lang: 报告语言 (zh/en)
+        per_hit_strain: 可选的菌株名称用于绘制 per-hit 热图
+        
+    Returns:
+        命令参数列表，以 'pixi run -e pipeline python' 开头
+    """
+    cmd: List[str] = [
+        "pixi", "run", "-e", "pipeline",
+        "python", str(script_path),
+        "--strain_scores", str(strain_scores),
+        "--toxin_support", str(toxin_support),
+        "--species_scores", str(species_scores),
+        "--out_dir", str(out_dir),
+        "--report_mode", report_mode,
+        "--lang", lang,
+    ]
+    
+    if merge_unresolved:
+        cmd.append("--merge_unresolved")
+    
+    if per_hit_strain:
+        cmd += ["--per_hit_strain", per_hit_strain]
+    
+    return cmd
+
+
+
+def create_pipeline_bundle(
+    bundle_path: Path,
+    digger_dir: Path,
+    shotter_dir: Path,
+) -> bool:
+    """创建 pipeline 结果打包文件。
+    
+    Args:
+        bundle_path: 输出的 tar.gz 文件路径
+        digger_dir: digger 输出目录
+        shotter_dir: shotter 输出目录
+        
+    Returns:
+        bool: 打包是否成功
+    """
+    import tarfile
+    
+    try:
+        bundle_path.parent.mkdir(parents=True, exist_ok=True)
+        with tarfile.open(bundle_path, "w:gz") as tar:
+            if digger_dir.exists():
+                tar.add(digger_dir, arcname="digger")
+            if shotter_dir.exists():
+                tar.add(shotter_dir, arcname="shotter")
+        return True
+    except Exception as e:
+        logger.error(f"创建打包文件失败: {e}")
+        return False
+
+
+def verify_bundle_contents(bundle_path: Path) -> Dict[str, Any]:
+    """验证打包文件内容。
+    
+    Args:
+        bundle_path: tar.gz 文件路径
+        
+    Returns:
+        包含验证结果的字典:
+        - valid: bool, 打包是否有效
+        - has_digger: bool, 是否包含 digger 目录
+        - has_shotter: bool, 是否包含 shotter 目录
+        - members: List[str], 所有成员名称
+    """
+    import tarfile
+    
+    result: Dict[str, Any] = {
+        "valid": False,
+        "has_digger": False,
+        "has_shotter": False,
+        "members": [],
+    }
+    
+    try:
+        with tarfile.open(bundle_path, "r:gz") as tar:
+            members = tar.getnames()
+            result["members"] = members
+            result["has_digger"] = any(m.startswith("digger") for m in members)
+            result["has_shotter"] = any(m.startswith("shotter") for m in members)
+            result["valid"] = result["has_digger"] or result["has_shotter"]
+    except Exception as e:
+        logger.error(f"验证打包文件失败: {e
+}")
+    
+    return result