first add

2025-10-13 21:05:00 +08:00
parent c7744836e9
commit d71163df00
29 changed files with 144656 additions and 37 deletions
--- a/backend/app/utils/docker_client.py
+++ b/backend/app/utils/docker_client.py
@@ -0,0 +1,339 @@
+"""Docker/Podman 容器管理（修正版，支持 arm64/macOS 与 linux/amd64）"""
+from __future__ import annotations
+
+import os
+import subprocess
+import logging
+import time
+from pathlib import Path
+from typing import Dict, Any, Optional, List
+
+try:
+    import docker  # type: ignore
+except Exception:  # 允许在无 docker SDK 环境下使用 podman fallback
+    docker = None  # type: ignore
+
+from ..core.config import settings
+
+
+logger = logging.getLogger(__name__)
+
+
+def _which(cmd: str) -> Optional[str]:
+    from shutil import which
+    return which(cmd)
+
+
+class DockerContainerManager:
+    """容器管理器 - 兼容 Docker 与 Podman。
+
+    优先尝试 docker SDK；若不可用则回落到 podman CLI（或 docker CLI）。
+    在 arm64 主机上默认以 --platform linux/amd64 运行镜像。
+    """
+
+    def __init__(
+        self,
+        image: str = settings.DOCKER_IMAGE,
+        platform: str = settings.DOCKER_PLATFORM,
+    ) -> None:
+        self.image = image
+        self.platform = platform
+
+        self._engine: str = "docker"
+        self._client = None
+
+        # 首选 docker-py 客户端（若可用）
+        if docker is not None:
+            try:
+                self._client = docker.from_env()
+                # 探测 daemon
+                self._client.ping()
+                self._engine = "docker-sdk"
+            except Exception as err:
+                logger.info(f"docker SDK 不可用，将尝试 CLI 回落: {err}")
+                self._client = None
+
+        # CLI 回落：优先 podman，其次 docker
+        if self._client is None:
+            if _which("podman"):
+                self._engine = "podman-cli"
+            elif _which("docker"):
+                self._engine = "docker-cli"
+            else:
+                raise RuntimeError("未找到可用的容器引擎（需要 podman 或 docker）")
+
+        self._ensure_image()
+
+    # ----------------------------- 公共方法 -----------------------------
+    def run_command_in_container(
+        self,
+        command: List[str],
+        volumes: Dict[str, Dict[str, str]],
+        environment: Optional[Dict[str, str]] = None,
+        working_dir: str = "/workspace",
+        name: Optional[str] = None,
+        detach: bool = False,
+        remove: bool = True,
+    ) -> Dict[str, Any]:
+        """在容器中执行命令，返回执行结果。"""
+        if self._engine == "docker-sdk" and self._client is not None:
+            return self._run_with_docker_sdk(
+                command, volumes, environment, working_dir, name, detach, remove
+            )
+        else:
+            return self._run_with_cli(
+                command, volumes, environment, working_dir, name, detach, remove
+            )
+
+    def update_database(self, log_dir: Path) -> Dict[str, Any]:
+        """在容器中更新 BtToxin_Digger 数据库。"""
+        cmd = [
+            "/usr/local/env-execute",
+            "BtToxin_Digger",
+            "--update-db",
+        ]
+        vols = {str(log_dir): {"bind": "/logs", "mode": "rw"}}
+        result = self.run_command_in_container(
+            command=cmd, volumes=vols, working_dir="/tmp", name=f"bttoxin_update_db_{int(time.time())}"
+        )
+        if result.get("logs"):
+            (log_dir / "update_db.log").write_text(result["logs"], encoding="utf-8")
+        return result
+
+    def validate_reads_filenames(
+        self,
+        input_dir: Path,
+        platform: str,
+        reads1_suffix: str,
+        reads2_suffix: Optional[str] = None,
+        suffix_len: int = 0,
+    ) -> Dict[str, Any]:
+        files = list(input_dir.glob("*"))
+        if platform == "illumina":
+            r1 = [f for f in files if reads1_suffix and reads1_suffix in f.name]
+            r2 = [f for f in files if reads2_suffix and reads2_suffix in f.name]
+            if not r1 or not r2 or len(r1) != len(r2):
+                return {"valid": False, "error": "Illumina R1/R2 配对数量不匹配或缺失"}
+            for f1 in r1:
+                strain = f1.name.replace(reads1_suffix, "")
+                if not (input_dir / f"{strain}{reads2_suffix}").exists():
+                    return {"valid": False, "error": f"未找到配对文件: {strain}{reads2_suffix}"}
+            return {
+                "valid": True,
+                "strain_count": len(r1),
+                "suggested_suffix_len": suffix_len or len(reads1_suffix),
+            }
+        if platform in ("pacbio", "oxford"):
+            r = [f for f in files if reads1_suffix and reads1_suffix in f.name]
+            if not r:
+                return {"valid": False, "error": f"未找到匹配 {reads1_suffix} 的 reads 文件"}
+            return {
+                "valid": True,
+                "strain_count": len(r),
+                "suggested_suffix_len": suffix_len or len(reads1_suffix),
+            }
+        return {"valid": True}
+
+    def run_bttoxin_digger(
+        self,
+        input_dir: Path,
+        output_dir: Path,
+        log_dir: Path,
+        sequence_type: str = "nucl",
+        scaf_suffix: str = ".fna",
+        threads: int = 4,
+        **kwargs: Any,
+    ) -> Dict[str, Any]:
+        """在容器中运行 BtToxin_Digger 主分析（工作目录挂载到 /workspace）。"""
+        command: List[str] = [
+            "/usr/local/env-execute",
+            "BtToxin_Digger",
+            "--SeqPath",
+            "/data/input",
+            "--SequenceType",
+            sequence_type,
+            "--threads",
+            str(threads),
+        ]
+
+        if sequence_type == "nucl":
+            command += ["--Scaf_suffix", scaf_suffix]
+        elif sequence_type == "orfs":
+            command += ["--orfs_suffix", kwargs.get("orfs_suffix", ".ffn")]
+        elif sequence_type == "prot":
+            command += ["--prot_suffix", kwargs.get("prot_suffix", ".faa")]
+        elif sequence_type == "reads":
+            platform = kwargs.get("platform", "illumina")
+            command += ["--platform", platform]
+            if platform == "illumina":
+                r1 = kwargs.get("reads1_suffix", "_R1.fastq.gz")
+                r2 = kwargs.get("reads2_suffix", "_R2.fastq.gz")
+                sfx = kwargs.get("suffix_len") or len(r1)
+                v = self.validate_reads_filenames(input_dir, platform, r1, r2, sfx)
+                if not v.get("valid"):
+                    raise ValueError(f"Reads 文件验证失败: {v.get('error')}")
+                sfx = v.get("suggested_suffix_len", sfx)
+                command += ["--reads1", r1, "--reads2", r2, "--suffix_len", str(sfx)]
+            elif platform in ("pacbio", "oxford"):
+                r = kwargs.get("reads1_suffix", ".fastq.gz")
+                gsize = kwargs.get("genome_size", "6.07m")
+                sfx = kwargs.get("suffix_len") or len(r)
+                v = self.validate_reads_filenames(input_dir, platform, r, None, sfx)
+                if not v.get("valid"):
+                    raise ValueError(f"Reads 文件验证失败: {v.get('error')}")
+                sfx = v.get("suggested_suffix_len", sfx)
+                command += ["--reads1", r, "--genomeSize", gsize, "--suffix_len", str(sfx)]
+            elif platform == "hybrid":
+                short1 = kwargs.get("short1")
+                short2 = kwargs.get("short2")
+                long = kwargs.get("long")
+                if not all([short1, short2, long]):
+                    raise ValueError("hybrid 需要 short1/short2/long 三个完整文件名")
+                for fn in (short1, short2, long):
+                    if not (input_dir / fn).exists():
+                        raise ValueError(f"文件不存在: {fn}")
+                command += [
+                    "--short1",
+                    short1,
+                    "--short2",
+                    short2,
+                    "--long",
+                    long,
+                    "--hout",
+                    "/workspace/Results/Assembles/Hybrid",
+                ]
+
+        if kwargs.get("assemble_only"):
+            command.append("--assemble_only")
+
+        volumes = {
+            str(input_dir.resolve()): {"bind": "/data/input", "mode": "ro"},
+            str(output_dir.resolve()): {"bind": "/workspace", "mode": "rw"},
+            str(log_dir.resolve()): {"bind": "/data/logs", "mode": "rw"},
+        }
+
+        logger.info("开始 BtToxin_Digger 分析...")
+        result = self.run_command_in_container(
+            command=command,
+            volumes=volumes,
+            working_dir="/workspace",
+            name=f"bttoxin_digger_{int(time.time())}",
+        )
+
+        # 保存容器日志
+        logs_path = log_dir / "digger_execution.log"
+        if result.get("logs"):
+            logs_path.write_text(result["logs"], encoding="utf-8")
+            logger.info(f"容器日志已保存: {logs_path}")
+
+        # 验证输出
+        results_dir = output_dir / "Results"
+        if result.get("success") and results_dir.exists():
+            files = [f for f in results_dir.rglob("*") if f.is_file()]
+            result["output_files"] = len(files)
+        else:
+            result["output_files"] = 0
+        return result
+
+    # ----------------------------- 内部实现 -----------------------------
+    def _ensure_image(self) -> None:
+        if self._engine == "docker-sdk" and self._client is not None:
+            try:
+                self._client.images.get(self.image)
+                return
+            except Exception:
+                logger.info(f"拉取镜像 {self.image} (platform={self.platform}) ...")
+                self._client.images.pull(self.image, platform=self.platform)
+        else:
+            # CLI 模式：先尝试拉取
+            cli = "podman" if self._engine == "podman-cli" else "docker"
+            try:
+                subprocess.run(
+                    [cli, "pull", "--platform", self.platform, self.image],
+                    check=True,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.STDOUT,
+                    text=True,
+                )
+            except Exception as err:
+                logger.warning(f"{cli} pull 失败: {err}")
+
+    def _run_with_docker_sdk(
+        self,
+        command: List[str],
+        volumes: Dict[str, Dict[str, str]],
+        environment: Optional[Dict[str, str]],
+        working_dir: str,
+        name: Optional[str],
+        detach: bool,
+        remove: bool,
+    ) -> Dict[str, Any]:
+        assert self._client is not None
+        try:
+            container = self._client.containers.run(
+                image=self.image,
+                command=command,
+                volumes=volumes,
+                environment=environment or {},
+                working_dir=working_dir,
+                platform=self.platform,
+                name=name,
+                detach=detach,
+                remove=False,  # 等获取日志后再删
+                stdout=True,
+                stderr=True,
+            )
+            if detach:
+                return {"success": True, "container_id": container.id, "status": "running"}
+            exit_info = container.wait()
+            code = exit_info.get("StatusCode", 1)
+            logs = container.logs().decode("utf-8", errors="ignore")
+            if remove:
+                try:
+                    container.remove()
+                except Exception:
+                    pass
+            return {"success": code == 0, "exit_code": code, "logs": logs, "status": "completed" if code == 0 else "failed"}
+        except Exception as e:
+            logger.error(f"docker SDK 运行失败: {e}", exc_info=True)
+            return {"success": False, "error": str(e), "exit_code": -1, "status": "error"}
+
+    def _run_with_cli(
+        self,
+        command: List[str],
+        volumes: Dict[str, Dict[str, str]],
+        environment: Optional[Dict[str, str]],
+        working_dir: str,
+        name: Optional[str],
+        detach: bool,
+        remove: bool,
+    ) -> Dict[str, Any]:
+        cli = "podman" if self._engine == "podman-cli" else "docker"
+        cmd: List[str] = [cli, "run", "--rm" if remove and not detach else ""]
+        cmd = [c for c in cmd if c]
+        cmd += ["--platform", self.platform]
+        if name:
+            cmd += ["--name", name]
+        for host, spec in volumes.items():
+            bind = spec.get("bind")
+            mode = spec.get("mode", "rw")
+            cmd += ["-v", f"{host}:{bind}:{mode}"]
+        for k, v in (environment or {}).items():
+            cmd += ["-e", f"{k}={v}"]
+        cmd += ["-w", working_dir, self.image]
+        cmd += command
+
+        try:
+            if detach:
+                # 后台运行：CLI 简化返回
+                p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
+                return {"success": True, "status": "running", "pid": p.pid}
+            else:
+                proc = subprocess.run(cmd, capture_output=True, text=True)
+                out = (proc.stdout or "") + (proc.stderr or "")
+                return {"success": proc.returncode == 0, "exit_code": proc.returncode, "logs": out, "status": "completed" if proc.returncode == 0 else "failed"}
+        except Exception as e:
+            logger.error(f"{cli} 运行失败: {e}", exc_info=True)
+            return {"success": False, "error": str(e), "exit_code": -1, "status": "error"}
+
+