Refactor: Unified pipeline execution, simplified UI, and fixed Docker config

- Backend: Refactored tasks.py to directly invoke run_single_fna_pipeline.py for consistency.
- Backend: Changed output format to ZIP and added auto-cleanup of intermediate files.
- Backend: Fixed language parameter passing in API and tasks.
- Frontend: Removed CRISPR Fusion UI elements from Submit and Monitor views.
- Frontend: Implemented simulated progress bar for better UX.
- Frontend: Restored One-click load button and added result file structure documentation.
- Docker: Fixed critical container restart loop by removing an incorrect image directive in docker-compose.yml.
- Docker: Optimized Dockerfile to correct .pixi environment path issues and prevent accidental deletion of frontend assets.
This commit is contained in:
zly
2026-01-20 20:25:25 +08:00
parent 5067169b0b
commit c75c85c53b
134 changed files with 146457 additions and 996647 deletions

View File

@@ -6,19 +6,17 @@ import logging
import asyncio
import subprocess
import os
import zipfile
import json
from ..core.celery_app import celery_app
from ..core.docker_client import DockerManager
from ..core.tool_runner import ToolRunner
from ..database import SessionLocal
from ..models.job import Job, JobStatus
from ..services.concurrency_manager import get_concurrency_manager
logger = logging.getLogger(__name__)
# Pipeline stage definitions — ordered list of the analysis stages
# reported to the job tracker (presumably used for progress/stage labels
# elsewhere in this module — confirm against callers).
PIPELINE_STAGES = ["digger", "crispr", "shoter", "plots", "bundle"]
def run_local_command(cmd: list, cwd: Path = None, env: dict = None) -> dict:
"""Run a command locally in the container"""
try:
@@ -42,7 +40,7 @@ def run_local_command(cmd: list, cwd: Path = None, env: dict = None) -> dict:
return {'success': False, 'error': str(e)}
@celery_app.task(bind=True, max_retries=3)
@celery_app.task(bind=True, max_retries=3, name="backend.app.workers.tasks.run_bttoxin_analysis")
def run_bttoxin_analysis(
self,
job_id: str,
@@ -57,17 +55,13 @@ def run_bttoxin_analysis(
require_index_hit: bool = True,
crispr_fusion: bool = False,
crispr_weight: float = 0.0,
lang: str = "zh"
):
"""
执行分析任务 - 完整的 4 阶段 pipeline
Stages:
1. digger - BtToxin_Digger 识别毒素基因
2. shoter - BtToxin_Shoter 评估毒性活性
3. plots - 生成热力图
4. bundle - 打包结果
执行分析任务 - 使用 scripts/run_single_fna_pipeline.py 统一脚本
"""
db = SessionLocal()
job = None
try:
job = db.query(Job).filter(Job.id == job_id).first()
@@ -75,199 +69,118 @@ def run_bttoxin_analysis(
logger.error(f"Job {job_id} not found")
return {'job_id': job_id, 'status': 'error', 'error': 'Job not found'}
# 更新状态为 QUEUED
job.status = JobStatus.QUEUED
db.commit()
# 尝试获取执行槽位(使用同步 Redis因为 Celery 是同步的)
# 注意:这里简化处理,实际应该用异步
# 暂时直接执行,稍后集成真正的并发控制
# 更新状态为 RUNNING
job.status = JobStatus.RUNNING
job.current_stage = "digger"
job.current_stage = "running"
job.progress_percent = 0
db.commit()
# 阶段 1: Digger - 识别毒素基因
logger.info(f"Job {job_id}: Starting Digger stage")
self.update_state(
state='PROGRESS',
meta={'stage': 'digger', 'progress': 10, 'status': 'Running BtToxin_Digger...'}
)
# 准备路径
input_path = Path(input_dir)
output_path = Path(output_dir)
docker_manager = DockerManager()
digger_result = docker_manager.run_bttoxin_digger(
input_dir=Path(input_dir),
output_dir=Path(output_dir),
sequence_type=sequence_type,
scaf_suffix=scaf_suffix,
threads=threads
)
if not digger_result['success']:
raise Exception(f"Digger stage failed: {digger_result.get('error', 'Unknown error')}")
job.progress_percent = 40
db.commit()
# 阶段 1.5: CRISPR-Cas (如果启用)
crispr_results_file = None
if crispr_fusion:
logger.info(f"Job {job_id}: Starting CRISPR stage")
job.current_stage = "crispr"
db.commit()
self.update_state(
state='PROGRESS',
meta={'stage': 'crispr', 'progress': 45, 'status': 'Running CRISPR Detection...'}
)
crispr_out = Path(output_dir) / "crispr" / "results.json"
crispr_out.parent.mkdir(parents=True, exist_ok=True)
# 1. Detection
detect_cmd = [
"pixi", "run", "-e", "crispr", "python", "crispr_cas/scripts/detect_crispr.py",
"--input", str(Path(input_dir) / f"{job_id}{scaf_suffix}"), # Assuming input file name matches
"--output", str(crispr_out),
"--mock" # Always use mock for now as we don't have the tool installed
]
# Find input file - might be named differently
input_files = list(Path(input_dir).glob(f"*{scaf_suffix}"))
if input_files:
detect_cmd[7] = str(input_files[0])
res = run_local_command(detect_cmd, cwd=Path("/app"))
if not res['success']:
logger.warning(f"CRISPR detection failed: {res.get('stderr')}")
# 查找输入文件 (由于 input_dir 是上传目录,里面应该有一个文件)
input_files = list(input_path.glob(f"*{scaf_suffix}"))
if not input_files:
# 尝试查找任意文件
files = [f for f in input_path.iterdir() if f.is_file()]
if files:
input_file = files[0]
else:
crispr_results_file = crispr_out
raise FileNotFoundError(f"No input file found in {input_dir}")
else:
input_file = input_files[0]
# 2. Fusion (if requested)
fusion_out = Path(output_dir) / "crispr" / "fusion_analysis.json"
# TODO: We need the toxins file from Digger output.
# Assuming Digger output structure: output_dir/Results/Toxins/All_Toxins.txt (Need to verify)
# But DockerManager output might be different. Let's assume standard structure.
toxins_file = Path(output_dir) / "Results" / "Toxins" / "All_Toxins.txt"
logger.info(f"Job {job_id}: Starting pipeline for {input_file}")
if toxins_file.exists():
fusion_cmd = [
"pixi", "run", "-e", "crispr", "python", "crispr_cas/scripts/fusion_analysis.py",
"--crispr-results", str(crispr_out),
"--toxin-results", str(toxins_file),
"--genome", str(input_files[0]),
"--output", str(fusion_out),
"--mock"
]
run_local_command(fusion_cmd, cwd=Path("/app"))
# 阶段 2: Shoter - 评估毒性活性
logger.info(f"Job {job_id}: Starting Shoter stage")
job.current_stage = "shoter"
db.commit()
self.update_state(
state='PROGRESS',
meta={'stage': 'shoter', 'progress': 50, 'status': 'Running BtToxin_Shoter...'}
)
# 构建 Shoter 命令
# 假设 Digger 输出在 output_dir/Results/Toxins/All_Toxins.txt
toxins_file = Path(output_dir) / "Results" / "Toxins" / "All_Toxins.txt"
shoter_out_dir = Path(output_dir) / "shoter"
# 即使 Digger 失败或没有结果,我们也可以尝试运行(脚本会处理空文件)
# 如果文件不存在,可能 Digger 结构不同,需要适配
shoter_cmd = [
"pixi", "run", "-e", "pipeline", "python", "scripts/bttoxin_shoter.py",
"--all_toxins", str(toxins_file),
"--output_dir", str(shoter_out_dir),
# 构建 pipeline 命令
# 使用 pixi run -e pipeline 来执行脚本,确保环境一致
pipeline_cmd = [
"pixi", "run", "-e", "pipeline", "python", "scripts/run_single_fna_pipeline.py",
"--input", str(input_file),
"--out_root", str(output_path),
"--toxicity_csv", "Data/toxicity-data.csv",
"--min_identity", str(min_identity),
"--min_coverage", str(min_coverage)
"--min_coverage", str(min_coverage),
"--threads", str(threads),
"--lang", lang
]
if allow_unknown_families:
shoter_cmd.append("--allow_unknown_families")
if not allow_unknown_families:
pipeline_cmd.append("--disallow_unknown_families")
if require_index_hit:
shoter_cmd.append("--require_index_hit")
pipeline_cmd.append("--require_index_hit")
# CRISPR Integration
if crispr_results_file:
shoter_cmd.extend(["--crispr_results", str(crispr_results_file)])
shoter_cmd.extend(["--crispr_weight", str(crispr_weight)])
if crispr_fusion:
shoter_cmd.append("--crispr_fusion")
# 执行脚本
res = run_local_command(pipeline_cmd, cwd=Path("/app"))
run_local_command(shoter_cmd, cwd=Path("/app"))
if not res['success']:
error_msg = f"Pipeline execution failed (exit={res['exit_code']}): {res['stderr']}"
logger.error(error_msg)
raise Exception(error_msg)
job.progress_percent = 70
db.commit()
logger.info(f"Job {job_id}: Pipeline script completed")
# 阶段 3: Plots - 生成热力图
logger.info(f"Job {job_id}: Starting Plots stage")
job.current_stage = "plots"
db.commit()
self.update_state(
state='PROGRESS',
meta={'stage': 'plots', 'progress': 80, 'status': 'Generating plots...'}
)
# 结果打包 (Zip)
logger.info(f"Job {job_id}: Creating zip bundle")
zip_path = output_path / f"pipeline_results_{job_id}.zip"
plot_cmd = [
"pixi", "run", "-e", "pipeline", "python", "scripts/plot_shotter.py",
"--strain_scores", str(shoter_out_dir / "strain_target_scores.tsv"),
"--toxin_support", str(shoter_out_dir / "toxin_support.tsv"),
"--species_scores", str(shoter_out_dir / "strain_target_species_scores.tsv"),
"--out_dir", str(shoter_out_dir),
"--output_prefix", "Activity_Heatmap"
]
# 需要打包的子目录
subdirs_to_zip = ["digger", "shoter", "logs"]
if crispr_results_file:
plot_cmd.extend(["--crispr_results", str(crispr_results_file)])
if crispr_fusion:
plot_cmd.append("--crispr_fusion")
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
# 添加输入文件
zipf.write(input_file, arcname=input_file.name)
run_local_command(plot_cmd, cwd=Path("/app"))
# 添加结果目录
for subdir_name in subdirs_to_zip:
subdir_path = output_path / subdir_name
if subdir_path.exists():
for root, dirs, files in os.walk(subdir_path):
for file in files:
file_path = Path(root) / file
# 保持相对路径结构
arcname = file_path.relative_to(output_path)
zipf.write(file_path, arcname=str(arcname))
job.progress_percent = 90
db.commit()
# 删除原始结果目录 (保留 logs 以便调试? 或者也删除)
# 根据需求:只保留压缩包
logger.info(f"Job {job_id}: Cleaning up intermediate files")
for subdir_name in subdirs_to_zip:
subdir_path = output_path / subdir_name
if subdir_path.exists():
shutil.rmtree(subdir_path)
# 阶段 4: Bundle - 打包结果
logger.info(f"Job {job_id}: Starting Bundle stage")
job.current_stage = "bundle"
db.commit()
self.update_state(
state='PROGRESS',
meta={'stage': 'bundle', 'progress': 95, 'status': 'Bundling results...'}
)
# 删除 tar.gz (如果脚本生成了)
tar_gz = output_path / "pipeline_results.tar.gz"
if tar_gz.exists():
tar_gz.unlink()
# 创建 manifest.json
import json
manifest = {
"job_id": job_id,
"stages_completed": ["digger"],
"stages_skipped": ["shoter", "plots", "bundle"],
"output_files": list(Path(output_dir).rglob("*")),
"parameters": {
"sequence_type": sequence_type,
"min_identity": min_identity,
"min_coverage": min_coverage,
"allow_unknown_families": allow_unknown_families,
"require_index_hit": require_index_hit,
"crispr_fusion": crispr_fusion,
"crispr_weight": crispr_weight,
}
}
# 移除 stage 目录 (run_single_fna_pipeline 生成的)
stage_dir = output_path / "stage"
if stage_dir.exists():
shutil.rmtree(stage_dir)
manifest_path = Path(output_dir) / "manifest.json"
with open(manifest_path, "w") as f:
json.dump(manifest, f, indent=2, default=str)
# 验证 Zip 是否生成
if not zip_path.exists():
raise Exception("Failed to create result zip file")
# 重命名为标准下载名 (或者保持这样,由 API 决定下载名)
# 这里的 output_dir 就是 API 下载时寻找的地方
# downloadResult API 默认可能找 pipeline_results.tar.gz?
# 我们需要确保 frontend 下载链接改为 zip并且后端 API 能找到这个文件
# 目前后端 API (backend/app/api/v1/results.py) 可能需要调整,或者我们把 zip 命名为 API 期望的名字?
# 假设 API 期望 output_dir 下有文件。
# 为了兼容,我们把 zip 命名为 pipeline_results.zip (通用)
# 但前端生成的下载链接是 pipeline_results_{id}.zip
# 完成
job.status = JobStatus.COMPLETED
job.progress_percent = 100
job.current_stage = "completed"
job.logs = json.dumps({"stages": ["digger"], "output": str(output_dir)})
# 记录日志摘要
job.logs = res['stdout'][-2000:] if res['stdout'] else "No output"
db.commit()
logger.info(f"Job {job_id}: Completed successfully")
@@ -275,16 +188,19 @@ def run_bttoxin_analysis(
return {
'job_id': job_id,
'status': 'completed',
'stages': ['digger'],
'output_dir': str(output_dir)
}
except Exception as e:
logger.error(f"Job {job_id} failed: {e}")
job.status = JobStatus.FAILED
job.error_message = str(e)
job.current_stage = "failed"
db.commit()
if job:
try:
job.status = JobStatus.FAILED
job.error_message = str(e)
job.current_stage = "failed"
db.commit()
except Exception as commit_error:
logger.error(f"Failed to update job status to FAILED: {commit_error}")
raise
finally: