Files
bttoxin-pipeline/backend/app/workers/tasks.py

318 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Celery 任务 - 支持并发控制和多阶段 pipeline"""
from celery import Task
from pathlib import Path
import shutil
import logging
import asyncio
import subprocess
import os
from ..core.celery_app import celery_app
from ..core.docker_client import DockerManager
from ..database import SessionLocal
from ..models.job import Job, JobStatus
from ..services.concurrency_manager import get_concurrency_manager
logger = logging.getLogger(__name__)
# Pipeline 阶段定义
PIPELINE_STAGES = ["digger", "crispr", "shoter", "plots", "bundle"]
def run_local_command(cmd: list, cwd: Path = None, env: dict = None) -> dict:
    """Run a command locally in the container (no shell involved).

    Args:
        cmd: Argument vector passed straight to ``subprocess.run``.
        cwd: Working directory for the child process, or None to inherit.
        env: Environment mapping; defaults to a copy of the current environment.

    Returns:
        dict with keys ``success``, ``stdout``, ``stderr`` and ``exit_code``.
        If the process could not even be launched, an ``error`` key holds the
        exception message and the other keys carry neutral defaults so callers
        always see the same shape.
    """
    # Same instance as the module-level logger; bound here so the function is
    # self-contained.
    logger = logging.getLogger(__name__)
    try:
        logger.info(f"Running command: {' '.join(cmd)}")
        result = subprocess.run(
            cmd,
            cwd=cwd,
            env=env or os.environ.copy(),
            capture_output=True,
            text=True,
            check=False,
        )
        return {
            'success': result.returncode == 0,
            'stdout': result.stdout,
            'stderr': result.stderr,
            'exit_code': result.returncode,
        }
    except Exception as e:
        logger.error(f"Command failed: {e}")
        # Bug fix: keep the return shape consistent with the success path so
        # callers using res['stdout'] / res.get('stderr') never hit a missing
        # key; 'error' is additionally set for diagnostics.
        return {
            'success': False,
            'stdout': '',
            'stderr': '',
            'exit_code': -1,
            'error': str(e),
        }
@celery_app.task(bind=True, max_retries=3)
def run_bttoxin_analysis(
    self,
    job_id: str,
    input_dir: str,
    output_dir: str,
    sequence_type: str = "nucl",
    scaf_suffix: str = ".fna",
    threads: int = 4,
    min_identity: float = 0.8,
    min_coverage: float = 0.6,
    allow_unknown_families: bool = False,
    require_index_hit: bool = True,
    crispr_fusion: bool = False,
    crispr_weight: float = 0.0,
):
    """Execute the full analysis pipeline for one job.

    Stages:
        1.   digger - BtToxin_Digger toxin-gene identification (runs in Docker)
        1.5  crispr - optional CRISPR-Cas detection and toxin/CRISPR fusion
        2.   shoter - BtToxin_Shoter toxicity-activity assessment
        3.   plots  - heatmap generation
        4.   bundle - manifest creation / result packaging

    Progress and the current stage are mirrored into both the Job row and the
    Celery task state (``PROGRESS`` updates) so either one can be polled.

    Returns:
        dict with 'job_id', 'status', 'stages' (the stages that actually
        completed) and 'output_dir'.

    Raises:
        Exception: re-raised after marking the Job FAILED so that Celery's
        retry policy (max_retries=3) can apply.
    """
    import json  # local import kept deliberately; module import block untouched

    db = SessionLocal()
    job = None  # lets the except-handler know whether the row was ever loaded
    # Bug fix: track the stages that actually ran instead of hard-coding
    # ["digger"] into the manifest / logs / return value.
    completed_stages = []
    try:
        job = db.query(Job).filter(Job.id == job_id).first()
        if not job:
            logger.error(f"Job {job_id} not found")
            return {'job_id': job_id, 'status': 'error', 'error': 'Job not found'}

        # Mark as queued, then immediately as running.
        # NOTE(review): execution-slot acquisition via the concurrency manager
        # is not wired in yet (Celery workers are synchronous); the task runs
        # right away. TODO: integrate get_concurrency_manager().
        job.status = JobStatus.QUEUED
        db.commit()
        job.status = JobStatus.RUNNING
        job.current_stage = "digger"
        job.progress_percent = 0
        db.commit()

        # ---- Stage 1: Digger - identify toxin genes -------------------------
        logger.info(f"Job {job_id}: Starting Digger stage")
        self.update_state(
            state='PROGRESS',
            meta={'stage': 'digger', 'progress': 10, 'status': 'Running BtToxin_Digger...'}
        )
        docker_manager = DockerManager()
        digger_result = docker_manager.run_bttoxin_digger(
            input_dir=Path(input_dir),
            output_dir=Path(output_dir),
            sequence_type=sequence_type,
            scaf_suffix=scaf_suffix,
            threads=threads
        )
        if not digger_result['success']:
            raise Exception(f"Digger stage failed: {digger_result.get('error', 'Unknown error')}")
        completed_stages.append("digger")
        job.progress_percent = 40
        db.commit()

        # ---- Stage 1.5: CRISPR-Cas (optional) -------------------------------
        crispr_results_file = None
        input_files = []
        if crispr_fusion:
            logger.info(f"Job {job_id}: Starting CRISPR stage")
            job.current_stage = "crispr"
            db.commit()
            self.update_state(
                state='PROGRESS',
                meta={'stage': 'crispr', 'progress': 45, 'status': 'Running CRISPR Detection...'}
            )
            crispr_out = Path(output_dir) / "crispr" / "results.json"
            crispr_out.parent.mkdir(parents=True, exist_ok=True)
            # 1. Detection
            detect_cmd = [
                "pixi", "run", "-e", "crispr", "python", "crispr_cas/scripts/detect_crispr.py",
                "--input", str(Path(input_dir) / f"{job_id}{scaf_suffix}"),  # placeholder; replaced below
                "--output", str(crispr_out),
                "--mock"  # the real detector is not installed yet
            ]
            # The upload may be named differently; use the first file matching
            # the scaffold suffix instead of assuming "<job_id><suffix>".
            input_files = list(Path(input_dir).glob(f"*{scaf_suffix}"))
            if input_files:
                detect_cmd[7] = str(input_files[0])  # index 7 = the --input value
            res = run_local_command(detect_cmd, cwd=Path("/app"))
            if not res['success']:
                logger.warning(f"CRISPR detection failed: {res.get('stderr')}")
            else:
                crispr_results_file = crispr_out
                completed_stages.append("crispr")
            # 2. Fusion analysis, combining CRISPR hits with Digger toxin calls.
            # Digger output is assumed at Results/Toxins/All_Toxins.txt — TODO
            # confirm against DockerManager's actual output layout.
            fusion_out = Path(output_dir) / "crispr" / "fusion_analysis.json"
            toxins_file = Path(output_dir) / "Results" / "Toxins" / "All_Toxins.txt"
            # Bug fix: also require a genome file — the original indexed
            # input_files[0] unconditionally and could raise IndexError.
            if toxins_file.exists() and input_files:
                fusion_cmd = [
                    "pixi", "run", "-e", "crispr", "python", "crispr_cas/scripts/fusion_analysis.py",
                    "--crispr-results", str(crispr_out),
                    "--toxin-results", str(toxins_file),
                    "--genome", str(input_files[0]),
                    "--output", str(fusion_out),
                    "--mock"
                ]
                run_local_command(fusion_cmd, cwd=Path("/app"))

        # ---- Stage 2: Shoter - assess toxicity activity ---------------------
        logger.info(f"Job {job_id}: Starting Shoter stage")
        job.current_stage = "shoter"
        db.commit()
        self.update_state(
            state='PROGRESS',
            meta={'stage': 'shoter', 'progress': 50, 'status': 'Running BtToxin_Shoter...'}
        )
        # Digger output is expected at output_dir/Results/Toxins/All_Toxins.txt;
        # the Shoter script tolerates a missing/empty file, so we run it even
        # when Digger produced no hits.
        toxins_file = Path(output_dir) / "Results" / "Toxins" / "All_Toxins.txt"
        shoter_out_dir = Path(output_dir) / "shoter"
        shoter_cmd = [
            "pixi", "run", "-e", "pipeline", "python", "scripts/bttoxin_shoter.py",
            "--all_toxins", str(toxins_file),
            "--output_dir", str(shoter_out_dir),
            "--min_identity", str(min_identity),
            "--min_coverage", str(min_coverage)
        ]
        if allow_unknown_families:
            shoter_cmd.append("--allow_unknown_families")
        if require_index_hit:
            shoter_cmd.append("--require_index_hit")
        # CRISPR integration: only pass CRISPR flags when detection succeeded,
        # so --crispr_fusion never appears without --crispr_results.
        if crispr_results_file:
            shoter_cmd.extend(["--crispr_results", str(crispr_results_file)])
            shoter_cmd.extend(["--crispr_weight", str(crispr_weight)])
            if crispr_fusion:
                shoter_cmd.append("--crispr_fusion")
        shoter_res = run_local_command(shoter_cmd, cwd=Path("/app"))
        if shoter_res['success']:
            completed_stages.append("shoter")
        job.progress_percent = 70
        db.commit()

        # ---- Stage 3: Plots - generate heatmaps -----------------------------
        logger.info(f"Job {job_id}: Starting Plots stage")
        job.current_stage = "plots"
        db.commit()
        self.update_state(
            state='PROGRESS',
            meta={'stage': 'plots', 'progress': 80, 'status': 'Generating plots...'}
        )
        plot_cmd = [
            "pixi", "run", "-e", "pipeline", "python", "scripts/plot_shotter.py",
            "--strain_scores", str(shoter_out_dir / "strain_target_scores.tsv"),
            "--toxin_support", str(shoter_out_dir / "toxin_support.tsv"),
            "--species_scores", str(shoter_out_dir / "strain_target_species_scores.tsv"),
            "--out_dir", str(shoter_out_dir),
            "--output_prefix", "Activity_Heatmap"
        ]
        if crispr_results_file:
            plot_cmd.extend(["--crispr_results", str(crispr_results_file)])
            if crispr_fusion:
                plot_cmd.append("--crispr_fusion")
        plot_res = run_local_command(plot_cmd, cwd=Path("/app"))
        if plot_res['success']:
            completed_stages.append("plots")
        job.progress_percent = 90
        db.commit()

        # ---- Stage 4: Bundle - write manifest / package results -------------
        logger.info(f"Job {job_id}: Starting Bundle stage")
        job.current_stage = "bundle"
        db.commit()
        self.update_state(
            state='PROGRESS',
            meta={'stage': 'bundle', 'progress': 95, 'status': 'Bundling results...'}
        )
        completed_stages.append("bundle")
        manifest = {
            "job_id": job_id,
            # Bug fix: report what actually ran, not the hard-coded
            # ["digger"] / everything-else-skipped placeholder.
            "stages_completed": completed_stages,
            "stages_skipped": [s for s in PIPELINE_STAGES if s not in completed_stages],
            "output_files": [str(p) for p in Path(output_dir).rglob("*")],
            "parameters": {
                "sequence_type": sequence_type,
                "min_identity": min_identity,
                "min_coverage": min_coverage,
                "allow_unknown_families": allow_unknown_families,
                "require_index_hit": require_index_hit,
                "crispr_fusion": crispr_fusion,
                "crispr_weight": crispr_weight,
            }
        }
        manifest_path = Path(output_dir) / "manifest.json"
        with open(manifest_path, "w") as f:
            json.dump(manifest, f, indent=2, default=str)

        # Done.
        job.status = JobStatus.COMPLETED
        job.progress_percent = 100
        job.current_stage = "completed"
        job.logs = json.dumps({"stages": completed_stages, "output": str(output_dir)})
        db.commit()
        logger.info(f"Job {job_id}: Completed successfully")
        return {
            'job_id': job_id,
            'status': 'completed',
            'stages': completed_stages,
            'output_dir': str(output_dir)
        }
    except Exception as e:
        logger.error(f"Job {job_id} failed: {e}")
        # Bug fix: `job` is None/unbound if the failure happened before the
        # row was loaded (e.g. the query itself raised); guard before use.
        if job is not None:
            job.status = JobStatus.FAILED
            job.error_message = str(e)
            job.current_stage = "failed"
            db.commit()
        raise
    finally:
        db.close()
@celery_app.task
def update_queue_positions():
    """Recompute the queue position of every QUEUED job.

    Jobs are ordered by creation time and numbered from 1; intended to be
    triggered periodically (e.g. via Celery Beat).
    """
    session = SessionLocal()
    try:
        # All queued jobs, oldest first.
        queued_jobs = (
            session.query(Job)
            .filter(Job.status == JobStatus.QUEUED)
            .order_by(Job.created_at)
            .all()
        )
        position = 0
        for queued_job in queued_jobs:
            position += 1
            queued_job.queue_position = position
        session.commit()
        logger.info(f"Updated queue positions for {len(queued_jobs)} jobs")
    except Exception as e:
        logger.error(f"Failed to update queue positions: {e}")
        session.rollback()
    finally:
        session.close()