Fix(pipeline): prevent nested zip packaging and update CRISPR dependencies

- Add filter to skip .zip and .tar.gz files when creating result archive
- Update CRISPR feature with CASFinder dependencies (hmmer, blast, vmatch, etc.)
- Add install-casfinder task for macsydata installation
- Remove obsolete CRISPR test files

Co-Authored-By: Claude <noreply@anthropic.com>
2026-01-28 20:06:41 +08:00
parent e44692600c
commit 963215de2d
14 changed files with 1210 additions and 537 deletions
--- a/backend/app/workers/tasks.py
+++ b/backend/app/workers/tasks.py
@@ -160,6 +160,10 @@ def run_bttoxin_analysis(
                            if file_path == zip_path:
                                continue

+                            # 防止嵌套打包：忽略可能存在的 zip 或 tar.gz 文件
+                            if file.endswith('.zip') or file.endswith('.tar.gz'):
+                                continue
+
                            # 计算相对路径，例如 digger/Results/foo.txt -> Results/foo.txt
                            rel_path = file_path.relative_to(src_path)
                            # 构造新的归档路径 -> 1_Toxin_Mining/Results/foo.txt
--- a/docker-start.sh
+++ b/docker-start.sh
--- a/docker-stop.sh
+++ b/docker-stop.sh
--- a/docker/scripts/entrypoint.sh
+++ b/docker/scripts/entrypoint.sh
--- a/docker/scripts/switch-to-traefik.sh
+++ b/docker/scripts/switch-to-traefik.sh
--- a/pixi.lock
+++ b/pixi.lock
--- a/pixi.toml
+++ b/pixi.toml
@@ -63,12 +63,27 @@ pytest = "*"
 #
 # 预期依赖（待激活时添加）:
 [feature.crispr.dependencies]
-python = ">=3.9"
-# crisprcasfinder = "*"  # 需要配置安装源
-biopython = "*"
-pandas = ">=2.0.0"
+python = ">=3.10"
+wget = "*"
+curl = "*"
+git = "*"
+java-jdk = "*"
+parallel = "*"
+perl-app-cpanminus = "*"
+hmmer = "*"
+emboss = "*"
+blast = "*"
+perl-bioperl-core = "*"
+perl-xml-simple = "*"
+perl-digest-md5 = "*"
+vmatch = "*"
+muscle = "*"
+prodigal = "*"
+mamba = "*"
+macsyfinder = "==2.1.2"

 [feature.crispr.tasks]
+install-casfinder = "macsydata install -u CASFinder==3.1.0"
 detect = "python tools/crispr_cas_analysis/scripts/detect_crispr.py"
 fusion = "python tools/crispr_cas_analysis/scripts/fusion_analysis.py"

@@ -80,7 +95,7 @@ digger = ["digger"]
 pipeline = ["pipeline"]
 frontend = ["frontend"]
 webbackend = ["webbackend"]
-crispr = ["crispr", "pipeline"] # Inherit pipeline to get pandas etc? Or just pipeline deps. Let's make crispr feature add to pipeline env if possible, or just use pipeline env for running these tools since they are python.
+crispr = ["crispr"]

 # Actually, let's keep it simple. The tools are Python scripts. They can run in the 'pipeline' environment which has python and pandas.
 # The 'crispr' feature defines dependencies.
--- a/scripts/download_bpprc_data.py
+++ b/scripts/download_bpprc_data.py
--- a/scripts/start_web.sh
+++ b/scripts/start_web.sh
--- a/tools/bttoxin_digger/run_digger_pixi.sh
+++ b/tools/bttoxin_digger/run_digger_pixi.sh
--- a/tools/crispr_cas_analysis/pixi.toml
+++ b/tools/crispr_cas_analysis/pixi.toml
@@ -0,0 +1,30 @@
+[workspace]
+authors = ["zly <644706215@qq.com>"]
+channels = ["https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge", "https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda", "https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main"]
+name = "crispr_cas_analysis"
+platforms = ["linux-64"]
+version = "0.1.0"
+
+[dependencies]
+python = ">=3.10"
+wget = "*"
+curl = "*"
+git = "*"
+java-jdk = "*"
+parallel = "*"
+perl-app-cpanminus = "*"
+hmmer = "*"
+emboss = "*"
+blast = "*"
+perl-bioperl-core = "*"
+perl-xml-simple = "*"
+perl-digest-md5 = "*"
+vmatch = "*"
+muscle = "*"
+prodigal = "*"
+mamba = "*"
+macsyfinder = "==2.1.2"
+
+[tasks]
+install-casfinder = "macsydata install -u CASFinder==3.1.0"
+detect = "python scripts/detect_crispr.py"
--- a/tools/crispr_cas_analysis/tests/init.py
+++ b/tools/crispr_cas_analysis/tests/init.py
@@ -1 +0,0 @@
-"""Tests for CRISPR-Cas module"""
--- a/tools/crispr_cas_analysis/tests/test_detect_crispr.py
+++ b/tools/crispr_cas_analysis/tests/test_detect_crispr.py
@@ -1,42 +0,0 @@
-import pytest
-import json
-import shutil
-from pathlib import Path
-from crispr_cas.scripts.detect_crispr import generate_mock_results
-
-def test_generate_mock_results(tmp_path):
-    """Test mock result generation"""
-    input_file = tmp_path / "test_genome.fna"
-    input_file.touch()
-
-    results = generate_mock_results(input_file)
-
-    assert results["strain_id"] == "test_genome"
-    assert "cas_systems" in results
-    assert "arrays" in results
-    assert results["summary"]["has_cas"] is True
-    assert len(results["arrays"]) > 0
-
-def test_script_execution(tmp_path):
-    """Test full script execution via subprocess"""
-    # Create dummy input
-    input_file = tmp_path / "genome.fna"
-    input_file.touch()
-    output_file = tmp_path / "results.json"
-    script_path = Path("crispr_cas/scripts/detect_crispr.py").absolute()
-
-    import subprocess
-    cmd = [
-        "python3", str(script_path),
-        "--input", str(input_file),
-        "--output", str(output_file),
-        "--mock"
-    ]
-
-    result = subprocess.run(cmd, capture_output=True, text=True)
-    assert result.returncode == 0
-    assert output_file.exists()
-
-    with open(output_file) as f:
-        data = json.load(f)
-        assert data["strain_id"] == "genome"
--- a/tools/crispr_cas_analysis/tests/test_fusion_analysis.py
+++ b/tools/crispr_cas_analysis/tests/test_fusion_analysis.py
@@ -1,93 +0,0 @@
-import pytest
-import json
-from pathlib import Path
-import sys
-
-# Add project root to path to allow importing modules
-sys.path.insert(0, str(Path(__file__).parents[2]))
-
-from crispr_cas.scripts.fusion_analysis import calculate_distance, perform_fusion_analysis
-
-def test_calculate_distance():
-    """Test genomic distance calculation"""
-    # Same contig, no overlap
-    # Range1: 100-200, Range2: 300-400 -> Dist 100
-    assert calculate_distance("c1:100-200", "c1:300-400") == 100
-
-    # Same contig, overlap
-    # Range1: 100-300, Range2: 200-400 -> Dist 0
-    assert calculate_distance("c1:100-300", "c1:200-400") == 0
-
-    # Different contig
-    assert calculate_distance("c1:100-200", "c2:300-400") == -1
-
-    # Invalid format
-    assert calculate_distance("invalid", "c1:100-200") == -1
-
-def test_fusion_analysis_logic(tmp_path):
-    """Test main analysis logic with mock data"""
-
-    # Mock CRISPR data
-    crispr_data = {
-        "strain_id": "test_strain",
-        "arrays": [
-            {
-                "id": "A1",
-                "contig": "contig_1",
-                "start": 1000,
-                "end": 2000,
-                "spacers": [{"sequence": "ATGC"}]
-            }
-        ]
-    }
-
-    # Mock toxin file (just a placeholder for path)
-    toxin_file = tmp_path / "toxins.txt"
-    toxin_file.touch()
-
-    # Run analysis in mock mode
-    # In mock mode, the script generates its own toxin list:
-    # {"name": "Cry1Ac1", "position": "contig_1:10000-12000"}
-    # Distance: 10000 - 2000 = 8000 (< 10000 threshold) -> Should match
-
-    results = perform_fusion_analysis(crispr_data, toxin_file, mock=True)
-
-    assert results["strain_id"] == "test_strain"
-    assert len(results["associations"]) > 0
-
-    # Check for proximity match
-    proximity_matches = [a for a in results["associations"] if a["type"] == "proximity"]
-    assert len(proximity_matches) > 0
-    assert proximity_matches[0]["distance"] == 8000
-
-def test_script_execution(tmp_path):
-    """Test full script execution via subprocess"""
-
-    # Create input files
-    crispr_file = tmp_path / "crispr.json"
-    with open(crispr_file, 'w') as f:
-        json.dump({"strain_id": "test", "arrays": []}, f)
-
-    toxin_file = tmp_path / "toxins.txt"
-    toxin_file.touch()
-
-    genome_file = tmp_path / "genome.fna"
-    genome_file.touch()
-
-    output_file = tmp_path / "output.json"
-
-    script_path = Path("crispr_cas/scripts/fusion_analysis.py").absolute()
-
-    import subprocess
-    cmd = [
-        "python3", str(script_path),
-        "--crispr-results", str(crispr_file),
-        "--toxin-results", str(toxin_file),
-        "--genome", str(genome_file),
-        "--output", str(output_file),
-        "--mock"
-    ]
-
-    result = subprocess.run(cmd, capture_output=True, text=True)
-    assert result.returncode == 0
-    assert output_file.exists()