Fix(pipeline): prevent nested zip packaging and update CRISPR dependencies

- Add filter to skip .zip and .tar.gz files when creating result archive
- Update CRISPR feature with CASFinder dependencies (hmmer, blast, vmatch, etc.)
- Add install-casfinder task for macsydata installation
- Remove obsolete CRISPR test files

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
zly
2026-01-28 20:06:41 +08:00
parent e44692600c
commit 963215de2d
14 changed files with 1210 additions and 537 deletions

View File

@@ -160,6 +160,10 @@ def run_bttoxin_analysis(
if file_path == zip_path:
continue
# 防止嵌套打包:忽略可能存在的 zip 或 tar.gz 文件
if file.endswith('.zip') or file.endswith('.tar.gz'):
continue
# 计算相对路径,例如 digger/Results/foo.txt -> Results/foo.txt
rel_path = file_path.relative_to(src_path)
# 构造新的归档路径 -> 1_Toxin_Mining/Results/foo.txt

0
docker-start.sh Executable file → Normal file
View File

0
docker-stop.sh Executable file → Normal file
View File

0
docker/scripts/entrypoint.sh Executable file → Normal file
View File

0
docker/scripts/switch-to-traefik.sh Executable file → Normal file
View File

1552
pixi.lock

File diff suppressed because it is too large Load Diff

View File

@@ -63,12 +63,27 @@ pytest = "*"
#
# 预期依赖(待激活时添加):
[feature.crispr.dependencies]
python = ">=3.9"
# crisprcasfinder = "*" # 需要配置安装源
biopython = "*"
pandas = ">=2.0.0"
python = ">=3.10"
wget = "*"
curl = "*"
git = "*"
java-jdk = "*"
parallel = "*"
perl-app-cpanminus = "*"
hmmer = "*"
emboss = "*"
blast = "*"
perl-bioperl-core = "*"
perl-xml-simple = "*"
perl-digest-md5 = "*"
vmatch = "*"
muscle = "*"
prodigal = "*"
mamba = "*"
macsyfinder = "==2.1.2"
[feature.crispr.tasks]
install-casfinder = "macsydata install -u CASFinder==3.1.0"
detect = "python tools/crispr_cas_analysis/scripts/detect_crispr.py"
fusion = "python tools/crispr_cas_analysis/scripts/fusion_analysis.py"
@@ -80,7 +95,7 @@ digger = ["digger"]
pipeline = ["pipeline"]
frontend = ["frontend"]
webbackend = ["webbackend"]
crispr = ["crispr", "pipeline"] # Inherit pipeline to get pandas etc? Or just pipeline deps. Let's make crispr feature add to pipeline env if possible, or just use pipeline env for running these tools since they are python.
crispr = ["crispr"]
# Keep it simple: the tools are Python scripts, so they can run in the 'pipeline' environment, which has python and pandas.
# The 'crispr' feature defines the dependencies.

0
scripts/download_bpprc_data.py Executable file → Normal file
View File

0
scripts/start_web.sh Executable file → Normal file
View File

0
tools/bttoxin_digger/run_digger_pixi.sh Executable file → Normal file
View File

View File

@@ -0,0 +1,30 @@
# Pixi workspace manifest for the crispr_cas_analysis tool.
[workspace]
authors = ["zly <644706215@qq.com>"]
# All three channels (conda-forge, bioconda, pkgs/main) are fetched through the TUNA (Tsinghua) mirror.
channels = ["https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge", "https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda", "https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main"]
name = "crispr_cas_analysis"
# Only linux-64 is declared as a target platform.
platforms = ["linux-64"]
version = "0.1.0"
# Packages installed into the environment. Everything is unconstrained ("*")
# except python (lower bound only) and macsyfinder (exact pin).
[dependencies]
python = ">=3.10"
wget = "*"
curl = "*"
git = "*"
java-jdk = "*"
parallel = "*"
perl-app-cpanminus = "*"
hmmer = "*"
emboss = "*"
blast = "*"
perl-bioperl-core = "*"
perl-xml-simple = "*"
perl-digest-md5 = "*"
vmatch = "*"
muscle = "*"
prodigal = "*"
mamba = "*"
# Exact pin. NOTE(review): the install-casfinder task below calls `macsydata`,
# which presumably ships with macsyfinder - confirm before changing this version.
macsyfinder = "==2.1.2"
# Commands runnable through `pixi run <task>`.
[tasks]
# Installs the CASFinder model package, pinned to 3.1.0, via macsydata.
# NOTE(review): `-u` is presumably the per-user install location - confirm against the macsydata docs.
install-casfinder = "macsydata install -u CASFinder==3.1.0"
# Runs the detection script. The path is relative - presumably resolved against the
# workspace root; confirm the working directory pixi uses for tasks.
detect = "python scripts/detect_crispr.py"

View File

@@ -1 +0,0 @@
"""Tests for CRISPR-Cas module"""

View File

@@ -1,42 +0,0 @@
import pytest
import json
import shutil
from pathlib import Path
from crispr_cas.scripts.detect_crispr import generate_mock_results
def test_generate_mock_results(tmp_path):
    """Check the shape of the mock results produced for an empty genome file."""
    genome = tmp_path / "test_genome.fna"
    genome.touch()

    mock = generate_mock_results(genome)

    # The strain id is expected to match the input file name without its suffix.
    assert mock["strain_id"] == "test_genome"
    for required_key in ("cas_systems", "arrays"):
        assert required_key in mock
    assert mock["summary"]["has_cas"] is True
    assert len(mock["arrays"]) > 0
def test_script_execution(tmp_path):
    """Run detect_crispr.py with --mock in a subprocess and check its JSON output.

    The script is launched with the interpreter that is running the tests
    (``sys.executable``) rather than whatever ``python3`` happens to be first
    on PATH, so it sees the same environment and installed packages.
    """
    import subprocess
    import sys

    # Create dummy input
    input_file = tmp_path / "genome.fna"
    input_file.touch()
    output_file = tmp_path / "results.json"

    # NOTE: the script path is relative to the current working directory, so
    # the test has to be started from the directory containing crispr_cas/.
    script_path = Path("crispr_cas/scripts/detect_crispr.py").absolute()

    cmd = [
        sys.executable, str(script_path),
        "--input", str(input_file),
        "--output", str(output_file),
        "--mock",
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)

    # Surface the script's stderr on failure instead of a bare "1 != 0".
    assert result.returncode == 0, result.stderr
    assert output_file.exists()

    with open(output_file, encoding="utf-8") as f:
        data = json.load(f)
    assert data["strain_id"] == "genome"

View File

@@ -1,93 +0,0 @@
import pytest
import json
from pathlib import Path
import sys
# Add project root to path to allow importing modules
sys.path.insert(0, str(Path(__file__).parents[2]))
from crispr_cas.scripts.fusion_analysis import calculate_distance, perform_fusion_analysis
def test_calculate_distance():
    """Verify genomic distance calculation across the supported position cases."""
    cases = (
        # Same contig, no overlap: 100-200 vs 300-400 -> distance 100
        ("c1:100-200", "c1:300-400", 100),
        # Same contig, overlapping: 100-300 vs 200-400 -> distance 0
        ("c1:100-300", "c1:200-400", 0),
        # Different contigs -> -1
        ("c1:100-200", "c2:300-400", -1),
        # Invalid position format -> -1
        ("invalid", "c1:100-200", -1),
    )
    for first, second, expected in cases:
        assert calculate_distance(first, second) == expected
def test_fusion_analysis_logic(tmp_path):
    """Exercise perform_fusion_analysis in mock mode with one CRISPR array."""
    # Mock CRISPR data: a single array on contig_1 spanning 1000-2000
    single_array = {
        "id": "A1",
        "contig": "contig_1",
        "start": 1000,
        "end": 2000,
        "spacers": [{"sequence": "ATGC"}],
    }
    crispr_data = {"strain_id": "test_strain", "arrays": [single_array]}

    # Mock toxin file (just a placeholder for path)
    toxin_file = tmp_path / "toxins.txt"
    toxin_file.touch()

    # Run analysis in mock mode.
    # In mock mode, the script generates its own toxin list:
    #   {"name": "Cry1Ac1", "position": "contig_1:10000-12000"}
    # Distance: 10000 - 2000 = 8000 (< 10000 threshold) -> should match
    results = perform_fusion_analysis(crispr_data, toxin_file, mock=True)

    assert results["strain_id"] == "test_strain"
    associations = results["associations"]
    assert len(associations) > 0

    # Check for proximity match
    proximity_matches = [
        entry for entry in results["associations"] if entry["type"] == "proximity"
    ]
    assert len(proximity_matches) > 0
    assert proximity_matches[0]["distance"] == 8000
def test_script_execution(tmp_path):
    """Run fusion_analysis.py with --mock in a subprocess and check it succeeds.

    The script is launched with the interpreter that is running the tests
    (``sys.executable``) rather than whatever ``python3`` is first on PATH,
    and it is located relative to this test file so the test does not depend
    on the current working directory.
    """
    import subprocess

    # Create input files
    crispr_file = tmp_path / "crispr.json"
    with open(crispr_file, "w", encoding="utf-8") as f:
        json.dump({"strain_id": "test", "arrays": []}, f)

    toxin_file = tmp_path / "toxins.txt"
    toxin_file.touch()

    genome_file = tmp_path / "genome.fna"
    genome_file.touch()

    output_file = tmp_path / "output.json"

    # Same project root this file puts on sys.path for the crispr_cas imports.
    project_root = Path(__file__).resolve().parents[2]
    script_path = project_root / "crispr_cas" / "scripts" / "fusion_analysis.py"

    cmd = [
        sys.executable, str(script_path),
        "--crispr-results", str(crispr_file),
        "--toxin-results", str(toxin_file),
        "--genome", str(genome_file),
        "--output", str(output_file),
        "--mock",
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)

    # Surface the script's stderr on failure instead of a bare "1 != 0".
    assert result.returncode == 0, result.stderr
    assert output_file.exists()