Fix(pipeline): prevent nested zip packaging and update CRISPR dependencies

- Add filter to skip .zip and .tar.gz files when creating result archive
- Update CRISPR feature with CASFinder dependencies (hmmer, blast, vmatch, etc.)
- Add install-casfinder task for macsydata installation
- Remove obsolete CRISPR test files

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
zly
2026-01-28 20:06:41 +08:00
parent e44692600c
commit 963215de2d
14 changed files with 1210 additions and 537 deletions

View File

@@ -0,0 +1,30 @@
# Environment manifest for the CRISPR-Cas analysis workspace.
# NOTE(review): the [workspace]/[dependencies]/[tasks] layout looks like a
# pixi manifest — confirm against the actual file name.
[workspace]
authors = ["zly <644706215@qq.com>"]
# All three channels (conda-forge, bioconda, pkgs/main) point at the TUNA mirror.
channels = ["https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge", "https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda", "https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main"]
name = "crispr_cas_analysis"
# linux-64 is the only declared target platform.
platforms = ["linux-64"]
version = "0.1.0"
# Packages for the environment. "*" leaves the version unconstrained;
# only python (lower bound) and macsyfinder (exact pin) are restricted.
[dependencies]
python = ">=3.10"
wget = "*"
curl = "*"
git = "*"
java-jdk = "*"
parallel = "*"
perl-app-cpanminus = "*"
hmmer = "*"
emboss = "*"
blast = "*"
perl-bioperl-core = "*"
perl-xml-simple = "*"
perl-digest-md5 = "*"
vmatch = "*"
muscle = "*"
prodigal = "*"
mamba = "*"
# Exact pin. NOTE(review): presumably this package supplies the `macsydata`
# command used by the install-casfinder task — confirm.
macsyfinder = "==2.1.2"
# Named commands runnable inside the environment.
[tasks]
# Installs the CASFinder model package at version 3.1.0 through macsydata.
# NOTE(review): `-u` is presumably macsydata's per-user install flag — confirm.
install-casfinder = "macsydata install -u CASFinder==3.1.0"
# Runs the CRISPR detection script; the path is relative to where the task runs.
detect = "python scripts/detect_crispr.py"

View File

@@ -1 +0,0 @@
"""Tests for CRISPR-Cas module"""

View File

@@ -1,42 +0,0 @@
import pytest
import json
import shutil
from pathlib import Path
from crispr_cas.scripts.detect_crispr import generate_mock_results
def test_generate_mock_results(tmp_path):
    """Check the shape of the mock results produced for an empty genome file.

    Args:
        tmp_path: pytest's per-test temporary directory fixture.
    """
    genome_path = tmp_path / "test_genome.fna"
    genome_path.touch()

    mock = generate_mock_results(genome_path)

    # For the input "test_genome.fna" the test expects strain id "test_genome".
    assert mock["strain_id"] == "test_genome"
    for required_key in ("cas_systems", "arrays"):
        assert required_key in mock
    assert mock["summary"]["has_cas"] is True
    assert len(mock["arrays"]) > 0
def test_script_execution(tmp_path):
    """Run detect_crispr.py end to end in a subprocess with ``--mock``.

    An empty placeholder file is passed as ``--input``. The test passes when
    the script exits with status 0 and writes a JSON document to ``--output``
    whose ``strain_id`` equals ``"genome"``.

    Args:
        tmp_path: pytest's per-test temporary directory fixture.
    """
    import subprocess
    import sys

    # Create dummy input
    input_file = tmp_path / "genome.fna"
    input_file.touch()
    output_file = tmp_path / "results.json"

    # NOTE: this relative path is resolved against the current working
    # directory, so pytest must be started from the directory that contains
    # ``crispr_cas/``.
    script_path = Path("crispr_cas/scripts/detect_crispr.py").absolute()

    # sys.executable keeps the child on the interpreter that is running the
    # tests; a bare "python3" is looked up on PATH and may belong to a
    # different environment without the project's dependencies.
    cmd = [
        sys.executable, str(script_path),
        "--input", str(input_file),
        "--output", str(output_file),
        "--mock",
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)

    # Attach the child's stderr so a failing run explains itself.
    assert result.returncode == 0, result.stderr
    assert output_file.exists()

    with open(output_file, encoding="utf-8") as f:
        data = json.load(f)
    assert data["strain_id"] == "genome"

View File

@@ -1,93 +0,0 @@
import pytest
import json
from pathlib import Path
import sys
# Add project root to path to allow importing modules
sys.path.insert(0, str(Path(__file__).parents[2]))
from crispr_cas.scripts.fusion_analysis import calculate_distance, perform_fusion_analysis
def test_calculate_distance():
    """Check calculate_distance on gapped, overlapping and unusable inputs."""
    expectations = (
        # Same contig, no overlap: 100-200 vs 300-400 -> distance 100.
        ("c1:100-200", "c1:300-400", 100),
        # Same contig, overlapping: 100-300 vs 200-400 -> distance 0.
        ("c1:100-300", "c1:200-400", 0),
        # Different contigs -> -1.
        ("c1:100-200", "c2:300-400", -1),
        # First argument has an invalid format -> -1.
        ("invalid", "c1:100-200", -1),
    )
    for first, second, expected in expectations:
        assert calculate_distance(first, second) == expected
def test_fusion_analysis_logic(tmp_path):
    """Exercise perform_fusion_analysis in mock mode with one CRISPR array.

    Args:
        tmp_path: pytest's per-test temporary directory fixture.
    """
    # One array on contig_1 spanning 1000-2000 with a single spacer.
    single_array = {
        "id": "A1",
        "contig": "contig_1",
        "start": 1000,
        "end": 2000,
        "spacers": [{"sequence": "ATGC"}],
    }
    crispr_input = {"strain_id": "test_strain", "arrays": [single_array]}

    # Empty file; it only supplies a path for the toxin-results argument.
    toxin_path = tmp_path / "toxins.txt"
    toxin_path.touch()

    # Expectation carried over from the original test notes (not visible in
    # this file — verify against fusion_analysis.py): mock mode supplies a
    # toxin "Cry1Ac1" at contig_1:10000-12000, so the gap to the array is
    # 10000 - 2000 = 8000, under a 10000 threshold, and should be reported.
    outcome = perform_fusion_analysis(crispr_input, toxin_path, mock=True)

    assert outcome["strain_id"] == "test_strain"
    assert len(outcome["associations"]) > 0

    # Keep only the associations flagged as proximity matches.
    nearby = [
        assoc for assoc in outcome["associations"]
        if assoc["type"] == "proximity"
    ]
    assert len(nearby) > 0
    assert nearby[0]["distance"] == 8000
def test_script_execution(tmp_path):
    """Run fusion_analysis.py end to end in a subprocess with ``--mock``.

    A CRISPR results JSON with no arrays plus empty toxin and genome
    placeholder files are passed in. The test passes when the script exits
    with status 0 and the ``--output`` file exists.

    Args:
        tmp_path: pytest's per-test temporary directory fixture.
    """
    import subprocess

    # Create input files
    crispr_file = tmp_path / "crispr.json"
    with open(crispr_file, "w", encoding="utf-8") as f:
        json.dump({"strain_id": "test", "arrays": []}, f)
    toxin_file = tmp_path / "toxins.txt"
    toxin_file.touch()
    genome_file = tmp_path / "genome.fna"
    genome_file.touch()
    output_file = tmp_path / "output.json"

    # NOTE: this relative path is resolved against the current working
    # directory, so pytest must be started from the directory that contains
    # ``crispr_cas/``.
    script_path = Path("crispr_cas/scripts/fusion_analysis.py").absolute()

    # sys.executable keeps the child on the interpreter that is running the
    # tests; a bare "python3" is looked up on PATH and may belong to a
    # different environment without the project's dependencies.
    cmd = [
        sys.executable, str(script_path),
        "--crispr-results", str(crispr_file),
        "--toxin-results", str(toxin_file),
        "--genome", str(genome_file),
        "--output", str(output_file),
        "--mock",
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)

    # Attach the child's stderr so a failing run explains itself.
    assert result.returncode == 0, result.stderr
    assert output_file.exists()