Files
bttoxin-pipeline/crispr_cas/scripts/detect_crispr.py

139 lines
4.6 KiB
Python

#!/usr/bin/env python3
"""
CRISPR-Cas Detection Wrapper
Wrapper for CRISPRCasFinder or similar tools to detect CRISPR arrays and Cas genes.
"""
import argparse
import json
import logging
import shutil
import subprocess
import sys
from pathlib import Path
from typing import Dict, List, Any
# Set up root logging once at import time (INFO level, timestamped records)
# and create the module-level logger used throughout this script.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
def parse_args():
    """Define the command-line interface and parse ``sys.argv``.

    Returns:
        argparse.Namespace with the attributes ``input`` and ``output``
        (both required, converted to ``Path``), ``tool_path`` (``Path`` or
        ``None`` when omitted) and ``mock`` (``True`` only when ``--mock``
        is given).
    """
    cli = argparse.ArgumentParser(description="Detect CRISPR arrays and Cas genes in genome")

    # (flags, keyword settings) pairs, registered in the order they appear in --help.
    option_table = (
        (("--input", "-i"),
         {"type": Path, "required": True, "help": "Input genome file (.fna)"}),
        (("--output", "-o"),
         {"type": Path, "required": True, "help": "Output JSON results file"}),
        (("--tool-path",),
         {"type": Path, "default": None, "help": "Path to CRISPRCasFinder.pl"}),
        (("--mock",),
         {"action": "store_true",
          "help": "Use mock data (for testing without external tools)"}),
    )
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)

    return cli.parse_args()
def check_dependencies(tool_path: Path = None) -> bool:
    """Report whether a CRISPRCasFinder script can be located.

    Args:
        tool_path: Optional explicit path to ``CRISPRCasFinder.pl``. When it
            is ``None`` (the default) only the ``PATH`` lookup is performed.

    Returns:
        ``True`` if ``tool_path`` points at an existing regular file, or if
        ``CRISPRCasFinder.pl`` is found on ``PATH`` via ``shutil.which``;
        ``False`` otherwise.
    """
    # The explicit path must be the script itself: a directory (or any other
    # non-file entry) merely existing does not make the tool runnable, so it
    # must not short-circuit the check.
    if tool_path and tool_path.is_file():
        return True

    # Otherwise fall back to searching PATH for the script name.
    return shutil.which("CRISPRCasFinder.pl") is not None
def generate_mock_results(genome_file: Path) -> Dict[str, Any]:
    """Build a canned CRISPR-Cas result set for runs without external tools.

    Everything except ``strain_id`` is hard-coded. The genome file is never
    opened; only its name (for the log line) and stem are used.

    Args:
        genome_file: Path whose stem becomes the ``strain_id`` of the result.

    Returns:
        A dict with the keys ``strain_id``, ``cas_systems``, ``arrays``,
        ``summary`` and ``metadata``, in that order.
    """
    logger.info(f"Generating mock CRISPR results for {genome_file.name}")

    # Both mock arrays use the same consensus repeat.
    shared_repeat = "GTTTTAGAGCTATGCTGTTTTGAATGGTCCCAAAAC"

    cas_system = {
        "type": "I-E",
        "subtype": "I-E",
        "position": "contig_1:15000-25000",
        "genes": ["cas1", "cas2", "cas3", "casA", "casB", "casC", "casD", "casE"],
    }

    # Spacer sequences of the first array; positions are assigned 1..5 below.
    first_array_sequences = [
        "ATGCGTCGACATGCGTCGACATGCGTCGAC",
        "CGTAGCTAGCCGTAGCTAGCCGTAGCTAGC",
        "TGCATGCATGTGCATGCATGTGCATGCATG",
        "GCTAGCTAGCGCTAGCTAGCGCTAGCTAGC",
        "AAAAATTTTTAAAAATTTTTAAAAATTTTT",
    ]
    first_array = {
        "id": "CRISPR_1",
        "contig": "contig_1",
        "start": 12345,
        "end": 12678,
        "consensus_repeat": shared_repeat,
        "num_spacers": 5,
        "spacers": [
            {"sequence": sequence, "position": rank}
            for rank, sequence in enumerate(first_array_sequences, start=1)
        ],
    }

    # NOTE(review): this array declares 8 spacers but lists only one, and the
    # summary total of 13 follows the declared counts (5 + 8) -- confirm that
    # consumers of the mock data do not rely on len(spacers).
    second_array = {
        "id": "CRISPR_2",
        "contig": "contig_2",
        "start": 50000,
        "end": 50500,
        "consensus_repeat": shared_repeat,
        "num_spacers": 8,
        "spacers": [
            {"sequence": "CCCGGGAAACCCGGGAAACCCGGGAAA", "position": 1},
        ],
    }

    summary = {
        "has_cas": True,
        "has_crispr": True,
        "num_arrays": 2,
        "num_spacers": 13,
        "cas_types": ["I-E"],
    }

    metadata = {
        "tool": "CRISPRCasFinder",
        "version": "Mock-v1.0",
        "date": "2025-01-14",
    }

    return {
        "strain_id": genome_file.stem,
        "cas_systems": [cas_system],
        "arrays": [first_array, second_array],
        "summary": summary,
        "metadata": metadata,
    }
def run_crisprcasfinder(input_file: Path, output_file: Path, tool_path: Path = None):
    """Placeholder for the real CRISPRCasFinder invocation.

    None of the arguments are used yet; the function exists so callers have
    a stable entry point once the integration is written.

    Args:
        input_file: Genome file that would be analysed.
        output_file: Destination for the results.
        tool_path: Optional explicit location of ``CRISPRCasFinder.pl``.

    Raises:
        NotImplementedError: Always, until the real integration exists.
    """
    # The eventual implementation is meant to invoke CRISPRCasFinder.pl as a
    # subprocess; until then callers are pointed at the --mock flag.
    not_ready_message = "Real tool integration not yet implemented. Use --mock flag."
    raise NotImplementedError(not_ready_message)
def main():
    """Command-line entry point: produce a CRISPR results JSON file.

    Exits with status 1 if the input path does not exist or if any exception
    is raised while generating or writing the results. Mock results are used
    when ``--mock`` is given or when ``check_dependencies`` reports that
    CRISPRCasFinder is unavailable; otherwise the real-tool runner is called
    and this function returns without writing anything itself.
    """
    args = parse_args()

    if not args.input.exists():
        logger.error(f"Input file not found: {args.input}")
        sys.exit(1)

    # Make sure the directory that will hold the output file exists.
    args.output.parent.mkdir(parents=True, exist_ok=True)

    try:
        # Decide once whether mock data is needed; the dependency probe is
        # only performed when --mock was not requested explicitly.
        use_mock = args.mock
        if not use_mock and not check_dependencies(args.tool_path):
            logger.warning("CRISPRCasFinder not found. Falling back to mock data.")
            use_mock = True

        if not use_mock:
            # Real implementation would go here
            run_crisprcasfinder(args.input, args.output, args.tool_path)
            return

        results = generate_mock_results(args.input)

        # Write results
        with args.output.open('w') as handle:
            json.dump(results, handle, indent=2)
        logger.info(f"Results written to {args.output}")
    except Exception as e:
        logger.error(f"Error executing CRISPR detection: {e}")
        sys.exit(1)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()