#!/usr/bin/env python3
"""
CRISPR-Cas Detection Wrapper

Wrapper for CRISPRCasFinder or similar tools to detect CRISPR arrays and Cas genes.
"""

import argparse
import json
import logging
import shutil
import subprocess
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse command-line arguments.

    Args:
        argv: Argument list to parse. When ``None`` (the default), argparse
            reads ``sys.argv[1:]``, which is the behaviour existing callers get.

    Returns:
        Namespace with ``input``, ``output``, ``tool_path`` (``Path`` or
        ``None``) and ``mock`` (bool).
    """
    parser = argparse.ArgumentParser(
        description="Detect CRISPR arrays and Cas genes in genome"
    )
    parser.add_argument("--input", "-i", type=Path, required=True,
                        help="Input genome file (.fna)")
    parser.add_argument("--output", "-o", type=Path, required=True,
                        help="Output JSON results file")
    parser.add_argument("--tool-path", type=Path, default=None,
                        help="Path to CRISPRCasFinder.pl")
    parser.add_argument("--mock", action="store_true",
                        help="Use mock data (for testing without external tools)")
    return parser.parse_args(argv)


def check_dependencies(tool_path: Optional[Path] = None) -> bool:
    """Check if CRISPRCasFinder is available.

    Args:
        tool_path: Explicit location of the tool, if the caller supplied one.

    Returns:
        True when ``tool_path`` exists on disk, or when an executable named
        ``CRISPRCasFinder.pl`` is found on ``PATH``; False otherwise.
    """
    if tool_path is not None and tool_path.exists():
        return True

    # A missing or unset explicit path still falls through to the PATH lookup.
    return shutil.which("CRISPRCasFinder.pl") is not None


def generate_mock_results(genome_file: Path) -> Dict[str, Any]:
    """Generate mock CRISPR results for testing.

    The payload is fixed except for ``strain_id``, which is the stem of
    ``genome_file``. The file itself is never opened.

    Args:
        genome_file: Genome path; only its name is used.

    Returns:
        Dict with the keys ``strain_id``, ``cas_systems``, ``arrays``,
        ``summary`` and ``metadata``.
    """
    logger.info(f"Generating mock CRISPR results for {genome_file.name}")

    strain_id = genome_file.stem

    return {
        "strain_id": strain_id,
        "cas_systems": [
            {
                "type": "I-E",
                "subtype": "I-E",
                "position": "contig_1:15000-25000",
                "genes": ["cas1", "cas2", "cas3", "casA", "casB", "casC", "casD", "casE"],
            }
        ],
        "arrays": [
            {
                "id": "CRISPR_1",
                "contig": "contig_1",
                "start": 12345,
                "end": 12678,
                "consensus_repeat": "GTTTTAGAGCTATGCTGTTTTGAATGGTCCCAAAAC",
                "num_spacers": 5,
                "spacers": [
                    {"sequence": "ATGCGTCGACATGCGTCGACATGCGTCGAC", "position": 1},
                    {"sequence": "CGTAGCTAGCCGTAGCTAGCCGTAGCTAGC", "position": 2},
                    {"sequence": "TGCATGCATGTGCATGCATGTGCATGCATG", "position": 3},
                    {"sequence": "GCTAGCTAGCGCTAGCTAGCGCTAGCTAGC", "position": 4},
                    {"sequence": "AAAAATTTTTAAAAATTTTTAAAAATTTTT", "position": 5},
                ],
            },
            {
                "id": "CRISPR_2",
                "contig": "contig_2",
                "start": 50000,
                "end": 50500,
                "consensus_repeat": "GTTTTAGAGCTATGCTGTTTTGAATGGTCCCAAAAC",
                # NOTE(review): declares 8 spacers but lists only one. Values
                # are kept as-is because consumers may pin them; confirm
                # whether the list was truncated on purpose.
                "num_spacers": 8,
                "spacers": [
                    {"sequence": "CCCGGGAAACCCGGGAAACCCGGGAAA", "position": 1},
                ],
            },
        ],
        "summary": {
            "has_cas": True,
            "has_crispr": True,
            "num_arrays": 2,
            # Sum of the declared per-array num_spacers (5 + 8), not of the
            # spacer entries actually listed above.
            "num_spacers": 13,
            "cas_types": ["I-E"],
        },
        "metadata": {
            "tool": "CRISPRCasFinder",
            "version": "Mock-v1.0",
            "date": "2025-01-14",
        },
    }


def run_crisprcasfinder(input_file: Path, output_file: Path,
                        tool_path: Optional[Path] = None):
    """Run actual CRISPRCasFinder tool (Placeholder).

    Args:
        input_file: Genome file to analyse.
        output_file: Destination for the JSON results.
        tool_path: Optional explicit path to ``CRISPRCasFinder.pl``.

    Raises:
        NotImplementedError: Always; the real integration does not exist yet.
    """
    # This would implement the actual subprocess call to CRISPRCasFinder.pl
    # For now, we raise NotImplementedError unless mock is used
    raise NotImplementedError("Real tool integration not yet implemented. Use --mock flag.")


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Command-line entry point.

    Validates the input path, produces results (mock data when ``--mock`` is
    given or when the tool cannot be found) and writes them as JSON.

    Args:
        argv: Argument list forwarded to :func:`parse_args`; ``None`` means
            ``sys.argv[1:]``.

    Exits with status 1 when the input is not an existing file or when any
    step of the detection/writing fails.
    """
    args = parse_args(argv)

    # is_file() rather than exists(): a directory is not a valid genome file.
    if not args.input.is_file():
        logger.error(f"Input file not found: {args.input}")
        sys.exit(1)

    try:
        # Create parent directory for output if needed. Done inside the try so
        # a filesystem error is logged and reported via the exit code instead
        # of surfacing as a raw traceback.
        args.output.parent.mkdir(parents=True, exist_ok=True)

        if args.mock:
            results = generate_mock_results(args.input)
        elif not check_dependencies(args.tool_path):
            # Deliberate best-effort: without the tool, mock data is written.
            logger.warning("CRISPRCasFinder not found. Falling back to mock data.")
            results = generate_mock_results(args.input)
        else:
            # Real implementation would go here. The placeholder currently
            # raises, so this branch ends in the except handler below.
            run_crisprcasfinder(args.input, args.output, args.tool_path)
            return

        # Write results
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2)

        logger.info(f"Results written to {args.output}")

    except Exception as e:
        logger.error(f"Error executing CRISPR detection: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()